B
    .(bM+                 @   s  d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZmZ d dlmZmZ d dlmZ ddlmZ dd Z dd Z!dd Z"dd Z#dd Z$dd Z%dd Z&ej'(dddgd d! Z)d"d# Z*d$d% Z+d&d' Z,d(d) Z-d*d+ Z.d,d- Z/d.d/ Z0d0d1 Z1d2d3 Z2ej'(d4d5gd6d7 Z3d8d9 Z4d:d; Z5d<d= Z6ej'(d>d?gd@dA Z7dBdC Z8dDdE Z9dFdG Z:dHdI Z;dS )J    )unicode_literalsN)English)
STOP_WORDS)is_stop)Vectors)Vocab)Language)DocSpanToken)TaggerEntityRecognizer)HEADDEP)Matcher   )make_tempdirc              C   sh   dd } t  }xTt||  D ]@\}}|dks@|dks@|dkrHt  x|D ]}t|j qNW q W d S )Nc              s   s|   xt dD ]
} dV  q
W xt dD ]
} dV  q"W xt dD ]
} dV  q:W xt dD ]
} dV  qRW xt dD ]
} dV  qjW d S )Ni'  z#It's sentence produced by that bug.zI erase some hbdsaj lemmas.zI erase lemmas.)range)_ r   Y/home/dcms/DCMS/lib/python3.7/site-packages/spacy/tests/regression/test_issue1501-2000.pystring_generator   s    



z(test_issue1506.<locals>.string_generatori'  i N  i0u  )r   	enumeratepipegcZcollectstrlemma_)r   nlpidtr   r   r   test_issue1506   s    
r!   c              C   s&   t dd} | jddd | d dS )zTest vectors.resize() works.)
   r"   )shapehellor   )row)   	   N)r   addresize)vectorsr   r   r   test_issue1518-   s    
r+   c              C   s   d} t t |  d}d|d _x2|dd D ]"}|djdkrJd|_q.d	|_q.W t|j}|d  }|d  }t	|t st
t	|t st
dS )
z)Test that Span.as_doc() doesn't segfault.z7The sky is blue . The man is pink . The dog is purple .)wordsTr      N.F)r	   r   split
sent_startZnbortextlistsentsZas_doc
isinstanceAssertionError)stringdocwordr4   Zsent0Zsent1r   r   r   test_issue15374   s    


r:   c              C   s"   t dddddgd} | d dS )	zJEnsure vectors.resize() doesn't try to modify dictionary during iteration.)r"   r"   r&      b   d   )r#   keys)r=   r=   N)r   r)   )vr   r   r   test_issue1539N   s    r@   c           
   C   s   ddddddddddg
} t t | d	}t|d
d|jjd dg|_| }||dd  W dQ R X dd |jD s|tdS )z9Test that entity labels still match after merging tokens.
Zwordar/   Zwordb-Z	Biosphere2z 
)r,         ZPRODUCT)labelr&      Nc             S   s   g | ]
}|j qS r   )r2   ).0entr   r   r   
<listcomp>[   s    z"test_issue1547.<locals>.<listcomp>)	r	   r   r
   vocabstringsZents
retokenizemerger6   )r,   r8   retokenizerr   r   r   test_issue1547T   s    
rP   c             C   s(   | d}|dd }|j |jks$td S )NzThe black cat purrs.r-   r;   )orth_r2   r6   )en_tokenizerr8   spanr   r   r   test_issue1612^   s    rT   c              C   s   t t } | jrt| jdd dd | jdd ddd | jdd d	dd | jddd	gksbtt t }|jrvt|jd
d d	d |jdd dd	d |jdd ddd |jddd	gkstd S )Nc             S   s   | S )Nr   )r8   r   r   r   <lambda>g       z test_issue1654.<locals>.<lambda>1)namec             S   s   | S )Nr   )r8   r   r   r   rU   h   rV   rC   )rX   afterc             S   s   | S )Nr   )r8   r   r   r   rU   i   rV   3c             S   s   | S )Nr   )r8   r   r   r   rU   m   rV   c             S   s   | S )Nr   )r8   r   r   r   rU   n   rV   )rX   beforec             S   s   | S )Nr   )r8   r   r   r   rU   o   rV   )r   r   Zpipeliner6   add_pipeZ
pipe_names)r   Znlp2r   r   r   test_issue1654d   s    



r]   r2   ztest@example.comzjohn.doe@example.co.ukc             C   s*   | |}t |dkst|d jr&td S )Nr-   r   )lenr6   Zlike_url)rR   r2   r8   r   r   r   test_issue1698s   s    r_   c           	   C   s   t jddd} t| dddgd}tt }|d tt |	  W d	Q R X |j
d
ddksht||j_t 6}|| tt |}|j
d
ddkstW d	Q R X d	S )zfTest that models with no pretrained vectors can be deserialized
    correctly after vectors are added.)r;   i,  f)dtypeIamZMatt)datar>   ZPRPNZpretrained_dimsr   )numpyonesr   r   r   	add_labelpytestZwarnsUserWarningbegin_trainingcfggetr6   rK   r*   r   Zto_diskZ	from_disk)rd   r*   Ztaggerpathr   r   r   test_issue1727z   s    


rn   c              C   s   t t dddgd} | d dk r$t| d dk	s4t| d dksDt| dd dk rXt| dd dk	slt| dd dkst| jd dk	st| jd dk rtdS )z4Test comparison against None doesn't cause segfault.abc)r,   r   Nr   )r	   r   r6   rK   )r8   r   r   r   test_issue1757   s    rr   c             C   s@   | d}t |dkst|d jdks*t|d jdks<tdS )zDTest that "would've" is handled by the English tokenizer exceptions.zwould'ver   r   ZMDr-   ZhaveN)r^   r6   tag_r   )rR   tokensr   r   r   test_issue1758   s    ru   c             C   s,   | d}|d j dkr(|d jdks(tdS )zyTest that spaces don't receive a POS but no TAG. This is the root cause
    of the serialization issue reported in #1773.rA   r   SPACE N)Zpos_rs   r6   )rR   r8   r   r   r   test_issue1773   s    rx   c           	   C   s   t jddgddgddgddgdd	gd
dgddggdd} tt d d}|jjd |t	t
g| }tt|jdks~tdS )z[Test sentence boundaries are deserialized correctly, even for
    non-projective sentences.r-   i     i  r   i  i  r   l   LP^& l    i  l    i  Zuint64)ra   zJust what I was looking for .)r,   ROOTN)re   Zasarrayr	   r   r0   rK   rL   r(   Z
from_arrayr   r   r^   r3   r4   r6   )Z
heads_depsr8   r   r   r   test_issue1799   s    r{   c              C   s<   t dd} d| kst| dtjddd d| ks8tdS )z6Test vocab.set_vector also adds the word to the vocab.test_issue1807)Zvectors_namer$   )2   r`   )ra   N)r   r6   Z
set_vectorre   rf   )rK   r   r   r   r|      s    
r|   c              C   s   d} t t |  d}d|d _t |j| }|d jsBt|jrLt|j	rVtd|_d|_	t |j| }|jst|j	stdS )zVTest that sentence boundaries & parse/tag flags are not lost
    during serialization.z*This is a first sentence . And another one)r,   TrD   N)
r	   r   r0   r1   rK   
from_bytesto_bytesr6   Z	is_parsedZ	is_tagged)r7   r8   new_docr   r   r   test_issue1834   s    



r   c              C   sR   t  } | d }|j| kst|j| ks*td| ks6t| jd}|| ksNtdS )z,Test Vocab.__contains__ works with int keys.r$   zsome stringN)r   orthr6   rQ   rL   r(   )rK   lexZint_idr   r   r   test_issue1868   s    r   c              C   st   t t } | dddigg t| jdgd}t| |dksBtt| }t|jdgd}t||dksptd S )NZpat1r   r$   )r,   r-   )	r   r   r(   r	   rK   r^   r6   copydeepcopy)matcherr8   Znew_matcherr   r   r   r   test_issue1883   s    

r   r9   Zthec             C   s    t | tt |  tkstd S )N)r   r   upperr6   )r9   r   r   r   test_issue1889   s    r   c           	   C   sT   ddi} t  }||d |dd tt |jf |  W d Q R X d S )NZhidden_depthr   nerZanswer)	r   r\   Zcreate_pipeZget_piperg   rh   Zraises
ValueErrorrj   )rk   r   r   r   r   test_issue1915   s    r   c              C   s   t t } | dddiddigg t| jdddgd}| |}t|dksPt|d dd d	ksht|d dd d
kstdS )z0Test regression in Matcher introduced in v2.0.6.ZMWEr   ro   )r,   r   r   r-   N)r   r   )r-   r;   )r   r   r(   r	   rK   r^   r6   )r   r8   matchesr   r   r   test_issue1945   s    
r   c          	   C   sj   | d}t jt|dfdd|_| }||dd  W dQ R X t|dksVt|jjd	ksftdS )
z(Test that doc.merge() resizes doc.tensorza b c d   r`   )ra   r   r   Nr;   )r;   r   )re   rf   r^   ZtensorrM   rN   r6   r#   )rR   r8   rO   r   r   r   test_issue1963  s    
r   rF   z
U-JOB-NAMEc             C   sH   t t }dgdgdgdgdg| gf}d |d fgfg}|jj|d d S )Nr   r9   tagdep)gold_parses)r   r   movesZget_actions)rF   r   entryr   r   r   r   test_issue1967  s    
r   c                s   t  }ddidddddidddddg}tjdd	d
 |d|g t ddddgd}||}t fdd|D std S )NORTHZDoe!?)r   OPoptionalT)r   r   F)defaultTESTZHelloZJohn)r,   c                s   g | ]\}}}| j kqS r   )rL   )rH   Zmatch_idstartend)en_vocabr   r   rJ   &  s    z"test_issue1971.<locals>.<listcomp>)r   r   set_extensionr(   r	   allr6   )r   r   patternr8   r   r   )r   r   test_issue1971  s    r   c             C   sr   t | }dddgidddig}ddiddig}t| ddd	ddgd
}|d||g ||}t|dksntd S )NZEURINZeur)r   ZLOWERZLIKE_NUMTr   10is)r,   ZTEST1r   )r   r	   r(   r^   r6   )r   r   Zpattern1Zpattern2r8   r   r   r   r   test_issue_1971_2)  s    r   c                s   t jdddd t jdddd t ddgd	}t }|d
dddiigg |ddddiigg t fdd||D }t|dkst|tddddgkstdS )zFTest that pattern matches correctly for multiple extension attributes.ro   r-   T)r   forcerp   r   r$   Zworld)r,   Ar   Bc             3   s$   | ]\}}} j | ||fV  qd S )N)rL   )rH   Zm_idse)r   r   r   	<genexpr>;  s    z$test_issue_1971_3.<locals>.<genexpr>ry   )r   r   r-   )r   r-   r   )r   r   r-   )r   r-   r   N)r   r   r	   r   r(   sortedr^   r6   )r   r8   r   r   r   )r   r   test_issue_1971_33  s    r   c             C   s   t jdddd t jdddd t| }t| ddd	gd
}ddddigd }|d|g ||}t|dkstt|d | jd ddfkstdS )zhTest that pattern matches correctly with multiple extension attribute
    values on a single token.
    ext_aZstr_aT)r   r   ext_bZstr_bthisr   r2   )r,   r   )r   r   r;   r   r-   r   N)r   r   r   r	   r(   r^   r6   rL   )r   r   r8   r   r   r   r   r   test_issue_1971_4@  s    r   )<
__future__r   rh   r   re   r   Zspacy.lang.enr   Zspacy.lang.en.stop_wordsr   Zspacy.lang.lex_attrsr   Zspacy.vectorsr   Zspacy.vocabr   Zspacy.languager   Zspacy.tokensr	   r
   r   Zspacy.pipeliner   r   Zspacy.attrsr   r   Zspacy.matcherr   utilr   r!   r+   r:   r@   rP   rT   r]   markZparametrizer_   rn   rr   ru   rx   r{   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sR   

	

