B
    .(bh=                 @   s  d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
mZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ ddlmZmZ ejdddigddiddiggddiddigddigggdd Z ejdddigddiddiggddiddigddigggdd Z!dd Z"dd Z#dd Z$dd Z%ejddgd d! Z&d"d# Z'd$d% Z(ejj)d&d' Z*d(d) Z+d*d+ Z,d,d- Z-d.d/ Z.d0d1 Z/ejd2d3d4gd5d6 Z0ejd7d8d9gd:d; Z1d<d= Z2ejd7d>d?gd@dA Z3ejdBdCdDdEgdFdG Z4ejd7dHdIdJdKgdLdM Z5ejd7dNdNgdOdP Z6ejd7dQdRgdSdT Z7ejj)ejdUdVdWdXdYdZgfd[d\d]d^gfd_d`dadbgfdcdddedfgfdgdhdidjgfdkdldmdngfdodpdqdmdrgfgdsdt Z8ejdudvdwdxdydzd{gfd|d}d~ddddddgfgdd Z9dd Z:dd Z;ejj<ddejd7ddddgdd Z=ejd7ddgdd Z>ejd7dgdd Z?ejd7dgdd Z@ejddddgdd ZAejjBdd ZCejj)dd ZDdS )    )unicode_literalsN)Matcher)IS_PUNCTORTHLOWER)POSVERBVerbForm_inf)Vocab)Language)
Lemmatizer)Lookups)DocSpan)EnglishDefaults   )get_docmake_tempdirpatternsr   ZcelticsZbostonc                s   d}| |}|j jd  t|j }|d| tt|jdksDt fdd||D }| ddf d	dfgksvt|d
d |_t|j}t|dkst|d j kst|d j	dkst|d j
dkstd
S )z5Test a bug that arose from having overlapping matcheszLhow many points did lebron james score against the boston celtics last nightORGBostonCelticsr   c                s   g | ]\}}} ||fqS  r   ).0_startend)r   r   V/home/dcms/DCMS/lib/python3.7/site-packages/spacy/tests/regression/test_issue1-1000.py
<listcomp>$   s    z!test_issue118.<locals>.<listcomp>	      
   N   )vocabstringsr   addlenlistentsAssertionErrorlabelr   r   )en_tokenizerr   textdocmatchermatchesr'   r   )r   r   test_issue118   s    


r/   c                s   d}| |}|j jd  t|j }|d| tt|jdksDt fdd||D }| jt|dd 7  _| d	d
f d	dfgkst|j}t|dkst|d j	 kst|d j
d	kst|d jdkstdS )z5Test a bug that arose from having overlapping matcheszLhow many points did lebron james score against the boston celtics last nightr   r   r   c                s   g | ]\}}} ||fqS r   r   )r   r   r   r   )r   r   r   r   ?   s    z0test_issue118_prefix_reorder.<locals>.<listcomp>r!   Nr   r    r   )r"   r#   r   r$   r%   r&   r'   r(   tupler)   r   r   )r*   r   r+   r,   r-   r.   r'   r   )r   r   test_issue118_prefix_reorder.   s    

r1   c          	   C   s   d}ddiddigddiddigg}| |}t |j}|d| dd ||D }|\}}|d	 d
kslt|d dks|t|d	 dkst|d dksttt | jt|7  _W dQ R X dS )z$Test overlapping multi-word phrases.zAThere are different food safety standards in different countries.r   ZfoodZsafetyZ	standardsZFOODc             S   s   g | ]\}}}|||fqS r   r   )r   Zent_typer   r   r   r   r   r   S   s    z!test_issue242.<locals>.<listcomp>r!      r            N)	r   r"   r$   r(   pytestraises
ValueErrorr'   r0   )r*   r+   r   r,   r-   r.   Zmatch1Zmatch2r   r   r   test_issue242I   s    
r9   c             C   s\   | d}t |jdd |D dgdgd}d|_t|dks>tt|j}t|dksXtd	S )
z*Test Issue #309: SBD fails on empty string c             S   s   g | ]
}|j qS r   )r+   )r   tr   r   r   r   c   s    z!test_issue309.<locals>.<listcomp>r   ROOT)wordsZheadsdepsTr!   N)r   r"   	is_parsedr%   r(   r&   sents)r*   tokensr,   r@   r   r   r   test_issue309_   s    
rB   c             C   sD   | d}|d j dkstt|d dks.t|d j dks@td S )Nz   This is a cat.r   r2   r!   )idxr(   r%   )r*   r,   r   r   r   test_issue351k   s    rD   c             C   s   | d}t |dkstdS )z!Test tokenization of big ellipsisz$45...............Askingr   N)r%   r(   )r*   rA   r   r   r   test_issue360r   s    rE   ztext1,text2)catdogc             C   s,   | | | | kst | | | | ks(t dS )z$Test Issue #361: Equality of lexemesN)r(   )en_vocabZtext1Ztext2r   r   r   test_issue361x   s    rI   c             C   s   | d}t |j}|dtditdigg ||}t|dksDt|dtditditditdigg ||}t|d	kst|d
tditditditdigg ||}t|d	kstdS )z6Test that Matcher doesn't segfault on particular inputza b; cZTEST1abr!   ZTEST2Tcr   ZTEST3dN)r   r"   r$   r   r%   r(   r   )r*   r,   r-   r.   r   r   r   test_issue587   s    
&&rN   c          	   C   s0   t | }tt |dg g W d Q R X d S )NZTEST)r   r6   r7   r8   r$   )rH   r-   r   r   r   test_issue588   s    rO   c              C   s,   t  } | jd t| dgd}|s(td S )NTZwhata)r=   )r
   r#   Z
set_frozenr   r(   )r"   r,   r   r   r   test_issue589   s    rP   c          
   C   s   t | ddddddddgd	}t| }|d
ddiddiddiddigg |d
ddiddiddigg ||}t|dkstdS )zTest overlapping matchesn=1;rJ   :5%)r=   abZIS_ALPHATr   ZLIKE_NUMr   N)r   r   r$   r%   r(   )rH   r,   r-   r.   r   r   r   test_issue590   s    " rY   c              C   s   dddddg} dt ttdii}t }|dd	d
dggi |dd	i i |dd	i i t|tjd}t||d}t	|| d}d|d _
|d jdkst|d jdkstdS )z Test lemmatization of base formsZDozn'tfeedZtherG   ZVBTZlemma_rulesZverbZedeZlemma_indexZ	lemma_exc)is_base_form)
lemmatizertag_map)r=   r   N)r   r   r	   r   Z	add_tabler   r   r\   r
   r   tag_r+   r(   lemma_)r=   r^   Zlookupsr]   r"   r,   r   r   r   test_issue595   s    
ra   c             C   s:   t | }d|_d|_t |j}||  |js6td S )NT)r   Z	is_taggedr?   r"   
from_bytesto_bytesr(   )rH   r,   Zdoc2r   r   r   test_issue599   s    
rd   c              C   s.   t dddiid} t| dgd}d|d _d S )NNNposZNOUN)r^   hello)r=   r   )r
   r   r_   )r"   r,   r   r   r   test_issue600   s    rh   c             C   sx   dd }d}ddiddig}d}| |}t |j}|j||g|d || t|j}|g ksbt|d	 jd	ksttd S )
Nc       	   	      s   |t |d krdS  fdd|D }  N}xF|D ]>}|jrDdn|jj}||jd}|j||d  j|f  _q6W W dQ R X dS )zMerge a phrase. We have to be careful here because we'll change the
        token indices. To avoid problems, merge all the phrases once we're called
        on the last match.r!   Nc                s"   g | ]\}}}t  |||d qS ))r)   )r   )r   r)   r   r   )r,   r   r   r      s    z8test_issue615.<locals>.merge_phrases.<locals>.<listcomp>ZNNP)taglemma)attrs)r%   Z
retokenizelabel_rootr_   r+   merger'   )	r-   r,   ir.   ZspansZretokenizerspanri   rk   r   )r,   r   merge_phrases   s    

z$test_issue615.<locals>.merge_phraseszThe golf club is brokenr   ZgolfZclubZSport_Equipment)Zon_matchr   )r   r"   r$   r&   r'   r(   r)   )r*   rq   r+   patternr)   r,   r-   entitiesr   r   r   test_issue615   s    

rt   ztext,number)Z7am7)z11p.m.Z11c             C   s.   | |}t |dkst|d j|ks*tdS )z`Test that times like "7am" are tokenized correctly and that numbers are
    converted to string.r   r   N)r%   r(   r+   )r*   r+   numberrA   r   r   r   test_issue736   s    rw   r+   z3/4/2012z
01/12/1900c             C   s   | |}t |dkstdS )zTest that dates are not split and kept as one token. This behaviour is
    currently inconsistent, since dates separated by hyphens are still split.
    This will be hard to prevent without causing clashes with numeric ranges.r!   N)r%   r(   )r*   r+   rA   r   r   r   test_issue740   s    rx   c              C   s>   t t ddg} | d }t|g}t|}|d |ks:td S )Nrg   Zworldr   )r   r
   setr&   r(   )r,   tokensitemsr   r   r   test_issue743   s
    
r}   zWe were scaredzWe Were Scaredc             C   s2   | |}t |dkst|d j dks.tdS )zqTest that 'were' and 'Were' are excluded from the contractions
    generated by the English tokenizer exceptions.r2   r!   wereN)r%   r(   r+   lower)r*   r+   rA   r   r   r   test_issue744   s    r   ztext,is_num)ZoneT)ZtenT)Z	tenelevenFc             C   s   | |}|d j |kstd S )Nr   )Zlike_numr(   )r*   r+   Zis_numrA   r   r   r   test_issue759  s    r   ZShellshellZShedZshedc             C   s.   | |}t |dkst|d j|ks*tdS )zsTest that 'Shell' and 'shell' are excluded from the contractions
    generated by the English tokenizer exceptions.r!   r   N)r%   r(   r+   )r*   r+   rA   r   r   r   test_issue775  s    r   zThis is a string c             C   s(   | |}d dd |D |ks$tdS )zGTest for Issue #792: Trailing whitespace is removed after tokenization. c             S   s   g | ]
}|j qS r   )text_with_ws)r   rz   r   r   r   r     s    z!test_issue792.<locals>.<listcomp>N)joinr(   )r*   r+   r,   r   r   r   test_issue792  s    r   zThis is a stringzThis is a string
c             C   s(   | |}d dd |D |ks$tdS )z6Test base case for Issue #792: Non-trailing whitespacer   c             S   s   g | ]
}|j qS r   )r   )r   rz   r   r   r   r   !  s    z)test_control_issue792.<locals>.<listcomp>N)r   r(   )r*   r+   r,   r   r   r   test_control_issue792  s    r   ztext,tokensz"deserve,"--and"Zdeservez,"--andzexception;--exclusive	exceptionz;--Z	exclusivezday.--Isdayz.--ZIszrefinement:--justZ
refinementz:--Zjustzmemories?--ToZmemoriesz?--TozUseful.=--ThereforeZUsefulz.=--Z	Thereforez=Hope.=--PandorarR   ZHopeZPandorac             C   s6   | |}t |t |kstdd |D |ks2tdS )z;Test that special characters + hyphens are split correctly.c             S   s   g | ]
}|j qS r   )r+   )r   r;   r   r   r   r   5  s    z!test_issue801.<locals>.<listcomp>N)r%   r(   )r*   r+   rA   r,   r   r   r   test_issue801$  s    r   ztext,expected_tokensu$   Smörsåsen används bl.a. till fisku   Smörsåsenu   användszbl.a.ZtillZfisku4   Jag kommer först kl. 13 p.g.a. diverse förseningarZJagZkommeru   förstzkl.Z13zp.g.a.Zdiverseu   förseningarc             C   s&   | |}dd |D }||ks"t d S )Nc             S   s   g | ]}|j s|jqS r   )Zis_spacer+   )r   rz   r   r   r   r   G  s    z!test_issue805.<locals>.<listcomp>)r(   )Zsv_tokenizerr+   Zexpected_tokensrA   Z
token_listr   r   r   test_issue8058  s    r   c              C   s   t tdd id} t| }ddiddiddig}|d	|g t|jdd
d
dgd}||}t|dkslt|d \}}}|dkst|dkstdS )zfThe variable-length pattern matches the succeeding token. Check we
    handle the ambiguity correctly.c             S   s   |   S )N)r   )stringr   r   r   <lambda>N      ztest_issue850.<locals>.<lambda>)lex_attr_gettersr   bobOP*frankFarAwayr   )r=   r!   r   r4   N)r
   r   r   r$   r   r"   r%   r(   )r"   r-   rr   r,   matchent_idr   r   r   r   r   test_issue850K  s    r   c              C   s   t tdd id} t| }ddiddddd	ig}|d
|g t|jdddd	gd}||}t|dksnt|d \}}}|dkst|dkstdS )z7Test Matcher matches with '*' operator and Boolean flagc             S   s   |   S )N)r   )r   r   r   r   r   \  r   z%test_issue850_basic.<locals>.<lambda>)r   r   r   r   r   )r   r   r   r   )r=   r!   r   r4   N)r
   r   r   r$   r   r"   r%   r(   )r"   r-   rr   r,   r   r   r   r   r   r   r   test_issue850_basicZ  s    r   zEFrench exception list is not enabled in the default tokenizer anymore)reasonu	   au-delàsu   pair-programmâmesu   terra-forméesu   σ-compactsc             C   s   | |}t |dkstdS )z=Test that French tokenizer exceptions are imported correctly.r!   N)r%   r(   )Zfr_tokenizerr+   rA   r   r   r   test_issue852h  s    r   zaaabbb@ccc.com
Thank you!zaaabbb@ccc.com 
Thank you!c             C   s   | |}|j |kstdS )z5Test that no extra space is added in doc.text method.N)r+   r(   )r*   r+   r,   r   r   r   test_issue859t  s    r   zDatum:2014-06-02
Dokument:76467c             C   sJ   | |}x<|D ]4}t |jt |jks*t||j |jd kstqW dS )zLTest that token.idx matches the original text index for texts with newlines.r   N)r%   r+   r   r(   rC   )r*   r+   r,   rz   r   r   r   test_issue886}  s    
r   z	want/needc             C   s.   | |}t |dkst|d jdks*tdS )z(Test that / infixes are split correctly.r2   r!   /N)r%   r(   r+   )r*   r+   rA   r   r   r   test_issue891  s    r   ztext,tag,lemma)anusre   r   )princessre   r   )innerZJJr   c             C   s.   t | |gd}||d _|d j|ks*tdS )zTest base-forms are preserved.)r=   r   N)r   r_   r`   r(   )rH   r+   ri   rj   r,   r   r   r   test_issue912  s    
r   c             C   sT   t d xDdD ]<}d}x"tddD ]}||t| 7 }q$W | |}|stqW dS )zTest that spaCy doesn't hang on many punctuation characters.
    If this test hangs, check (new) regular expressions for conflicting greedy operators
    Zpytest_timeout)	.,'r   rU   ?!rT   -0r!   d   N)r6   Zimportorskiprangestrr(   )r*   punctr   ro   r,   r   r   r   test_issue957  s    

r   c             C   s  dg gdg gdg gdg gdg gdg gddd	d
gggddddgggddddgggg	}t  }|d}|| x.|D ]&\}}x|D ]\}}}|| qW qrW |  d|j_x@tdD ]4}	t	| x$|D ]\}
}|
|
gd|ig qW qW t }|| t  |}W dQ R X xt|D ]l\}
}||
}dd |jD }xH|D ]2\}}}||f|krB|||f |ksptP qBW |rt|qW dS )a$  Test that adding entities and resuming training works passably OK.
    There are two issues here:
    1) We have to re-add labels. This isn't very nice.
    2) There's no way to set the learning rate for the weight update, so we
        end up out-of-scale, causing it to learn too fast.
    ZheyZhowdyz	hey thererg   hizi'm looking for a place to eatz,i'm looking for a place in the north of town   $   ZLOCATIONzshow me chinese restaurants      ZCUISINEzshow me chines restaurants   nergMbP?r   rs   Nc             S   s   i | ]}|j |j|jfqS r   )rl   Z
start_charZend_char)r   entr   r   r   
<dictcomp>  s    z!test_issue999.<locals>.<dictcomp>)r   Zcreate_pipeZadd_pipeZ	add_labelZbegin_trainingmodelZ
learn_rater   randomshuffleupdater   Zto_diskZ	from_diskr'   r(   	Exception)Z
train_dataZ
TRAIN_DATAZnlpr   r   offsetsr   r   r)   itnZraw_textZentity_offsetsZ	model_dirZnlp2r,   r'   r   r   r   test_issue999  sB    	



r   )E
__future__r   r6   r   Zspacy.matcherr   Zspacy.attrsr   r   r   Zspacy.symbolsr   r   r	   Zspacy.vocabr
   Zspacy.languager   Zspacy.lemmatizerr   Zspacy.lookupsr   Zspacy.tokensr   r   Zspacy.lang.enr   utilr   r   markZparametrizer/   r1   r9   rB   rD   rE   rI   rN   rO   ZxfailrP   rY   ra   rd   rh   rt   rw   rx   r}   r   r   r   r   r   r   r   r   r   skipr   r   r   r   r   Zslowr   r   r   r   r   r   <module>   s   &&					
"
		