B
    .(b7                 @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
mZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZmZ d dlmZ d dlmZ d dlZd dlZd dlmZ ddl m!Z! dd Z"dd Z#dd Z$dd Z%dd Z&dd Z'ej(j)edddd Z*d d! Z+d"d# Z,d$d% Z-d&d' Z.d(d) Z/d*d+ Z0erRd,Z1d-d. Z2d/d0 Z3d1d2 Z4d3d4 Z5ej(j6d5dd6d7 Z7ej(8d8d9d: Z9d;d< Z:dS )=    )unicode_literalsN)English)German)EntityRulerEntityRecognizer)MatcherPhraseMatcher)Doc)Vocab)ENT_IOBENT_TYPE)pickle
is_python2unescape_unicode)displacy)decaying)Vectors   )get_docc              C   s"   t  } | d}t|dkstdS )z;Test that the tokenizer doesn't hang on a long list of dotszW880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl   N)r   lenAssertionError)nlpdoc r   Y/home/dcms/DCMS/lib/python3.7/site-packages/spacy/tests/regression/test_issue3001-3500.pytest_issue3002   s    r   c       	      C   s   ddiddiddiddigddidd	d
dddiddiddigddidd	ddddiddiddigg}dddddg}dddddg}t | ||d}t| }x6t|D ]*\}}|t||g ||}|stqW dS )z%Test problem with matcher quantifiersZLEMMAZhaveLOWERtoZdoZTAGINTF*)ZIS_ASCIIZIS_PUNCTOP?ZalsoZhaswithZRBVBZZTOZVB)wordstagsN)r   r   	enumerateaddstrr   )	en_vocabpatternsr%   r&   r   matcheripatternmatchesr   r   r   test_issue3009   s&    

r0   c             C   s  dddddg}dddd	dg}d
ddddg}dg}t | ||||d}|jsLtd}|d j|d j|d j|d jf|ks|tttg}|	|}|
|| |d j|d j|d j|d jf|kst| }	t| |	}
|
d j|
d j|
d j|
d jf|ks
tdS )ziTest that the is_tagged attribute doesn't get overwritten when we from_array
    without tag information.Thisis10%.ZDTr$   CDZNNZDETZVERBNUMZNOUNZPUNCT)r      PERCENT)r%   r&   posents)r3   r7   r6   r9   r   N)r   Z	is_taggedr   textZpos_Ztag_Z	ent_type_r   r   Zto_arrayZ
from_arrayto_bytesr	   
from_bytes)r*   r%   r&   r:   r;   r   expectedheaderZ	ent_array	doc_bytesZdoc2r   r   r   test_issue3012<   s    
,
,rB   c              C   s:   t t ddddgd} d| _t| dd jg ks6td	S )
zTest that Span.noun_chunks works correctly if no noun chunks iterator
    is available. To make this test future-proof, we're constructing a Doc
    with a new Vocab here and setting is_parsed to make sure the noun chunks run.
    r1   r2   aZsentence)r%   Tr      N)r	   r
   Z	is_parsedlistZnoun_chunksr   )r   r   r   r   test_issue3199U   s    rF   c              C   s   t  } | d}| | |d |   dddddg}|j|ksHtt  }||d ||   |	dj|kstdS )	zTest issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
    were added using ner.add_label().
    nerZANIMALOzB-ANIMALzI-ANIMALzL-ANIMALzU-ANIMALN)
r   create_pipeadd_pipe	add_labelbegin_training
move_namesr   r>   r=   Zget_pipe)r   rG   rM   Znlp2r   r   r   test_issue3209_   s    


rN   c              C   sT   t  } t| j}|d| d| d| dg |d| dg t|dksPtdS )	zdTest that the PhraseMatcher correctly reports its number of rules, not
    total number of patterns.TEST1rC   bcTEST2dr   N)r   r   vocabr(   r   r   )r   r,   r   r   r   test_issue3248_1r   s
    
rU   z,Can't pickle instancemethod for is_base_form)reasonc              C   sl   t  } t| j}|d| d| d| dg |d| dg t|}t|}t|t|kshtdS )z5Test that the PhraseMatcher can be pickled correctly.rO   rC   rP   rQ   rR   rS   N)	r   r   rT   r(   r   dumpsloadsr   r   )r   r,   dataZnew_matcherr   r   r   test_issue3248_2|   s    


rZ   c             C   sR   | d}t |dkst|d jdks*t|d jdks<t|d jdksNtdS )	z2Test that hyphens are split correctly as prefixes.uA   —Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.   r   u   —r   u   –	   N)r   r   r<   )Zes_tokenizerr   r   r   r   test_issue3277   s
    r]   c             C   sr   ddddddddg}d	d
dd	d
d	ddg}ddddddddg}t | |||d}tjt|dfdd|_t| dS )zTest that retokenization works correctly via displaCy when punctuation
    is merged onto the preceeding token and tensor is resized.HelloZWorld!ZWhenr2   thisZbreakingr"      r   ZintjROOTpunctZadvmodZdetZnsubj)r%   headsdeps`   float32)dtypeN)r   numpyzerosr   Ztensorr   render)r*   r%   rg   rh   r   r   r   r   test_issue3288   s    ro   c              C   sB   t  } | | d |  }t  }|| d || dS )zeTest that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model.ZtextcatN)r   rJ   rI   r=   r>   )r   
bytes_dataZnew_nlpr   r   r   test_issue3289   s    rq   c          	      s   t | dddddddgd t| }d	d
ddgiigd	d
ddgiigg}|d| | }t|dksjt fdd|D }|ddddgkstd S )Nr^   ,howZareZyouZdoingr"   )r%   r   r   helloZTESTr8   c                s    g | ]\}}} || j qS r   )r<   ).0_startend)r   r   r   
<listcomp>   s    z"test_issue3328.<locals>.<listcomp>)r	   r   r(   r   r   )r*   r,   r+   r/   Zmatched_textsr   )r   r   test_issue3328   s    rz   c             C   s   t | }|dt| ddgdg |dt| ddgdg t| ddddgd}||}t|dksht| j|d	 d	  | j|d
 d	  g}t|ddgkstdS )zcTest that duplicate patterns for different rules result in multiple
    matches, one per rule.
    AZBarackZObama)r%   BZliftsZAmericar   r   ra   N)r   r(   r	   r   r   stringssorted)r*   r,   r   r/   Z	match_idsr   r   r   test_issue3331   s    $r   c              C   s   t  } t| jdddddgd}d|d _t| d	d
dgd}t|j}|jdd |d	 ||}|j	|gd }|j
|d |j
|d |j
|d |j|dstdS )z8Test case where preset entity crosses sentence boundary.IZliveinZNewZYork)r%   Tr8   ZGPEzNew York)labelr.   )r+   r    r   rH   zB-GPEN)r   r	   rT   is_sent_startr   r   movesZ
add_actionrK   Z
init_batchZapply_transitionZis_validr   )r   r   ZrulerrG   stater   r   r   test_issue3345   s    


r   s
  ^§|^%|^=|^—|^–|^\+(?![0-9])|^…|^……|^,|^:|^;|^\!|^\?|^¿|^؟|^¡|^\(|^\)|^\[|^\]|^\{|^\}|^<|^>|^_|^#|^\*|^&|^。|^？|^！|^，|^、|^；|^：|^～|^·|^।|^،|^؛|^٪|^\.\.+|^…|^\'|^"|^”|^“|^`|^‘|^´|^’|^‚|^,|^„|^»|^«|^「|^」|^『|^』|^（|^）|^〔|^〕|^【|^】|^《|^》|^〈|^〉|^\$|^£|^€|^¥|^฿|^US\$|^C\$|^A\$|^₽|^﷼|^₴|^[\u00A6\u00A9\u00AE\u00B0\u0482\u058D\u058E\u060E\u060F\u06DE\u06E9\u06FD\u06FE\u07F6\u09FA\u0B70\u0BF3-\u0BF8\u0BFA\u0C7F\u0D4F\u0D79\u0F01-\u0F03\u0F13\u0F15-\u0F17\u0F1A-\u0F1F\u0F34\u0F36\u0F38\u0FBE-\u0FC5\u0FC7-\u0FCC\u0FCE\u0FCF\u0FD5-\u0FD8\u109E\u109F\u1390-\u1399\u1940\u19DE-\u19FF\u1B61-\u1B6A\u1B74-\u1B7C\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116\u2117\u211E-\u2123\u2125\u2127\u2129\u212E\u213A\u213B\u214A\u214C\u214D\u214F\u218A\u218B\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D3\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u23B4-\u23DB\u23E2-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u266E\u2670-\u2767\u2794-\u27BF\u2800-\u28FF\u2B00-\u2B2F\u2B45\u2B46\u2B4D-\u2B73\u2B76-\u2B95\u2B98-\u2BC8\u2BCA-\u2BFE\u2CE5-\u2CEA\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3004\u3012\u3013\u3020\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u31C0-\u31E3\u3200-\u321E\u322A-\u3247\u3250\u3260-\u327F\u328A-\u32B0\u32C0-\u32FE\u3300-\u33FF\u4DC0-\u4DFF\uA490-\uA4C6\uA828-\uA82B\uA836\uA837\uA839\uAA77-\uAA79\uFDFD\uFFE4\uFFE8\uFFED\uFFEE\uFFFC\uFFFD\U00010137-\U0001013F\U00010179-\U00010189\U0001018C-\U0001018E\U00010190-\U0001019B\U000101A0\U000101D0-\U000101FC\U00010877\U00010878\U00010AC8\U0001173F\U00016B3C-\U00016B3F\U00016B45\U0001BC9C\U0001D000-\U0001D0F5\U0001D100-\U0001D126\U0001D129-\U0001D164\U0001D16A-\U0001D16C\U0001D183\U0001D184\U0001D18C-\U0001D1A9\U0001D1AE-\U0001D1E8\U0001D200-\U0001D241\U0001D245\U0001D300-\U0001D356\U0001D800-\U0001D9FF\U0001DA37-\U0001DA3A\U0001DA6D-\U0001DA74\U0001DA76-\U0001DA83\U0001DA85\U0001DA86\U0001ECAC\U0001F000-\U0001F02B\U0001F030-\U0001F093\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F110-\U0001F16B\U0001F170-\U0001F1AC\U0001F1E6-\U0001F202\U0001F210-\U0001F23B\U0001F240-\U0001F248\U0001F250\U0001F251\U0001F260-\U0001F265\U0001F300-\U0001F3FA\U0001F400-\U0001F6D4\U0001F6E0-\U0001F6EC\U0001F6F0-\U0001F6F9\U0001F700-\U0001F773\U0001F780-\U0001F7D8\U0001F800-\U0001F80B\U0001F810-\U0001F847\U0001F850-\U0001F859\U0001F860-\U0001F887\U0001F890-\U0001F8AD\U0001F900-\U0001F90B\U0001F910-\U0001F93E\U0001F940-\U0001F970\U0001F973-\U0001F976\U0001F97A\U0001F97C-\U0001F9A2\U0001F9B0-\U0001F9B9\U0001F9C0-\U0001F9C2\U0001F9D0-\U0001F9FF\U0001FA60-\U0001FA6D]c              C   s&   t ttd} | dr"td S )Nutf8rt   )recompiler   prefix_searchdecodesearchr   )r.   r   r   r   test_issue3356  s    r   c           	   C   s   ddg} t  }t|j}t|j}t  t|j| dd}W d Q R X t  t|jj| dd}W d Q R X t  t|j|dd W d Q R X t  t|j|dd W d Q R X d S )NzHello worldzThis is a testr8   )Z	n_threads)	r   r   rT   r   pytestZdeprecated_callrE   pipe	tokenizer)Ztextsr   r,   ZphrasematcherZdocsr   r   r   test_issue3410  s    





r   c              C   sv   t jdddgdddgdddggdd	} t| d
ddgd}|t jdddgdddggdd	\}}}|d dksrtd S )Nr   ra   r   rD   r\         f)rk   r{   r|   C)rY   keys)rl   Zasarrayr   Zmost_similarr   )rY   Zvectorsr   Z	best_rowsZscoresr   r   r   test_issue3412)  s
    &&r   c              C   sL   t ddd} t| }|dks tt| }|dks4tt| }|dksHtd S )Ng      $@g      ?g      ?g      #@g      "@)r   nextr   )sizessizer   r   r   test_issue34472  s    r   z;default suffix rules avoid one upper-case letter before dotc              C   st   t  } | | d d}d}d}| |}| |}| |}|d jdksLt|d jdks^t|d jdksptd S )Nsentencizerz>He gave the ball to I. Do you want to go to the movies with I?z?He gave the ball to I.  Do you want to go to the movies with I?z>He gave the ball to I.
Do you want to go to the movies with I?r   r   )r   rJ   rI   r<   r   )r   Ztext1Ztext2Ztext3t1t2t3r   r   r   test_issue3449<  s    r   zignore::UserWarningc              C   s4   t  } | | d |   t| ddg d S )NZtaggerhir   )r   rJ   rI   rL   rE   r   )r   r   r   r   test_issue3456K  s    r   c              C   s   t  } | | d | d}|d js,t|js6ttt|jdksLt|	 }t
| j|}|d jsrt|js|ttt|jdkstdS )zlTest that sentence boundaries are set correctly so Doc.is_sentenced can
    be restored after serialization.r   zHello worldr   ra   N)r   rJ   rI   r   r   Zis_sentencedr   rE   Zsentsr=   r	   rT   r>   )r   r   rA   Znew_docr   r   r   test_issue3468T  s    

r   );
__future__r   r   Zspacy.lang.enr   Zspacy.lang.der   Zspacy.pipeliner   r   Zspacy.matcherr   r   Zspacy.tokensr	   Zspacy.vocabr
   Zspacy.attrsr   r   Zspacy.compatr   r   r   Zspacyr   Z
spacy.utilr   rl   r   Zspacy.vectorsr   utilr   r   r0   rB   rF   rN   rU   markZskipifrZ   r]   ro   rq   rz   r   r   r   r   r   r   r   Zxfailr   filterwarningsr   r   r   r   r   r   <module>   sJ   	

	:	
	