B
    .(b              
   @   s   d dl mZ d dlZd dlmZ d dlmZmZ ddlm	Z	 dd Z
d	d
 Zdd Zdd Zdd Zdd Zdd Zejdddii gddii gddii gddiddigdddddigddigdd Zdd Zdd  Zd!d" ZdS )#    )unicode_literalsN)Vocab)DocToken   )get_docc          	   C   s  dddg}dddg}t | ||d}t|dks2ttt|dksFt|d jjdksZt|d jjdksnt| H}|j|d d	d
g|d df|d gdgd d	d
gdgd dd W d Q R X t|dkst|d jd	kst|d jjd
kst|d jdkst|d jdks"t|d jd
ks6t|d jjdksLt|d jdks`t|d jjdksvt|d jdkst|d jjdksttt|dkstd S )N
LosAngelesstart.   r   )wordsheads      LosAngelesZNNPr   ZGPE)taglemmaZent_type)attrs   )	r   lenAssertionErrorstrheadtext
retokenizesplitidx)en_vocabr   r   docretokenizer r!   T/home/dcms/DCMS/lib/python3.7/site-packages/spacy/tests/doc/test_retokenize_split.pytest_doc_retokenize_split   s6    


r#   c          	   C   s   t | dddgd}|jjd}|jjd}| 8}|j|d dd	g|d d
f|d
 gd||gid W d Q R X |d j|kst|d
 j|kstd S )Nr   r	   r
   )r   amodsubjectr   r   r   r   dep)r   )r   vocabstringsaddr   r   r&   r   )r   r   Zdep1Zdep2r    r!   r!   r"   &test_doc_retokenize_split_dependencies,   s    
r*   c          
   C   s   t | dddgd}tt6 | "}||d ddg|d g W d Q R X W d Q R X ttB | .}||d ddg|d |d |d g W d Q R X W d Q R X d S )	Nr   r	   r
   )r   r   r   r   r   )r   pytestraises
ValueErrorr   r   )r   r   r    r!   r!   r"   %test_doc_retokenize_split_heads_error;   s    
0
r.   c           	   C   s   dddg} t t | d}|jjdddfg|_|d jdksBt|d	 jd
ksTt| 8}|	|d dddg|d d	f|d df|d	 g W d Q R X |d jdkst|d	 jd
kst|d jd
kst|d jd
kstd S )Nabcde)r   zent-abcdr   r   Br   Iabcr   )
r   r   r'   r(   r)   ZentsZent_iob_r   r   r   )r   r   r    r!   r!   r"   *test_doc_retokenize_spans_entity_split_iobH   s    

<r7   c       
      C   sJ  ddddddddd	d
dddddg}dddddddddddddddg}dddddddddddd d!d"dg}t | |||d#}t|j\}}t|}t|}| j}	|	j|d d$d%g|d df|d gd&d"dgid' |	j|d( d)d*g|d( df|d+ gd&d"d,gid' W d Q R X t|j\}}t||d ks0tt||d ksFtd S )-NZ
StewartLeeisr4   ZstandZupZcomedianr
   ZHeZlivesinZEnglandandZlovesZJoePasqualer   r   r   ZnsubjROOTZdetr$   ZprtattrpunctprepZpobjccZconjZcompound)r   r   depsZStewartZLeer&   )r      ZJoeZPasquale   Zdobj)r   listZsentsr   r   r   r   )
r   r   r   rE   r   Zsent1Zsent2Zinit_lenZ	init_len2r    r!   r!   r"   5test_doc_retokenize_spans_sentence_update_after_splitW   s.    "
rI   c          
   C   sj   t | dddgd}ttD | 0}||d ddg|d df|d dfg W dQ R X W dQ R X dS )	al  Test that the regular retokenizer.split raises an error if the orths
    don't match the original token text. There might still be a method that
    allows this, but for the default use cases, merging and splitting should
    always conform with spaCy's non-destructive tokenization policy. Otherwise,
    it can lead to very confusing and unexpected results.
    r   r	   r
   )r   r   LAN)r   r+   r,   r-   r   r   )r   r   r    r!   r!   r"   (test_doc_retokenize_split_orths_mismatchu   s    
rL   c          	   C   s  t jdddd t jdddd t| ddgd	}| T}|d
 df|d g}dddddig}ddg|d}|j|d
 ddg||d W d Q R X |d
 jdkst|d
 jjdkst|d
 jj	dkst|d jdkst|d jjdkst|d jj	dkstd S )Nr4   FT)defaultforcer5   Znothingr   r	   )r   r   r   1)r4   r5   2ZlosZangeles)r   _r   r   )r   )
r   set_extensionr   r   r   Zlemma_r   rQ   r4   r5   )r   r   r    r   Z
underscorer   r!   r!   r"   )test_doc_retokenize_split_extension_attrs   s    
$rS   underscore_attrsr4   xr5   r6   )r4   rU   c          
   C   s   t jdddd t jddd dd t jd	d
d dd t| ddgd}d|i}ttH | 4}|d df|d g}|j|d ddg||d W d Q R X W d Q R X d S )NrU   FT)rM   rN   r4   c             S   s   | S )Nr!   )rU   r!   r!   r"   <lambda>       zCtest_doc_retokenize_split_extension_attrs_invalid.<locals>.<lambda>)getterrN   r5   c             S   s   | S )Nr!   )rU   r!   r!   r"   rV      rW   )methodrN   r   r	   )r   rQ   r   r   r   r   )r   )r   rR   r   r+   r,   r-   r   r   )r   rT   r   r   r    r   r!   r!   r"   1test_doc_retokenize_split_extension_attrs_invalid   s    
rZ   c          	   C   s   t | dgdd jrtt | dgdd jr0tt | ddgd}|d jrNt| @}ddd	gi}|d d
f|d
 g}|j|d ddg||d W dQ R X |d jst|d
 jrtdS )a  Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    r   )r   r   r   r   r	   is_stopTFr   )r   N)r   r[   r   r   r   )r   r   r    r   r   r!   r!   r"   $test_doc_retokenizer_split_lex_attrs   s    
$r\   c          	   C   s   d}t | | dd d}| <}|d }|dfgt| }|j||j t|j|d W dQ R X t | | d}| <}|d }|dfgt| }|j||j t|j|d W dQ R X dS )zB#4604: realloc correctly when new tokens outnumber original tokenszOHyperglycemic adverse events following antipsychotic drug administration in theNr;   )r   r   )r   )r   r   r   r   irH   r   )r   r   r   r    tokenr   r!   r!   r"   test_doc_retokenizer_realloc   s    
&
r_   c          	      s   d}t | | d}d|d _|d  | 2}|j dddd	d
g fddtdD d W dQ R X |d jdksxt|d jdkst|d jd	kst|d jd	kstdS )z#6060: reset norm in splitz6The quick brownfoxjumpsoverthe lazy dog w/ white spots)r   with   r   ZbrownZfoxZjumpsZoverZthec                s   g | ]} |fqS r!   r!   ).0r   )r^   r!   r"   
<listcomp>   s    z3test_doc_retokenizer_split_norm.<locals>.<listcomp>)r   N	   zw/)r   r   Znorm_r   ranger   r   )r   r   r   r    r!   )r^   r"   test_doc_retokenizer_split_norm   s    

6rf   )
__future__r   r+   Zspacy.vocabr   Zspacy.tokensr   r   utilr   r#   r*   r.   r7   rI   rL   rS   markZparametrizerZ   r\   r_   rf   r!   r!   r!   r"   <module>   s,   !


