B
    .(b                 @   s   d dl mZ d dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ ejdd Zd	d
 Zdd Zdd Zdd ZdS )    )unicode_literalsN)English)	Tokenizer)compile_prefix_regexcompile_suffix_regex)compile_infix_regexc             C   sV   t tjj}ttjj}ddddg}t|}td}t	| tjj
|j|j|j|jdS )Nz\.\.\.+z(?<=[0-9])-(?=[0-9])z[0-9]+(,[0-9]+)+u   [\[\]!&:,()\*—–\/-]za-b)Ztoken_match)r   r   ZDefaultsprefixesr   suffixesr   recompiler   Ztokenizer_exceptionssearchfinditermatch)Zen_vocabZ	prefix_reZ	suffix_reZcustom_infixesZinfix_reZtoken_match_re r   \/home/dcms/DCMS/lib/python3.7/site-packages/spacy/tests/lang/en/test_customized_tokenizer.pycustom_en_tokenizer   s    
r   c             C   s   d}dd | |D }|dddddd	d
ddddddddddgksDt d}dd | |D }|ddddddd	d
ddddddddddgkst d S )Nz\The 8 and 10-county definitions are not used for the greater Southern California Megaregion.c             S   s   g | ]
}|j qS r   )text).0wordr   r   r   
<listcomp>$   s    z@test_en_customized_tokenizer_handles_infixes.<locals>.<listcomp>The8and10-countydefinitionsarenotusedforthegreaterSouthern
California
Megaregion.z]The 8- and 10-county definitions are not used for the greater Southern California Megaregion.c             S   s   g | ]
}|j qS r   )r   )r   r   r   r   r   r   :   s    )AssertionError)r   sentencecontextr   r   r   ,test_en_customized_tokenizer_handles_infixes"   sR    r*   c             C   sH   d}dd | |D }|dddddd	d
ddddddddddgksDt d S )Nz\The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion.c             S   s   g | ]
}|j qS r   )r   )r   r   r   r   r   r   S   s    zDtest_en_customized_tokenizer_handles_token_match.<locals>.<listcomp>r   r   r   r   r   r   r   za-br   r   r    r!   r"   r#   r$   r%   r&   )r'   )r   r(   r)   r   r   r   0test_en_customized_tokenizer_handles_token_matchQ   s(    r+   c             C   sJ   d}dd | |D }|dddddd	d
dddddddddddgksFt d S )Nz_The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)c             S   s   g | ]
}|j qS r   )r   )r   r   r   r   r   r   k   s    z>test_en_customized_tokenizer_handles_rules.<locals>.<listcomp>r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   z:))r'   )r   r(   r)   r   r   r   *test_en_customized_tokenizer_handles_rulesi   s*    r,   c             C   s^   d}| j }|d= || _ dd | |D }|ddddd	d
dddddddddddddgksZtd S )Nz_The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :)z:)c             S   s   g | ]
}|j qS r   )r   )r   r   r   r   r   r      s    zGtest_en_customized_tokenizer_handles_rules_property.<locals>.<listcomp>r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   :))rulesr'   )r   r(   r/   r)   r   r   r   3test_en_customized_tokenizer_handles_rules_property   s2    r0   )
__future__r   Zpytestr
   Zspacy.lang.enr   Zspacy.tokenizerr   Z
spacy.utilr   r   r   Zfixturer   r*   r+   r,   r0   r   r   r   r   <module>   s   /