B
    ,(bl*                 @   sF  d dl mZmZ d dlZd dlmZmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ edddddddgZ d*ddZ!dd Z"d+d d!Z#G d"d# d#eZ$G d$d% d%ej%Z&G d&d' d'eZ'd(d) Z(e)e'e( d'gZ*dS ),    )unicode_literalsprint_functionN)
namedtupleOrderedDict   )
STOP_WORDS)SYNTAX_ITERATORS)TAG_MAP)TAG_ORTH_MAP)TAG_BIGRAM_MAP   )LANG)copy_reg)Errors)Language)POS)Doc)DummyTokenizer)utilDetailedTokensurfacetaginflemmareading
sub_tokensAc             C   sp   yNddl m}m} |jjj|jjj|jjj|jjjd|  } | j	| d}|S  t
k
rj   t
dY nX dS )zSudachiPy is required for Japanese support, so check for it.
    It it's not available blow up and explain how to fix it.
    split_mode should be one of these values: "A", "B", "C", None->"A".r   )
dictionary	tokenizer)Nr   BC)modezJapanese support requires SudachiPy and SudachiDict-core (https://github.com/WorksApplications/SudachiPy). Install with `pip install sudachipy sudachidict_core` or install spaCy with `pip install spacy[ja]`.N)Z	sudachipyr   r   	Tokenizer	SplitModer   r   r    Z
DictionarycreateImportError)
split_moder   r   tok r(   E/home/dcms/DCMS/lib/python3.7/site-packages/spacy/lang/ja/__init__.pytry_sudachi_import   s    r*   c             C   st   |t kr$t | }| |kr$||  dfS |rd||f}|tkrdt| \}}|dkr\t| t |fS ||fS t| t dfS )a2  If necessary, add a field to the POS tag for UD mapping.
    Under Universal Dependencies, sometimes the same Unidic POS tag can
    be mapped differently depending on the literal token or its context
    in the sentence. This function returns resolved POSs for both token
    and next_token by tuple.
    N)r
   r   r	   r   )Zorthr   Znext_tagZorth_mapZ
tag_bigramZcurrent_posnext_posr(   r(   r)   resolve_pos3   s    r,      空白c          
   C   s  dd | D }d d | d | krDttjj||dg }g }d}t|dkrd||fS tdd |D dkr| stt	||d|d d g}dg}||fS x t
t|| D ]\}\}}	| rqy||d  |}
W n* tk
r   ttjj||dY nX |
dkrV||||
  }|t	||d|d d  |d ||
7 }||	 |d |t|7 }|d t| k r| |d  jd	krd
|d< |d7 }qW |t|k r||d  }|t	||d|d d  |d ||fS )Nc             S   s   g | ]
}|j qS r(   )r   ).0xr(   r(   r)   
<listcomp>T   s    z*get_dtokens_and_spaces.<locals>.<listcomp> )textwordsr   c             S   s   g | ]}|  s|qS r(   )isspace)r.   wordr(   r(   r)   r0   ^   s    Fr    T)joinsplit
ValueErrorr   ZE194formatlenr4   AssertionErrorr   	enumeratezipindexappendr   )dtokensr2   Zgap_tagr3   Ztext_dtokensZtext_spacesZtext_posir5   dtokenZ
word_startwr(   r(   r)   get_dtokens_and_spacesR   sH    "



"
rF   c               @   sh   e Zd Zdi fddZdd ZdddZd	d
 Zdd Zi fddZdd Z	dd Z
dd Zdd ZdS )JapaneseTokenizerNc             C   s8   |d k	r|j n||| _ |dd | _t| j| _d S )Nr&   )vocabZcreate_vocabgetr&   r*   r   )selfclsnlpconfigr(   r(   r)   __init__   s    zJapaneseTokenizer.__init__c             C   s  | j |}| |}t||\}}|r0t| ng gd \}}}}}	}
t|
}
t| j||d}d }x|tt||D ]j\}\}}|j	|_
|r||_d }n2t|j|j	|d t|k r||d  nd \|_}|jr|jn|j|_qrW ||jd< |	|jd< |
|jd< d|_|S )N   )r3   spacesr   inflectionsZreading_formsr   T)r   tokenize_get_dtokensrF   r?   listr   rH   r>   r   Ztag_posr,   Zorth_r<   r   r   Zlemma_	user_dataZ	is_tagged)rJ   r2   sudachipy_tokensrB   rP   r3   tagsrQ   ZlemmasZreadingssub_tokens_listdocr+   idxtokenrD   r(   r(   r)   __call__   s,    
"(


zJapaneseTokenizer.__call__Tc                s>   |r|  |nd fddt|D   fddt D S )Nc                s   g | ]z\}}t | d krt| ddd | dd D ddd | dd D | |  rx | ndqS )r   -c             S   s   g | ]}|d kr|qS )*r(   )r.   xxr(   r(   r)   r0      s    z=JapaneseTokenizer._get_dtokens.<locals>.<listcomp>.<listcomp>N   ,c             S   s   g | ]}|d kr|qS )r_   r(   )r.   r`   r(   r(   r)   r0      s    )r<   r   r   r8   Zpart_of_speechZdictionary_formZreading_form)r.   r[   r\   )rY   r(   r)   r0      s   z2JapaneseTokenizer._get_dtokens.<locals>.<listcomp>c                sT   g | ]L\}}|d ksL|j  rL|jdksL |d  j  rL |d  jdkr|qS )r   u   空白r   )r   r4   r   )r.   r[   t)rB   r(   r)   r0      s    )_get_sub_tokensr>   )rJ   rW   Zneed_sub_tokensr(   )rB   rY   r)   rS      s    
zJapaneseTokenizer._get_dtokensc             C   s   | j d ks| j dkrd S g }x|D ]}|| jjj}t|dkrN|d  q"| j dkrn|| |dg q"|| jjj}t|t|kr| |d}|||g q"|| |d| |dg q"W |S )Nr   r   r   F)	r&   r9   r   r#   r   r<   rA   rS   r   )rJ   rW   rY   r\   Zsub_aZsub_brB   r(   r(   r)   rd      s    

"z!JapaneseTokenizer._get_sub_tokensc             C   s   t d| jff}|S )Nr&   )r   r&   )rJ   rM   r(   r(   r)   _get_config   s    zJapaneseTokenizer._get_configc             C   s   | dd | _d S )Nr&   )rI   r&   )rJ   rM   r(   r(   r)   _set_config   s    zJapaneseTokenizer._set_configc                s"   t d fddff}t|g S )Ncfgc                  s   t   S )N)srslyZ
json_dumpsre   r(   )rJ   r(   r)   <lambda>       z,JapaneseTokenizer.to_bytes.<locals>.<lambda>)r   r   to_bytes)rJ   kwargsserializersr(   )rJ   r)   rk      s    zJapaneseTokenizer.to_bytesc                s4   t d fddff}t||g  t j _ S )Nrg   c                s     t| S )N)rf   rh   Z
json_loads)b)rJ   r(   r)   ri      rj   z.JapaneseTokenizer.from_bytes.<locals>.<lambda>)r   r   
from_bytesr*   r&   r   )rJ   datarl   Zdeserializersr(   )rJ   r)   ro      s
    zJapaneseTokenizer.from_bytesc                s.   t |}td fddff}t ||g S )Nrg   c                s   t |   S )N)rh   
write_jsonre   )p)rJ   r(   r)   ri      rj   z+JapaneseTokenizer.to_disk.<locals>.<lambda>)r   ensure_pathr   to_disk)rJ   pathrl   rm   r(   )rJ   r)   rt      s    
zJapaneseTokenizer.to_diskc                s>   t |}td fddff}t ||g  t j _d S )Nrg   c                s     t| S )N)rf   rh   	read_json)rr   )rJ   r(   r)   ri     rj   z-JapaneseTokenizer.from_disk.<locals>.<lambda>)r   rs   r   	from_diskr*   r&   r   )rJ   ru   rl   rm   r(   )rJ   r)   rw      s
    
zJapaneseTokenizer.from_disk)T)__name__
__module____qualname__rN   r]   rS   rd   re   rf   rk   ro   rt   rw   r(   r(   r(   r)   rG      s    

	rG   c               @   sN   e Zd ZeejjZdd ee< eZ	e
ZeZddddZedi fddZdS )	JapaneseDefaultsc             C   s   dS )Njar(   )Z_textr(   r(   r)   ri   
  rj   zJapaneseDefaults.<lambda>ZltrF)	directionZhas_caseZhas_lettersNc             C   s   t | ||S )N)rG   )rK   rL   rM   r(   r(   r)   create_tokenizer  s    z!JapaneseDefaults.create_tokenizer)rx   ry   rz   dictr   DefaultsZlex_attr_gettersr   r   
stop_wordsr	   tag_mapr   syntax_iteratorsZwriting_systemclassmethodr~   r(   r(   r(   r)   r{     s   r{   c               @   s   e Zd ZdZeZdd ZdS )Japaneser|   c             C   s
   |  |S )N)r   )rJ   r2   r(   r(   r)   make_doc  s    zJapanese.make_docN)rx   ry   rz   langr{   r   r   r(   r(   r(   r)   r     s   r   c             C   s
   t t fS )N)r   tuple)instancer(   r(   r)   pickle_japanese  s    r   )r   )r-   )+
__future__r   r   rh   collectionsr   r   r   r   r   r   r   r	   Ztag_orth_mapr
   Ztag_bigram_mapr   attrsr   compatr   errorsr   languager   symbolsr   tokensr   r   r   r1   r   r*   r,   rF   rG   r   r{   r   r   pickle__all__r(   r(   r(   r)   <module>   s4   

5 