B
    /`W                 @   sh  d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	m
Z
mZmZ d dlm  m  mZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lm Z  d d
lm!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 e 4e5Z6ej7rFd dl8m9Z9 G dd dZ:G dd deZ;dS )    N)AnyDictListOptionalTextTupleTypeCallable)determine_token_labels)POS_TAG_KEY)RasaNLUModelConfig)	Tokenizer)	Component)EntityExtractor)Metadata)Token)TrainingData)Message)TOKENS_NAMES)TEXTENTITIESENTITY_ATTRIBUTE_TYPEENTITY_ATTRIBUTE_GROUPENTITY_ATTRIBUTE_ROLENO_ENTITY_TAGSPLIT_ENTITIES_BY_COMMA)DOCS_URL_COMPONENTS)
BILOU_FLAG)CRFc               @   s0   e Zd Zeeeeef ejeeedddZdS )CRFToken)textpos_tagpatterndense_features
entity_tagentity_role_tagentity_group_tagc             C   s.   || _ || _|| _|| _|| _|| _|| _d S )N)r    r!   r"   r#   r$   r%   r&   )selfr    r!   r"   r#   r$   r%   r&    r(   W/home/dcms/DCMS/lib/python3.7/site-packages/rasa/nlu/extractors/crf_entity_extractor.py__init__(   s    
zCRFToken.__init__N)	__name__
__module____qualname__r   r   r   npZndarrayr*   r(   r(   r(   r)   r   '   s   
r   c                   sZ  e Zd ZU eeee  dddZede	dddddgdd	d
ddddddddgdddggdddddddg iZ
dd dd dd dd dd dd dd dd d d d!d d"d d#d d$d d%d d&d d'd d(Zeeeegef f ed)< ddeeeef  eeed+f  d*d, fd-d.Zd*dd/d0Zeee dd1d2Zdeeee ed*d3d4d5Zed6d7d8Zeed*d9d:d;Zeeeeef  d<d=d>Zee eeeeeef  f d?d@dAZeeeef  eee ee f dBdCdDZ ee! eeeeeef  f eeeee f eeee f f dEdFdGZ"edfeeef ee#ed  ed dHdIdJZ$eeeeeef  dKdLdMZ%dgee e&eeeef  dOdPdQZ'ee e(e(e)e&dRdSdTZ*e+ee eee dUdVdWZ,e+ee(eee&f dXdYdZZ-eee d<d[d\Z.eee d<d]d^Z/eeeee f d<d_d`Z0eee  d*dadbdcZ1  Z2S )hCRFEntityExtractor)returnc             C   s   t gS )N)r   )clsr(   r(   r)   required_components<   s    z&CRFEntityExtractor.required_componentsTfeatureslowtitleupperbiasprefix5prefix2suffix5suffix3suffix2digitr"   max_iterations2   L1_cg?L2_cfeaturizersc             C   s
   | j  S )N)r    lower)	crf_tokenr(   r(   r)   <lambda>k       zCRFEntityExtractor.<lambda>c             C   s
   | j  S )N)r    istitle)rD   r(   r(   r)   rE   l   rF   c             C   s   | j d d S )N   )r    )rD   r(   r(   r)   rE   m   rF   c             C   s   | j d d S )N   )r    )rD   r(   r(   r)   rE   n   rF   c             C   s   | j dd  S )N)r    )rD   r(   r(   r)   rE   o   rF   c             C   s   | j dd  S )N)r    )rD   r(   r(   r)   rE   p   rF   c             C   s   | j dd  S )N)r    )rD   r(   r(   r)   rE   q   rF   c             C   s   | j dd  S )N)r    )rD   r(   r(   r)   rE   r   rF   c             C   s   dS )Nr7   r(   )rD   r(   r(   r)   rE   s   rF   c             C   s   | j S )N)r!   )rD   r(   r(   r)   rE   t   rF   c             C   s   | j d k	r| j d d S d S )NrI   )r!   )rD   r(   r(   r)   rE   u   s    c             C   s
   | j  S )N)r    isupper)rD   r(   r(   r)   rE   x   rF   c             C   s
   | j  S )N)r    isdigit)rD   r(   r(   r)   rE   y   rF   c             C   s   | j S )N)r"   )rD   r(   r(   r)   rE   z   rF   c             C   s   | j S )N)r#   )rD   r(   r(   r)   rE   {   rF   c             C   s   | j S )N)r$   )rD   r(   r(   r)   rE   |   rF   )r4   r5   r8   r9   r:   r;   r<   Zsuffix1r7   pospos2r6   r=   r"   text_dense_featuresentityfunction_dictNr   )component_configentity_taggersr0   c                s4   t  | || _tttg| _|   |  | _	d S )N)
superr*   rV   r   r   r   	crf_order_validate_configurationZinit_split_entitiessplit_entities_config)r'   rU   rV   )	__class__r(   r)   r*      s    zCRFEntityExtractor.__init__c             C   s&   t | jdg d dkr"tdd S )Nr3   rI      z>Need an odd number of crf feature lists to have a center word.)lenrU   get
ValueError)r'   r(   r(   r)   rY      s    z*CRFEntityExtractor._validate_configurationc             C   s   ddgS )Nsklearn_crfsuiteZsklearnr(   )r1   r(   r(   r)   required_packages   s    z$CRFEntityExtractor.required_packages)training_dataconfigkwargsr0   c                sh   |j std d S  |  jt r2t|  |  	|j
} fdd|D } | d S )NzQNo training examples with entities present. Skip trainingof 'CRFEntityExtractor'.c                s   g | ]}  |qS r(   )_convert_to_crf_tokens).0Zexample)r'   r(   r)   
<listcomp>   s    z,CRFEntityExtractor.train.<locals>.<listcomp>)entity_examplesloggerdebugZ check_correct_entity_annotationsrU   r   bilou_utilsZapply_bilou_schema_update_crf_orderZfilter_trainable_entitiesZnlu_examples_train_model)r'   rb   rc   rd   rh   Zdatasetr(   )r'   r)   train   s    



zCRFEntityExtractor.train)rb   c             C   sj   g }xZ| j D ]P}|tkr*|jr*|t q|tkrD|jrD|t q|tkr|jr|t qW || _ dS )z3Train only CRFs we actually have training data for.N)rX   r   entitiesappendr   Zentity_rolesr   Zentity_groups)r'   rb   Z
_crf_ordertag_namer(   r(   r)   rl      s    z$CRFEntityExtractor._update_crf_order)messagerd   r0   c             K   s4   |  |}| |}|jt|tg | dd d S )NT)Zadd_to_output)extract_entitiesZadd_extractor_namesetr   r^   )r'   rr   rd   ro   r(   r(   r)   process   s    

zCRFEntityExtractor.process)rr   r0   c             C   s   | j dkrg S |tt }| |}i }xH| j  D ]:\}}|tk}|rV| || | ||}|	|||< q6W | 
||\}	}
| |t||	| j|
S )zCExtract entities from the given message using the trained model(s).N)rV   r^   r   r   re   itemsr   _add_tag_to_crf_token_crf_tokens_to_featuresZpredict_marginals_single_tag_confidencesZ!convert_predictions_into_entitiesrZ   )r'   rr   tokens
crf_tokenspredictionsrq   entity_taggerinclude_tag_featuresr3   tagsconfidencesr(   r(   r)   rs      s    

z#CRFEntityExtractor.extract_entities)r{   r|   c             C   s<   t |kr8| |t  \}}xt||D ]\}}||_q&W dS )z(Add predicted entity tags to CRF tokens.N)r   _most_likely_tagzipr$   )r'   r{   r|   _tags_tagtokenr(   r(   r)   rw      s    z(CRFEntityExtractor._add_tag_to_crf_token)r|   r0   c                sv   g }g }xd|D ]\t fddd |  | jt r\|t fdd D  q|   qW ||fS )zGet the entity tags with the highest confidence.

        Args:
            predictions: list of mappings from entity tag to confidence value

        Returns:
            List of entity tags and list of confidence values.
        c                s    |  S )Nr(   )key)token_predictionsr(   r)   rE     rF   z5CRFEntityExtractor._most_likely_tag.<locals>.<lambda>)r   c             3   s*   | ]"\}}t  t |kr|V  qd S )N)rk   Ztag_without_prefix)rf   Z_tagZ_confidence)r   r(   r)   	<genexpr>
  s   z6CRFEntityExtractor._most_likely_tag.<locals>.<genexpr>)maxrp   rU   r   sumrv   )r'   r|   r   _confidencesr(   )r   r   r)   r      s    



z#CRFEntityExtractor._most_likely_tag)rz   r|   r0   c       	      C   sv   i }i }xd|  D ]X\}}t|t|kr2td| |\}}| jt rZt||\}}|||< |||< qW ||fS )zBGet most likely tag predictions with confidence values for tokens.z>Inconsistency in amount of tokens between crfsuite and message)rv   r]   	Exceptionr   rU   r   rk   Zensure_consistent_bilou_tagging)	r'   rz   r|   r   r   rq   Zpredicted_tagsr   r   r(   r(   r)   ry     s    
z#CRFEntityExtractor._tag_confidences)meta	model_dirmodel_metadatacached_componentrd   r0   c          	   K   s   dd l }|d}i }|s>tdtj| d | |dS x^| D ]R\}	}
tj||
}tj	|rz|
|||	< qHtd|	 dtj| d qHW | ||S )Nr   fileszFailed to load model for 'CRFEntityExtractor'. Maybe you did not provide enough training data and no model was trained or the path 'z' doesn't exist?)rU   zFailed to load model for tag 'zq' for 'CRFEntityExtractor'. Maybe you did not provide enough training data and no model was trained or the path ')joblibr^   ri   rj   ospathabspathrv   joinexistsload)r1   r   r   r   r   rd   r   
file_namesrV   name	file_nameZ
model_filer(   r(   r)   r   .  s    	

 zCRFEntityExtractor.load)r   r   r0   c             C   sd   ddl }i }| jr\xH| j D ]:\}}| d| d}tj||}||| |||< qW d|iS )znPersist this model into the passed directory.

        Returns the metadata necessary to load the model again.r   N.z.pklr   )r   rV   rv   r   r   r   dump)r'   r   r   r   r   r   r}   Zmodel_file_namer(   r(   r)   persistR  s    zCRFEntityExtractor.persistF)r{   r~   r0   c       
      C   sd   | j d }g }xPtt|D ]@}t|}|d }t| |d }| |||||}	||	 qW |S )z2Convert the list of tokens into discrete features.r3   rI   r\   )rU   ranger]   _create_features_for_tokenrp   )
r'   r{   r~   configured_featuresZsentence_features	token_idxZwindow_sizehalf_window_sizewindow_rangetoken_featuresr(   r(   r)   rx   d  s    
z*CRFEntityExtractor._crf_tokens_to_features)r{   r   r   r   r~   c          
   C   s  | j d }dd |D }i }x|D ]}	||	 }
|
t|krDd|d< q"|
dk rVd|d< q"||
 }|	| }|| }|| }g }|r|d xv|| D ]j}|d	kr| j| |}xN| D ] \}}||| d
| d
| < qW q| j| |}||| d
| < qW q"W |S )zTConvert a token into discrete features including word before and word
        after.r3   c             S   s   g | ]}t |qS r(   )str)rf   ir(   r(   r)   rg     s    zACRFEntityExtractor._create_features_for_token.<locals>.<listcomp>TZEOSr   ZBOSrS   r"   :)rU   r]   rp   rT   rv   )r'   r{   r   r   r   r~   r   prefixesr   Zpointer_positionZcurrent_token_idxr   Zcurrent_feature_idxr3   prefixZadditional_featuresfeatureZregex_patternsZpattern_namematchedvaluer(   r(   r)   r     s0    




z-CRFEntityExtractor._create_features_for_token)r{   rq   r0   c             C   s:   |t krdd | D S |tkr,dd | D S dd | D S )z/Return the list of tags for the given tag name.c             S   s   g | ]
}|j qS r(   )r%   )rf   rD   r(   r(   r)   rg     s    z:CRFEntityExtractor._crf_tokens_to_tags.<locals>.<listcomp>c             S   s   g | ]
}|j qS r(   )r&   )rf   rD   r(   r(   r)   rg     s    c             S   s   g | ]
}|j qS r(   )r$   )rf   rD   r(   r(   r)   rg     s    )r   r   )r{   rq   r(   r(   r)   _crf_tokens_to_tags  s
    z&CRFEntityExtractor._crf_tokens_to_tags)rr   idxr0   c             C   s0   |  tt dk	r,|  tt |  di S i S )a  Get the patterns of the token at the given index extracted by the
        'RegexFeaturizer'.

        The 'RegexFeaturizer' adds all patterns listed in the training data to the
        token. The pattern name is mapped to either 'True' (pattern applies to token) or
        'False' (pattern does not apply to token).

        Args:
            message: The message.
            idx: The token index.

        Returns:
            The pattern dict.
        Nr"   )r^   r   r   )rr   r   r(   r(   r)   _pattern_of_token  s    z$CRFEntityExtractor._pattern_of_tokenc       	      C   s   | t| jd \}}|dkr"dS |tt }t|t|jkrvtjj	j
jdt|j dt| dtd d dS g }x2|jD ](}dd	 t|D }d
|i}|| qW |S )z9Convert dense features to python-crfsuite feature format.rB   NzNumber of dense features (z8) for attribute 'TEXT' does not match number of tokens (z).z#crfentityextractor)Zdocsc             S   s   i | ]\}}|t |qS r(   )r   )rf   indexr   r(   r(   r)   
<dictcomp>  s   z:CRFEntityExtractor._get_dense_features.<locals>.<dictcomp>rR   )Zget_dense_featuresr   rU   r^   r   r]   r3   rasaZsharedutilsioZraise_warningr   	enumeraterp   )	r'   rr   r3   r   rz   Zfeatures_outr   Zfeature_dictZ	convertedr(   r(   r)   _get_dense_features  s"    
z&CRFEntityExtractor._get_dense_featuresc             C   s   g }| tt }| |}| |}xt|D ]z\}}| ||}| |t|}	| |t	|}
| |t
|}| t}|dk	r|| ng }|t|j||	|
|||d q0W |S )z1Take a message and convert it to crfsuite format.N)r    r!   r$   r&   r%   r"   r#   )r^   r   r   r   	_get_tagsr   r   Zget_tag_forr   r   r   r   rp   r   r    )r'   rr   Z
crf_formatrz   rR   r   r   r   r"   rS   grouproler!   r#   r(   r(   r)   re     s*    


z)CRFEntityExtractor._convert_to_crf_tokensc                s|     tt }i }xd| jD ]Z| jt rXt}  |rH  |}qldd |D }n fdd|D }||< qW |S )z$Get assigned entity tags of message.c             S   s   g | ]}t qS r(   )r   )rf   r   r(   r(   r)   rg   &  s    z0CRFEntityExtractor._get_tags.<locals>.<listcomp>c                s    g | ]}t | td qS ))Zattribute_key)r
   r^   r   )rf   r   )rr   rq   r(   r)   rg   )  s   )r^   r   r   rX   rU   r   rk   Zget_bilou_key_for_tag)r'   rr   rz   r   Z	bilou_keyr   r(   )rr   rq   r)   r     s    


zCRFEntityExtractor._get_tags)df_trainr0   c                s   ddl }i _xjD ]td d tk  fdd|D }fdd|D }|jdjd	 jd
 jd dd}||| |j< td qW dS )z0Train the crf tagger based on the training data.r   NzTraining CRF for 'z'.c                s   g | ]} | qS r(   )rx   )rf   sentence)r~   r'   r(   r)   rg   >  s   z3CRFEntityExtractor._train_model.<locals>.<listcomp>c                s   g | ]}  |qS r(   )r   )rf   r   )r'   rq   r(   r)   rg   B  s    Zlbfgsr@   rA   r>   T)	algorithmZc1c2r>   Zall_possible_transitionszTraining finished.)	r`   rV   rX   ri   rj   r   r   rU   Zfit)r'   r   r`   ZX_trainZy_trainr}   r(   )r~   r'   rq   r)   rm   2  s"    
zCRFEntityExtractor._train_model)NN)N)NNN)F)3r+   r,   r-   classmethodr   r   r   r2   r   r   defaultsrT   r   r   r	   r   r   __annotations__r   r*   rY   ra   r   r   rn   rl   r   ru   rs   floatrw   r   r   r   ry   r   r   r   boolrx   intr   r   staticmethodr   r   r   re   r   rm   __classcell__r(   r(   )r[   r)   r/   ;   s   

& 
"	!.  
8	!r/   )<loggingr   typingnumpyr.   r   r   r   r   r   r   r   r	   Zrasa.nlu.utils.bilou_utilsZnlur   rk   Zrasa.shared.utils.ior   Zrasa.nlu.testr
   Z#rasa.nlu.tokenizers.spacy_tokenizerr   Zrasa.nlu.configr   Zrasa.nlu.tokenizers.tokenizerr   Zrasa.nlu.componentsr   Zrasa.nlu.extractors.extractorr   Zrasa.nlu.modelr   r   Z+rasa.shared.nlu.training_data.training_datar   Z%rasa.shared.nlu.training_data.messager   Zrasa.nlu.constantsr   Zrasa.shared.nlu.constantsr   r   r   r   r   r   r   Zrasa.shared.constantsr   Zrasa.utils.tensorflow.constantsr   	getLoggerr+   ri   TYPE_CHECKINGr`   r   r   r/   r(   r(   r(   r)   <module>   s2   ($	
