B
    /`{F                 @   s   d dl mZmZmZmZmZmZmZ d dlZ	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' d dl(Z	G d
d deZ)G dd deZ*dS )    )AnyDictListTextTupleOptional
NamedTupleN)DOCS_URL_TRAINING_DATA_NLU)TrainingData)Message)Token)	Component)TOKENS_NAMES ENTITY_ATTRIBUTE_CONFIDENCE_TYPE ENTITY_ATTRIBUTE_CONFIDENCE_ROLE!ENTITY_ATTRIBUTE_CONFIDENCE_GROUP)TEXTINTENTENTITIESENTITY_ATTRIBUTE_VALUEENTITY_ATTRIBUTE_STARTENTITY_ATTRIBUTE_END	EXTRACTORENTITY_ATTRIBUTE_TYPEENTITY_ATTRIBUTE_GROUPENTITY_ATTRIBUTE_ROLENO_ENTITY_TAGSPLIT_ENTITIES_BY_COMMA%SPLIT_ENTITIES_BY_COMMA_DEFAULT_VALUE*SINGLE_ENTITY_ALLOWED_INTERLEAVING_CHARSETc               @   sB   e Zd ZU dZeed< eeef ed< eeef ed< eed< dS )EntityTagSpecz<Specification of an entity tag present in the training data.tag_nameZids_to_tagsZtags_to_idsZnum_tagsN)__name__
__module____qualname____doc__r   __annotations__r   int r(   r(   L/home/dcms/DCMS/lib/python3.7/site-packages/rasa/nlu/extractors/extractor.pyr    "   s
   
r    c               @   s  e Zd ZdZeeeef  eeeef  dddZeeef eeef dddZ	eee
f dd	d
ZeeeedddZeeeef eee eeef dddZee ee dddZed'eee eeee f eee
f eeeee f  eeeef  dddZeeeeef  eeee f edddZeeeeeee
f edddZeeeee f eeeddd Zed(ee eeeeeeeeee f  eeef d!d"d#Zeedd$d%d&ZdS ))EntityExtractorzEntity extractors are components which extract entities.

    They can be placed in the pipeline like other components, and can extract
    entities like a person's name, or a location.
    )entitiesreturnc             C   s   x|D ]}| j |t< qW |S )zAdds this extractor's name to a list of entities.

        Args:
            entities: the extracted entities.

        Returns:
            the modified entities.
        )namer   )selfr+   entityr(   r(   r)   add_extractor_name2   s    
z"EntityExtractor.add_extractor_name)r/   r,   c             C   s*   d|kr|d  | j n| jg|d< |S )zAdds this extractor's name to the list of processors for this entity.

        Args:
            entity: the extracted entity and its metadata.

        Returns:
            the modified entity.
        Z
processors)appendr-   )r.   r/   r(   r(   r)   add_processor_nameA   s    	z"EntityExtractor.add_processor_name)r,   c             C   s,   | j tt}| jtt}tjj||S )a  Initialises the behaviour for splitting entities by comma (or not).

        Returns:
            Defines desired behaviour for splitting specific entity types and
            default behaviour for splitting any entity types for which no
            behaviour is defined.
        )	Zcomponent_configgetr   r   defaultsrasautilsZtrain_utilsinit_split_entities)r.   split_entities_configdefault_valuer(   r(   r)   r7   Q   s    z#EntityExtractor.init_split_entities)	extractedrequested_dimensionsr,   c                s    r fdd| D S | S )z+Only return dimensions the user configured.c                s   g | ]}|t   kr|qS r(   )r   ).0r/   )r;   r(   r)   
<listcomp>h   s   z>EntityExtractor.filter_irrelevant_entities.<locals>.<listcomp>r(   )r:   r;   r(   )r;   r)   filter_irrelevant_entitiesc   s    
z*EntityExtractor.filter_irrelevant_entities)r/   texttokensr,   c             C   s   dd |D }dd |D }| t  |kr<d| |}t|| t |kr\d| |}t||| t  }|| t d }||fS )Nc             S   s   g | ]
}|j qS r(   )start)r<   tokenr(   r(   r)   r=   r   s    z/EntityExtractor.find_entity.<locals>.<listcomp>c             S   s   g | ]
}|j qS r(   )end)r<   rB   r(   r(   r)   r=   s   s    zWInvalid entity {} in example '{}': entities must span whole tokens. Wrong entity start.zUInvalid entity {} in example '{}': entities must span whole tokens. Wrong entity end.   )r   format
ValueErrorr   index)r/   r?   r@   offsetsZendsmessagerA   rC   r(   r(   r)   find_entityn   s    zEntityExtractor.find_entity)entity_examplesr,   c          
   C   s   g }x~|D ]v}g }x6| tg D ]&}| t}|r<|| jkr || q W |j }||t< |t| t||j	|j
|jd q
W |S )zFilters out untrainable entity annotations.

        Creates a copy of entity_examples in which entities that have
        `extractor` set to something other than
        self.name (e.g. 'CRFEntityExtractor') are removed.
        )r?   dataoutput_propertiestimefeatures)r3   r   r   r-   r1   rL   copyr   r   rM   rN   rO   )r.   rK   filteredrI   r+   entZ	extractorrL   r(   r(   r)   filter_trainable_entities   s"    



z)EntityExtractor.filter_trainable_entitiesN)r?   r@   tagsr8   confidencesr,   c          
   C   s  ddl m  m  m} g }t}t}t}	d}
xt|D ]\}}t|t|}|tkrdt}|j	}
q8t|t
|}||}t|t|}||}|	|kp||k}||r||ko|j||ko|j||k}|tkp|j||k}|p|p|}|}||}n||kp|}|}|rNtt| ||||||}|| ndt| ||
||r|j	|d t< |dk	rt||| n(tt| ||||||}|| |}	|}|j	}
q8W x&|D ]}| |t |t  |t< qW |S )aj  Convert predictions into entities.

        Args:
            text: The text message.
            tokens: Message tokens without CLS token.
            tags: Predicted tags.
            split_entities_config: config for handling splitting a list of entities
            confidences: Confidences of predicted tags.

        Returns:
            Entities.
        r   N)Zrasa.nlu.utils.bilou_utilsZnlur6   bilou_utilsr   	enumerater*   get_tag_forr   rC   r   Ztag_without_prefixr   Zbilou_prefix_from_tagZLASTZINSIDEZUNIT_create_new_entitylistkeysr1   _check_is_single_entityr   _update_confidence_valuesr   r   )r?   r@   rT   r8   rU   rW   r+   Zlast_entity_tagZlast_role_tagZlast_group_taglast_token_endidxrB   current_entity_tagZcurrent_group_tagZcurrent_role_tagZgroup_or_role_changedZnew_bilou_tag_startsZnew_unigram_bilou_tag_startsZnew_tag_foundr/   r(   r(   r)   !convert_predictions_into_entities   s    










z1EntityExtractor.convert_predictions_into_entities)r+   rU   r`   c             C   s   t | d t |t | | d t< t| d krPt | d t |t | | d t< t| d kr~t | d t |t | | d t< d S )NrV   )minr   r   r   r   r   r   )r+   rU   r`   r(   r(   r)   r^   /  s    


z)EntityExtractor._update_confidence_values)r?   rB   r_   r8   ra   c       
      C   sb   |j | dkrdS |j | dk}| ||j  }ttdd |}|t }|||}	|o`|o`|	 S )NrD   T   c             S   s   | t krdS dS )NTF)r   )charr(   r(   r)   <lambda>`  s    z9EntityExtractor._check_is_single_entity.<locals>.<lambda>)rA   allfilterr   r3   )
r?   rB   r_   r8   ra   Ztokens_within_rangeZinterleaving_textZ!tokens_separated_by_allowed_charsr9   Zsplit_current_entity_typer(   r(   r)   r]   C  s    z'EntityExtractor._check_is_single_entity)rT   r!   r`   r,   c             C   s   || kr| | | S t S )a  Get the value of the given tag name from the list of tags.

        Args:
            tags: Mapping of tag name to list of tags;
            tag_name: The tag name of interest.
            idx: The index position of the tag.

        Returns:
            The tag value.
        )r   )rT   r!   r`   r(   r(   r)   rY   s  s    zEntityExtractor.get_tag_for)	tag_names
entity_tag	group_tagrole_tagrB   r`   rU   r,   c             C   s   t |t|jt|ji}|dk	r,|t  | |t< t| kr\|tkr\||t< |dk	r\|t | |t< t	| kr|tkr||t	< |dk	r|t	 | |t
< |S )ap  Create a new entity.

        Args:
            tag_names: The tag names to include in the entity.
            entity_tag: The entity type value.
            group_tag: The entity group value.
            role_tag: The entity role value.
            token: The token.
            confidence: The confidence value.

        Returns:
            Created entity.
        N)r   r   rA   r   rC   r   r   r   r   r   r   )ri   rj   rk   rl   rB   r`   rU   r/   r(   r(   r)   rZ     s(    



z"EntityExtractor._create_new_entity)training_datar,   c       	      C   s   x| j D ]}dd |tD }dd |tt D }dd |tt D }x|D ]\}}||ksn||krVdd |tD }dd |tt D }tjjjj	d|t d|t
 d	| d
| d	td P qVW qW dS )a2  Check if entities are correctly annotated in the training data.

        If the start and end values of an entity do not match any start and end values
        of the respected token, we define an entity as misaligned and log a warning.

        Args:
            training_data: The training data.
        c             S   s   g | ]}|t  |t fqS r(   )r   r   )r<   r/   r(   r(   r)   r=     s   zDEntityExtractor.check_correct_entity_annotations.<locals>.<listcomp>c             S   s   g | ]
}|j qS r(   )rA   )r<   tr(   r(   r)   r=     s    c             S   s   g | ]
}|j qS r(   )rC   )r<   rn   r(   r(   r)   r=     s    c             S   s"   g | ]}|t  |t |t fqS r(   )r   r   r   )r<   r/   r(   r(   r)   r=     s   c             S   s   g | ]}|j |j|jfqS r(   )rA   rC   r?   )r<   rn   r(   r(   r)   r=     s   z)Misaligned entity annotation in message 'z' with intent 'z3'. Make sure the start and end values of entities (z3) in the training data match the token boundaries (z). Common causes: 
  1) entities include trailing whitespaces or punctuation
  2) the tokenizer gives an unexpected result, due to languages such as Chinese that don't use whitespace for word separation)ZdocsN)rK   r3   r   r   r   r5   Zsharedr6   ioZraise_warningr   r	   )	rm   ZexampleZentity_boundariesZtoken_start_positionsZtoken_end_positionsZentity_startZ
entity_endZentities_reprZtokens_reprr(   r(   r)    check_correct_entity_annotations  s     

(	z0EntityExtractor.check_correct_entity_annotations)NN)N)r"   r#   r$   r%   r   r   r   r   r0   r2   boolr7   staticmethodr[   setr>   r   r   r'   rJ   r   rS   r   floatrb   r^   r]   rY   rZ   r
   rp   r(   r(   r(   r)   r*   +   s@    
 4 ,
*"$(r*   )+typingr   r   r   r   r   r   r   Zrasa.shared.utils.ior5   Zrasa.shared.constantsr	   Z+rasa.shared.nlu.training_data.training_datar
   Z%rasa.shared.nlu.training_data.messager   Zrasa.nlu.tokenizers.tokenizerr   Zrasa.nlu.componentsr   Zrasa.nlu.constantsr   r   r   r   Zrasa.shared.nlu.constantsr   r   r   r   r   r   r   r   r   r   r   r   r   r   Zrasa.utils.train_utilsr    r*   r(   r(   r(   r)   <module>   s   $@	