B
    `1!                 @   s   d Z ddlZyddlmZ ddlmZ W n, ek
rT   dd Zdd Zd	d
 ZY nX e	dZ
G dd deZdd Zdd ZefddZefddZdd Zdd ZdS )z

A port of the Gale-Church Aligner.

Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
http://aclweb.org/anthology/J93-1004.pdf

    N)norm)logsfc             C   s   t | }ddd|   }|t| | d |d|d|d|d|d|d	|d
|d|d                   }| dkr|S d| S dS )zComplementary error function.   g      ?gś??g5 ?g`yg?gƸ?gꪂIǿg#v?g9)gS?gޅ1Ogv(?g        g       @N)absmathexp)xztr r   I/home/dcms/DCMS/lib/python3.7/site-packages/nltk/translate/gale_church.pyerfcc   s(    4r   c             C   s   ddt | td   S )u>   Return the area under the normal distribution from M{-∞..x}.r   g      ?   )r   r   sqrt)r   r   r   r   norm_cdfB   s    r   c             C   s0   yt dt|  S  tk
r*   tdS X d S )Nr   z-inf)r   logr   
ValueErrorfloat)r   r   r   r   
norm_logsfF   s    r   r   c               @   s&   e Zd ZdddddddZdZdZdS )	LanguageIndependentgׁsF?g{Gz?gbX9ȶ?gI+?))r   r   )r   r   )r   r   )r   r   )r   r   )r   r   r   g333333@N)__name__
__module____qualname__PRIORSAVERAGE_CHARACTERSVARIANCE_CHARACTERSr   r   r   r   r   P   s   r   c       	      C   s   g }t |t |f}x|dkrtdd |D ry| | \}}W n. tk
rn   |d d |d d f}wY nX xHt|D ]<}x6t|D ]*}||d | d |d | d f qW qzW |d | |d | f}qW |ddd S )a  
    Traverse the alignment cost from the tracebacks and retrieves
    appropriate sentence pairs.

    :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS)
    :type backlinks: dict
    :param source_sents_lens: A list of target sentences' lengths
    :type source_sents_lens: list(int)
    :param target_sents_lens: A list of target sentences' lengths
    :type target_sents_lens: list(int)
    )r   r   c             s   s   | ]}|d kV  qdS )r   Nr   ).0pr   r   r   	<genexpr>p   s    ztrace.<locals>.<genexpr>r   r   N)lenall	TypeErrorrangeappend)		backlinkssource_sents_lenstarget_sents_lenslinkspositionsr
   ijr   r   r   traceb   s    .r.   c       
         s   t  fddt|d D }t fddt|d D }y4|||j  d }||j | t||j  }	W n tk
r   tdS X tt	t
|	 t|j|   S )aP  Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
    being aligned with a specific C{alignment}.

    @param i: The offset of the source sentence.
    @param j: The offset of the target sentence.
    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param alignment: The alignment type, a tuple of two integers.
    @param params: The sentence alignment parameters.

    @returns: The log probability of a specific alignment between the two sentences, given the parameters.
    c             3   s   | ]} | d   V  qdS )r   Nr   )r   offset)r,   source_sentsr   r   r      s    z!align_log_prob.<locals>.<genexpr>r   c             3   s   | ]} | d   V  qdS )r   Nr   )r   r/   )r-   target_sentsr   r   r      s    r   r   z-inf)sumr$   r   r   r   r   ZeroDivisionErrorr   LOG2r   r   r   r   )
r,   r-   r0   r1   	alignmentparamsZl_sZl_tmdeltar   )r,   r-   r0   r1   r   align_log_prob~   s      
r9   c             C   s  t |j }g g}i }xtt| d D ]}xtt|d D ]}td}d}	xj|D ]b}
d|
d  }||
d  }|t| k sV|dk rqV|| | t||| ||
| }||k rV|}|
}	qVW |tdkrd}|	|||f< |d | q@W t|dkr|d |g  q*W t	|| |S )a  Return the sentence alignment of two text blocks (usually paragraphs).

        >>> align_blocks([5,5,5], [7,7,7])
        [(0, 0), (1, 1), (2, 2)]
        >>> align_blocks([10,5,5], [12,20])
        [(0, 0), (1, 1), (2, 1)]
        >>> align_blocks([12,20], [10,5,5])
        [(0, 0), (1, 1), (1, 2)]
        >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
        [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    r   infNr    r   r   )
listr   keysr$   r!   r   r9   r%   popr.   )r'   r(   r6   Zalignment_typesDr&   r,   r-   Zmin_distZ	min_alignaZprev_iZprev_jr   r   r   r   align_blocks   s2    

r@   c                s0   t | t |krtd fddt| |D S )a  Creates the sentence alignment of two texts.

    Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
    alignment links.

    Each block consists of a list that contains the lengths (in characters) of the sentences
    in this block.

    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.

    @returns: A list of sentence alignment lists
    z>Source and target texts do not have the same number of blocks.c                s   g | ]\}}t || qS r   )r@   )r   Zsource_blockZtarget_block)r6   r   r   
<listcomp>   s   zalign_texts.<locals>.<listcomp>)r!   r   zip)Zsource_blocksZtarget_blocksr6   r   )r6   r   align_texts   s
    
rC   c             #   s&    fdd}x|   V  qW dS )zSplits an iterator C{it} at values of C{split_value}.

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used.
    c             3   s$   | }x|kr|V     }qW d S )N)next)firstv)itsplit_valuer   r   _chunk_iterator   s    
z!split_at.<locals>._chunk_iteratorN)rD   )rG   rH   rI   r   )rG   rH   r   split_at   s    rJ   c                s    fddt | |D S )zParses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
    c                s    g | ]}d d t | D qS )c             S   s   g | ]}t d d |D qS )c             s   s   | ]}t |V  qd S )N)r!   )r   tokenr   r   r   r     s    z;parse_token_stream.<locals>.<listcomp>.<listcomp>.<genexpr>)r2   )r   Zsentence_itr   r   r   rA     s   z1parse_token_stream.<locals>.<listcomp>.<listcomp>)rJ   )r   Zblock_it)soft_delimiterr   r   rA     s   z&parse_token_stream.<locals>.<listcomp>)rJ   )streamrL   Zhard_delimiterr   )rL   r   parse_token_stream   s    
rN   )__doc__r   Zscipy.statsr   r   r   ImportErrorr   r   r   r4   objectr   r.   r9   r@   rC   rJ   rN   r   r   r   r   <module>   s    '
6