B
    ‰°bm\  ã               @   s*  d Z ddlZy
ejZW n4 ek
rJ   ddlZe dej ¡ dd„ ZY nX dd„ Zej 	¡  dZ
e e
¡ZG d	d
„ d
ƒZdd„ Zdd„ Zdd„ Zd4dd„ZdZd5dd„Zdd„ Zdd„ Zdd„ Zd6dd„Zdd„ Zd7d d!„Zd"d#„ Zd$d%„ Zd&d'„ Zd(d)„ Zd*d+„ Zd,d-„ Zd.d/„ Z d0d1„ Z!d2d3„ Z"dS )8a¸  A state-emitting MarkovModel.

Note terminology similar to Manning and Schutze is used.


Functions:
train_bw        Train a markov model using the Baum-Welch algorithm.
train_visible   Train a visible markov model using MLE.
find_states     Find the a state sequence that explains some observations.

load            Load a MarkovModel.
save            Save a MarkovModel.

Classes:
MarkovModel     Holds the description of a markov model
é    NzVFor optimal speed, please update to Numpy version 1.3 or later (current version is %s)c             C   sP   ||  dkr|S | | dkr | S t | |ƒ}|t t | | ¡t || ¡ ¡ S )z>Implement logaddexp method if Numpy version is older than 1.3.éd   )ÚminÚnumpyÚlogÚexp)ZlogxZlogyZminxy© r   ú.lib/python3.7/site-packages/Bio/MarkovModel.pyÚ	logaddexp*   s    
r	   c             C   sD   i }t | ddd… ƒ}t| ƒd }x|D ]\}}|| ||< q(W |S )zAReturn a dictionary of values with their sequence offset as keys.Néÿÿÿÿé   )Ú	enumerateÚlen)ÚvaluesÚdÚentriesÚnÚindexÚkeyr   r   r   Ú	itemindex4   s    r   gYóøÂn¥c               @   s"   e Zd ZdZddd„Zdd„ ZdS )ÚMarkovModelz+Create a state-emitting MarkovModel object.Nc             C   s"   || _ || _|| _|| _|| _dS )zInitialize the class.N)ÚstatesÚalphabetÚ	p_initialÚp_transitionÚ
p_emission)Úselfr   r   r   r   r   r   r   r   Ú__init__G   s
    zMarkovModel.__init__c             C   s.   ddl m} |ƒ }t| |ƒ | d¡ | ¡ S )z9Create a string representation of the MarkovModel object.r   )ÚStringIO)Úior   ÚsaveÚseekÚread)r   r   Úhandler   r   r   Ú__str__Q   s
    

zMarkovModel.__str__)NNN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r#   r   r   r   r   r   D   s   
	r   c             C   s&   |   ¡ }| |¡s"td||f ƒ‚|S )zNRead the first line and evaluate that begisn with the correct start (PRIVATE).zI expected %r but got %r)ÚreadlineÚ
startswithÚ
ValueError)r"   ÚstartÚliner   r   r   Ú_readline_and_check_start[   s    
r-   c             C   s~  t | dƒ}| ¡ dd… }t | dƒ}| ¡ dd… }t||ƒ}t|ƒt|ƒ }}t |¡|_t | dƒ}x<tt|ƒƒD ],}t | d||  ƒ}t| ¡ d ƒ|j|< qtW t ||f¡|_	t | dƒ}xNtt|ƒƒD ]>}t | d||  ƒ}d	d
„ | ¡ dd… D ƒ|j	|dd…f< qÌW t ||f¡|_
t | dƒ}xPtt|ƒƒD ]@}t | d||  ƒ}dd
„ | ¡ dd… D ƒ|j
|dd…f< q6W |S )z.Parse a file handle into a MarkovModel object.zSTATES:r   Nz	ALPHABET:zINITIAL:z  %s:r
   zTRANSITION:c             S   s   g | ]}t |ƒ‘qS r   )Úfloat)Ú.0Úvr   r   r   ú
<listcomp>|   s    zload.<locals>.<listcomp>z	EMISSION:c             S   s   g | ]}t |ƒ‘qS r   )r.   )r/   r0   r   r   r   r1   ƒ   s    )r-   Úsplitr   r   r   Úzerosr   Úranger.   r   r   )r"   r,   r   r   ÚmmÚNÚMÚir   r   r   Úloadc   s,    




,
.r9   c          	   C   s  |j }|dd | j¡ ƒ |dd | j¡ ƒ |dƒ x2tt| jƒƒD ] }|d| j| | j| f ƒ qFW |dƒ xBtt| jƒƒD ]0}|d| j| d dd	„ | j| D ƒ¡f ƒ q‚W |d
ƒ xBtt| jƒƒD ]0}|d| j| d dd	„ | j| D ƒ¡f ƒ qÎW dS )z$Save MarkovModel object into handle.zSTATES: %s
ú zALPHABET: %s
z	INITIAL:
z	  %s: %g
zTRANSITION:
z	  %s: %s
c             s   s   | ]}t |ƒV  qd S )N)Ústr)r/   Úxr   r   r   ú	<genexpr>“   s    zsave.<locals>.<genexpr>z
EMISSION:
c             s   s   | ]}t |ƒV  qd S )N)r;   )r/   r<   r   r   r   r=   –   s    N)	ÚwriteÚjoinr   r   r4   r   r   r   r   )r5   r"   Úwr8   r   r   r   r   ˆ   s     0r   c          	      s  t | ƒt |ƒ }}|stdƒ‚|dk	rDt |¡}|j|fkrDtdƒ‚|dk	rlt |¡}|j||fkrltdƒ‚|dk	r”t |¡}|j||fkr”tdƒ‚g }	t|ƒ‰ x$|D ]}
|	 ‡ fdd„|
D ƒ¡ q¦W dd„ |	D ƒ}t|ƒd	krètd
ƒ‚t|||	||||d}|\}}}t	| ||||ƒS )a  Train a MarkovModel using the Baum-Welch algorithm.

    Train a MarkovModel using the Baum-Welch algorithm.  states is a list
    of strings that describe the names of each state.  alphabet is a
    list of objects that indicate the allowed outputs.  training_data
    is a list of observations.  Each observation is a list of objects
    from the alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix, before
    normalization.

    update_fn is an optional callback that takes parameters
    (iteration, log_likelihood).  It is called once per iteration.
    zNo training data given.Nz$pseudo_initial not shape len(states)z5pseudo_transition not shape len(states) X len(states)z5pseudo_emission not shape len(states) X len(alphabet)c                s   g | ]}ˆ | ‘qS r   r   )r/   r<   )Úindexesr   r   r1   Ê   s    ztrain_bw.<locals>.<listcomp>c             S   s   g | ]}t |ƒ‘qS r   )r   )r/   r<   r   r   r   r1   Í   s    r   z,I got training data with outputs of length 0)Úpseudo_initialÚpseudo_transitionÚpseudo_emissionÚ	update_fn)
r   r*   r   ÚasarrayÚshaper   Úappendr   Ú_baum_welchr   )r   r   Útraining_datarB   rC   rD   rE   r6   r7   Útraining_outputsÚoutputsZlengthsr<   r   r   r   r   )rA   r   Útrain_bwš   s@    




rM   iè  c
             C   sh  |dkrt | ƒ}nt|| fƒ}|dkr4t | | fƒ}nt|| | fƒ}|dkrXt | |fƒ}nt|| |fƒ}t |¡}
t |¡}t |¡}|dk	r˜t |¡}nd}|dk	r°t |¡}nd}|dk	rÈt |¡}nd}d}x‚ttƒD ]j}t}x(|D ] }|t| |||
|||||ƒ	7 }qèW |	dk	r |	||ƒ |dk	r@t || ¡dk r@P |}qÚW t	dt ƒ‚dd„ |
||fD ƒS )zfImplement the Baum-Welch algorithm to evaluate unknown parameters in the MarkovModel object (PRIVATE).Ngš™™™™™¹?z%HMM did not converge in %d iterationsc             S   s   g | ]}t  |¡‘qS r   )r   r   )r/   Ú_r   r   r   r1   )  s    z_baum_welch.<locals>.<listcomp>)
Ú_random_normÚ_copy_and_checkr   r   r4   ÚMAX_ITERATIONSÚLOG0Ú_baum_welch_oneÚfabsÚRuntimeError)r6   r7   rK   r   r   r   rB   rC   rD   rE   Ú
lp_initialÚlp_transitionÚlp_emissionÚlpseudo_initialÚlpseudo_transitionÚlpseudo_emissionZ	prev_llikr8   ZllikrL   r   r   r   rI   â   sT    






rI   c	          	   C   sà  t |ƒ}	t| |	||||ƒ}
t| |	|||ƒ}t | | |	f¡}x¨t|	ƒD ]œ}|| }t | | f¡}xbt| ƒD ]V}xPt| ƒD ]D}|
| | || |  || |  || |d   }||| |< qvW qhW |t|ƒ |dd…dd…|f< qDW t | |	f¡}xBt|	ƒD ]6}x0t| ƒD ]$}t||dd…|f ƒ|| |< q
W qüW t | ¡}x*t| ƒD ]}t||dd…f ƒ||< qJW |dd…df }|dk	rœt||ƒ}|t|ƒ }x€t| ƒD ]t}x8t| ƒD ],}t|||dd…f ƒ||  || |< q´W |dk	r¦t|| |ƒ||< || t|| ƒ ||< q¦W x¬t| ƒD ] }t |¡t }xJt|	ƒD ]>}|| }x.t| ƒD ]"}t	|| ||||f ƒ||< qZW qDW |t|ƒ }|dk	r¶t||| ƒ}|t|ƒ }|||dd…f< q(W t|
dd…|	f ƒS )zÊExecute one step for Baum-Welch algorithm (PRIVATE).

    Do one iteration of Baum-Welch based on a sequence of output.
    Changes the value for lp_initial, lp_transition and lp_emission in place.
    r   Nr   )
r   Ú_forwardÚ	_backwardr   r3   r4   Ú_logsumÚ
_logvecaddrR   r	   )r6   r7   rL   rV   rW   rX   rY   rZ   r[   ÚTZfmatZbmatZlp_arcÚtÚkZlp_traverser8   ÚjÚlpZlp_arcout_tZ	lp_arcoutZksumr   r   r   rS   ,  sR    	4"(


,
(
	rS   c             C   s°   t  | |d f¡}||dd…df< xˆtd|d ƒD ]v}||d  }xdt| ƒD ]X}	t}
xBt| ƒD ]6}|| |d  || |	  || |  }t|
|ƒ}
q^W |
||	 |< qLW q2W |S )zŽImplement forward algorithm (PRIVATE).

    Calculate a Nx(T+1) matrix, where the last column is the total
    probability of the output.
    r   Nr   )r   r3   r4   rR   r	   )r6   r`   rV   rW   rX   rL   Úmatrixra   rb   rc   Úlprobr8   rd   r   r   r   r\   Š  s    (r\   c             C   sž   t  | |d f¡}x†t|d ddƒD ]r}|| }xdt| ƒD ]X}t}	xBt| ƒD ]6}
||
 |d  || |
  || |  }t|	|ƒ}	qLW |	|| |< q:W q$W |S )z'Implement backward algorithm (PRIVATE).r   r
   )r   r3   r4   rR   r	   )r6   r`   rW   rX   rL   re   ra   rb   r8   rf   rc   rd   r   r   r   r]   ¡  s    (r]   c                s*  t | ƒt |ƒ }}|dk	r8t |¡}|j|fkr8tdƒ‚|dk	r`t |¡}|j||fkr`tdƒ‚|dk	rˆt |¡}|j||fkrˆtdƒ‚g g  }}	t| ƒ‰t|ƒ‰ xX|D ]P\}
}t |ƒt |
ƒkrÈtdƒ‚| ‡fdd„|D ƒ¡ |	 ‡ fdd„|
D ƒ¡ q¨W t|||	||||ƒ}|\}}}t| ||||ƒS )	a  Train a visible MarkovModel using maximum likelihoood estimates for each of the parameters.

    Train a visible MarkovModel using maximum likelihoood estimates
    for each of the parameters.  states is a list of strings that
    describe the names of each state.  alphabet is a list of objects
    that indicate the allowed outputs.  training_data is a list of
    (outputs, observed states) where outputs is a list of the emission
    from the alphabet, and observed states is a list of states from
    states.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    different matrices.  They should be matrices of the appropriate
    size that contain numbers to add to each parameter matrix.
    Nz$pseudo_initial not shape len(states)z5pseudo_transition not shape len(states) X len(states)z5pseudo_emission not shape len(states) X len(alphabet)zstates and outputs not alignedc                s   g | ]}ˆ | ‘qS r   r   )r/   r<   )Ústates_indexesr   r   r1   ß  s    z!train_visible.<locals>.<listcomp>c                s   g | ]}ˆ | ‘qS r   r   )r/   r<   )Úoutputs_indexesr   r   r1   à  s    )	r   r   rF   rG   r*   r   rH   Ú_mler   )r   r   rJ   rB   rC   rD   r6   r7   Útraining_statesrK   ZtoutputsZtstatesr<   r   r   r   r   )rh   rg   r   Útrain_visible±  s>    




rk   c             C   s¶  t  | ¡}|r|| }x |D ]}||d   d7  < qW t|ƒ}t  | | f¡}	|rZ|	| }	xP|D ]H}xBtt|ƒd ƒD ].}
||
 ||
d   }}|	||f  d7  < qvW q`W xDtt|	ƒƒD ]4}|	|dd…f t|	|dd…f ƒ |	|dd…f< qºW t  | |f¡}|r|| }t  | |f¡}xFt||ƒD ]8\}}x,t||ƒD ]\}}|||f  d7  < q<W q(W xFtt|ƒƒD ]6}||dd…f t||dd…f ƒ ||dd…f< qrW ||	|fS )z<Implement Maximum likelihood estimation algorithm (PRIVATE).r   r   N)r   r3   Ú
_normalizer4   r   ÚsumÚonesÚzip)r6   r7   rK   rj   rB   rC   rD   r   r   r   r   r8   rc   r   rL   ÚoÚsr   r   r   ri   ð  s2    


4 6ri   c             C   s   t  | ¡gS )z?Return indeces of the maximum values aong the vector (PRIVATE).)r   Zargmax)ZvectorZ	allowancer   r   r   Ú	_argmaxes  s    rr   c       
         s°   | ‰t ˆjƒ}t ˆjt ¡}t ˆjt ¡}t ˆjt ¡}tˆj	ƒ‰ ‡ fdd„|D ƒ}t
|||||ƒ}x@tt |ƒƒD ]0}|| \}}	‡fdd„|D ƒt |	¡f||< qxW |S )zaFind states in the given Markov model output.

    Returns a list of (states, score) tuples.
    c                s   g | ]}ˆ | ‘qS r   r   )r/   r<   )rA   r   r   r1   2  s    zfind_states.<locals>.<listcomp>c                s   g | ]}ˆ j | ‘qS r   )r   )r/   r<   )r5   r   r   r1   9  s    )r   r   r   r   r   ÚVERY_SMALL_NUMBERr   r   r   r   Ú_viterbir4   r   )
Zmarkov_modelÚoutputr6   rV   rW   rX   Úresultsr8   r   Úscorer   )rA   r5   r   Úfind_states#  s    

$rx   c             C   s¬  t |ƒ}g }x t| ƒD ]}| dg| ¡ qW t | |f¡}||dd…|d f  |dd…df< x„td|ƒD ]v}	||	 }
xht| ƒD ]\}|dd…|	d f |dd…|f  |||
f  }t|ƒ}||d  |||	f< ||| |	< q‚W qlW g }g }t|dd…|d f ƒ}x0|D ](}| |d |g|| |d  f¡ qW xn|r¦| ¡ \}	}}|	dkrh| ||f¡ n:||d  |	 }x(|D ] }| |	d |g| |f¡ q~W q:W |S )zSImplement Viterbi algorithm to find most likely states for a given input (PRIVATE).Nr   r   )r   r4   rH   r   r3   rr   Úpop)r6   rV   rW   rX   ru   r`   Z	backtracer8   Zscoresra   rb   rc   Zi_scoresrA   Z
in_processrv   r   rw   r   r   r   rt   =  s4    $0
(

&rt   c             C   s‚   t | jƒdkr | tt| ƒƒ } n^t | jƒdkrvxNtt | ƒƒD ]4}| |dd…f t| |dd…f ƒ | |dd…f< q<W ntdƒ‚| S )z"Normalize matrix object (PRIVATE).r   é   Nz&I cannot handle matrixes of that shape)r   rG   r.   rm   r4   r*   )re   r8   r   r   r   rl   f  s    6rl   c             C   s   t  | ¡}t|ƒS )z%Normalize a uniform matrix (PRIVATE).)r   rn   rl   )rG   re   r   r   r   Ú_uniform_norms  s    
r{   c             C   s   t j | ¡}t|ƒS )z$Normalize a random matrix (PRIVATE).)r   Úrandomrl   )rG   re   r   r   r   rO   y  s    rO   c             C   s¦   t j| dd} | j|kr tdƒ‚t| jƒdkrNt  t| ƒd ¡dkr¢tdƒ‚nTt| jƒdkršxDtt| ƒƒD ]*}t  t| | ƒd ¡dkrjtd| ƒ‚qjW ntd	ƒ‚| S )
zFCopy a matrix and check its dimension. Normalize at the end (PRIVATE).r   )ÚcopyzIncorrect dimensiong      ð?g{®Gáz„?zmatrix not normalized to 1.0rz   zmatrix %d not normalized to 1.0z&I don't handle matrices > 2 dimensions)r   ZarrayrG   r*   r   rT   rm   r4   )re   Zdesired_shaper8   r   r   r   rP     s    

rP   c             C   sJ   t | jƒdkr&t | t | j¡f¡}n| }t}x|D ]}t||ƒ}q4W |S )z/Implement logsum for a matrix object (PRIVATE).r   )r   rG   r   ZreshapeÚproductrR   r	   )re   Zvecrm   Znumr   r   r   r^   “  s    
r^   c             C   sV   t | ƒt |ƒkstdƒ‚t t | ƒ¡}x*tt | ƒƒD ]}t| | || ƒ||< q4W |S )z5Implement a log sum for two vector objects (PRIVATE).zvectors aren't the same length)r   ÚAssertionErrorr   r3   r4   r	   )Zlogvec1Zlogvec2Zsumvecr8   r   r   r   r_   Ÿ  s
    r_   c             C   s   t | ƒ}t |¡S )z-Return the exponential of a logsum (PRIVATE).)r^   r   r   )Znumbersrm   r   r   r   Ú_exp_logsum¨  s    r€   )NNNN)NNNNNNN)NNN)N)#r'   r   r	   ÚAttributeErrorÚwarningsÚwarnÚ__version__r   r|   Zseedrs   r   rR   r   r-   r9   r   rM   rQ   rI   rS   r\   r]   rk   ri   rr   rx   rt   rl   r{   rO   rP   r^   r_   r€   r   r   r   r   Ú<module>   s\   





%   
>      
@^  
9.
)	