B
    b#P                 @   s   d Z ddlZyddlZW n( ek
r@   ddlmZ edY nX ddlmZ ddl	m
Z
 G dd	 d	eZG d
d deZG dd deZG dd deZdS )zSupport for various forms of sequence motif matrices.

Implementation of frequency (count) matrices, position-weight matrices,
and position-specific scoring matrices.
    N)MissingPythonDependencyErrorz3Install NumPy if you want to use Bio.motifs.matrix.)Seq   )_pwmc               @   s`   e Zd ZdZdd Zdd Zdd Zedd	 Zed
d Z	edd Z
edd Zdd ZdS )GenericPositionMatrixz9Base class for the support of position matrix operations.c             C   sb   d| _ xP|D ]H}| j dkr*t|| | _ n| j t|| krDtdt|| | |< qW || _dS )zInitialize the class.Nzdata has inconsistent lengths)lengthlen	Exceptionlistalphabet)selfr   valuesletter r   0lib/python3.7/site-packages/Bio/motifs/matrix.py__init__!   s    

zGenericPositionMatrix.__init__c             C   sx   dd t | jD }dd| }|g}x<| jD ]2}dd | | D }d| d| }|| q0W d|d }|S )zPReturn a string containing nucleotides and counts of the alphabet in the Matrix.c             S   s   g | ]}d | qS )z%6dr   ).0ir   r   r   
<listcomp>.   s    z1GenericPositionMatrix.__str__.<locals>.<listcomp>z    c             S   s   g | ]}d | qS )z%6.2fr   )r   valuer   r   r   r   2   s    z%c: 
)ranger   joinr   append)r   Zwordslinelinesr   textr   r   r   __str__,   s    zGenericPositionMatrix.__str__c                sR  t |tr>t|dkr|\}}t |trf|t j\}}}t|||} fdd|D }d}	nrt |tr j| }
d}	nXt |tr fdd|D }d}	n6t |trt|dkr|}
d}	qt	|nt	d| t |tr| j
\}}}t|||}d}n"t |tr|}d}nt	d| |	dkrN|dkrNt |
| S |	dkr|dkrt |
tfdd|D S |	dkr|dkri }x"|D ]}
t |
| ||
< qW |S i }x0|D ](}
t |
fd	d|D ||
< qW t| jkr  j|S |S n t|dkr6|d
 }nt	dt |tr|t j\}}}t|||} fdd|D }d}nzt |tr j| }d}n^t |trć fdd|D }d}n:t |trt|dkr|}d}nt	|nt	d| |dkrt |S |dkrFi }x|D ]}t |||< q(W |S tddS )z(Return the position matrix of index key.   c                s   g | ]} j | qS r   )r   )r   r   )r   r   r   r   @   s    z5GenericPositionMatrix.__getitem__.<locals>.<listcomp>r   c                s   g | ]} j | qS r   )r   )r   r   )r   r   r   r   F   s    zCannot understand key %sc             3   s   | ]} | V  qd S )Nr   )r   index2)r   r   r   	<genexpr>]   s    z4GenericPositionMatrix.__getitem__.<locals>.<genexpr>c                s   g | ]} | qS r   r   )r   _)r   r   r   r   g   s    r   z"keys should be 1- or 2-dimensionalc                s   g | ]} j | qS r   )r   )r   r   )r   r   r   r   s   s    c                s   g | ]} j | qS r   )r   )r   r   )r   r   r   r   y   s    zShould not get hereN)
isinstancetupler   sliceindicesr   r   intstrKeyErrorr   dict__getitem__sorted	__class__RuntimeError)r   keyZkey1Zkey2Zstart1Zstop1Zstride1Zindices1Zletters1Zdim1Zletter1Zstart2Zstop2Zstride2Zindices2Zdim2r    dstartstopZstrider&   lettersZdimr   r   )r   r   r   r+   8   s    













z!GenericPositionMatrix.__getitem__c             C   s\   d}xNt | jD ]@}tj }x*| jD ] }| | | }||kr$|}|}q$W ||7 }qW t|S )zReturn the consensus sequence. )r   r   mathinfr   r   )r   sequencer   Zmaximumr   countsequence_letterr   r   r   	consensus   s    zGenericPositionMatrix.consensusc             C   sZ   d}xLt | jD ]>}tj}x*| jD ] }| | | }||k r"|}|}q"W ||7 }qW t|S )z"Return the anticonsensus sequence.r4   )r   r   r5   r6   r   r   )r   r7   r   Zminimumr   r8   r9   r   r   r   anticonsensus   s    z#GenericPositionMatrix.anticonsensusc                s  ddddddddd	d
dddddd}d}xt jD ]މ  fdd}t|dd} fdd|D }|d t|dd kr|d d|d  kr|d }n`dt|dd  dt| krdt|dd }n(|d dkrdt|dd }nd}|||}||7 }q4W t|S )z)Return the degenerate consensus sequence.ACGTMRWSYKVHDBN)r<   r=   r>   r?   ZACZAGATCGZCTZGTZACGZACTZAGTZCGTACGTr4   c                s   |    S )Nr   )
nucleotide)r   r   r   r   get   s    z7GenericPositionMatrix.degenerate_consensus.<locals>.getT)r/   reversec                s   g | ]}|   qS r   r   )r   c)r   r   r   r   r      s    z>GenericPositionMatrix.degenerate_consensus.<locals>.<listcomp>r   r   Nr         rM   )r   r   r,   sumr   rO   r   )r   Zdegenerate_nucleotider7   rO   Znucleotidescountsr/   rN   r   )r   r   r   degenerate_consensus   s<    	,
 z*GenericPositionMatrix.degenerate_consensusc             C   s`   | j }d}d}xHt| jD ]:}x4|D ],}|dkr@|| | | 7 }|| | | 7 }q$W qW || S )z Compute the fraction GC content.g        rL   )r   r   r   )r   r   Zgc_totaltotalr   r   r   r   r   
gc_content   s    
z GenericPositionMatrix.gc_contentc             C   s   i }| j dkr<| d ddd |d< | d ddd |d< n,| d ddd |d< | d ddd |d< | d ddd |d< | d ddd |d< | j }| ||S )	zCompute reverse complement.ZACGUUNr<   r?   r=   r>   )r   r-   )r   r   r   r   r   r   reverse_complement   s    
z(GenericPositionMatrix.reverse_complementN)__name__
__module____qualname____doc__r   r   r+   propertyr:   r;   rV   rX   r[   r   r   r   r   r      s   U.r   c               @   s   e Zd ZdZdddZdS )FrequencyPositionMatrixzGClass for the support of frequency calculations on the Position Matrix.Nc             C   s   i }|dkr.xx| j D ]}dg| j ||< qW nXt|trbxL| j D ]}t|| g| j ||< q@W n$x"| j D ]}t|g| j ||< qjW x>t| jD ]0}x*| j D ] }|| |  | | | 7  < qW qW t| j |S )a  Create and return a position-weight matrix by normalizing the counts matrix.

        If pseudocounts is None (default), no pseudocounts are added
        to the counts.

        If pseudocounts is a number, it is added to the counts before
        calculating the position-weight matrix.

        Alternatively, the pseudocounts can be a dictionary with a key
        for each letter in the alphabet associated with the motif.
        Ng        )r   r   r#   r*   floatr   PositionWeightMatrix)r   ZpseudocountsrU   r   r   r   r   r   	normalize   s    
$z!FrequencyPositionMatrix.normalize)N)r\   r]   r^   r_   rd   r   r   r   r   ra      s   ra   c               @   s"   e Zd ZdZdd ZdddZdS )rc   zDClass for the support of weight calculations on the Position Matrix.c                s~   t || xLtjD ]> t fdd|D }x |D ]}|    |  < q<W qW x|D ]}t| |< qbW dS )zInitialize the class.c             3   s   | ]}t |   V  qd S )N)rb   )r   r   )r   r   r   r   r!     s    z0PositionWeightMatrix.__init__.<locals>.<genexpr>N)r   r   r   r   rT   r$   )r   r   rU   rW   r   r   )r   r   r   r     s    

zPositionWeightMatrix.__init__Nc             C   s   i }| j }|dkr"t| j d}nt|}t| }x$|D ]}||  |  < g ||< q<W xt| jD ]}x||D ]t}|| }|dkr| | | }|dkrt|| d}	qtj	 }	n"| | | }|dkrtj	}	ntj
}	|| |	 qrW qhW t||}
|
S )a/  Return the Position-Specific Scoring Matrix.

        The Position-Specific Scoring Matrix (PSSM) contains the log-odds
        scores computed from the probability matrix and the background
        probabilities. If the background is None, a uniform background
        distribution is assumed.
        Ng      ?r   r   )r   r*   fromkeysrT   r   r   r   r5   logr6   nanr   PositionSpecificScoringMatrix)r   
backgroundr   r   rW   r   r   bplogoddspssmr   r   r   log_odds   s0    



zPositionWeightMatrix.log_odds)N)r\   r]   r^   r_   r   rn   r   r   r   r   rc     s   
rc   c               @   st   e Zd ZdZdd ZdddZed	d
 Zedd Zedd Z	dddZ
dddZdd Zdd ZdddZdS )rh   zGClass for the support of Position Specific Scoring Matrix calculations.c                s  t  jddddgkr$td j yt|}W nz tk
r   yt|d}W n: tk
rl   tddY n tk
r   td	dY nX Y n tk
r   tddY nX t|} j}t	
|| d
 t	j}t	 fddt|D t}t||| t|d
kr|d S |S dS )ag  Return the PWM score for a given sequence for all positions.

        Notes:
         - the sequence can only be a DNA sequence
         - the search is performed only on one strand
         - if the sequence and the motif have the same length, a single
           number is returned
         - otherwise, the result is a one-dimensional numpy array

        r<   r=   r>   r?   z6PSSM has wrong alphabet: %s - Use only with DNA motifsASCIIzBsequence should be a Seq, MutableSeq, string, or bytes-like objectNz-sequence should contain ASCII characters onlyr   c                s    g | ]  fd ddD qS )c                s   g | ]}|   qS r   r   )r   r   )r   r   r   r   r   u  s    zFPositionSpecificScoringMatrix.calculate.<locals>.<listcomp>.<listcomp>rM   r   )r   )r   )r   r   r   u  s    z;PositionSpecificScoringMatrix.calculate.<locals>.<listcomp>r   )r,   r   
ValueErrorbytes	TypeErrorUnicodeEncodeErrorr	   r   r   npemptyZfloat32Zarrayr   rb   r   	calculate)r   r7   nmZscoresrl   r   )r   r   rv   I  s:    
z'PositionSpecificScoringMatrix.calculate        T@B c             c   s"  |  }t|}| j}td||}|r0|  }x|D ]}	||	|	| | d  }
| |
}||k}t|d |	 }|| }|r||
}||k}t|d |	 }|| }ntjdt	d}tjdt	d}t
||| }t
||}tt
||}|| }|| }t||E dH  q6W dS )zFind hits with PWM score above given threshold.

        A generator function, returning found hits in the given sequence
        with the pwm score higher than the threshold.
        r   r   )ZdtypeN)upperr   r   rt   Zaranger[   rv   whereru   r'   r   Zargsortzip)r   r7   Z	thresholdZbothZ	chunksizeZseq_lenZmotif_lZchunk_startsZrcZchunk_startZsubseqZ
pos_scoresZpos_indZpos_positionsZ
neg_scoresZneg_indZneg_positionsZchunk_positionsZchunk_scoresorderr   r   r   search~  s2    



z$PositionSpecificScoringMatrix.searchc                s@   d}j }x0tdjD ]  |t fdd|D 7 }qW |S )zoMaximal possible score for this motif.

        returns the score computed for the consensus sequence.
        g        r   c             3   s   | ]}|   V  qd S )Nr   )r   r   )positionr   r   r   r!     s    z4PositionSpecificScoringMatrix.max.<locals>.<genexpr>)r   r   r   max)r   scorer3   r   )r   r   r   r     s
     z!PositionSpecificScoringMatrix.maxc                s@   d}j }x0tdjD ]  |t fdd|D 7 }qW |S )zsMinimal possible score for this motif.

        returns the score computed for the anticonsensus sequence.
        g        r   c             3   s   | ]}|   V  qd S )Nr   )r   r   )r   r   r   r   r!     s    z4PositionSpecificScoringMatrix.min.<locals>.<genexpr>)r   r   r   min)r   r   r3   r   )r   r   r   r     s
     z!PositionSpecificScoringMatrix.minc             C   s   t ddS )zCompute the GC-ratio.z,Cannot compute the %GC composition of a PSSMN)r	   )r   r   r   r   rX     s    z(PositionSpecificScoringMatrix.gc_contentNc       	      C   s   |dkrt | jd}nt |}t| }x| jD ]}||  |  < q4W d}xrt| jD ]d}x^| jD ]T}| ||f }t|rqht	|r|dk rqh|| }|t
d| }||| 7 }qhW q\W |S )z.Return expected value of the score of a motif.Ng      ?g        r   r   )r*   re   r   rT   r   r   r   r5   isnanisinfpow)	r   ri   rW   r   sxr   rl   rj   rk   r   r   r   mean  s$    
z"PositionSpecificScoringMatrix.meanc             C   s  |dkrt | jd}nt |}t| }x| jD ]}||  |  < q4W d}xt| jD ]}d}d}xn| jD ]d}| ||f }t|rqpt	|r|dk rqp|| }	|	t
d| }
||
| 7 }||
| | 7 }qpW ||| 8 }||7 }q\W t|d}t|S )z2Return standard deviation of the score of a motif.Ng      ?g        r   r   )r*   re   r   rT   r   r   r   r5   r   r   r   r   sqrt)r   ri   rW   r   Zvariancer   r   sxxrl   rj   rk   r   r   r   std  s0    

z!PositionSpecificScoringMatrix.stdc             C   sx   | j |j krtdd}xRt| j d |jD ]:}|dk rJ| || }n|| |}||k r.|}| }q.W d| |fS )zReturn the similarity score based on pearson correlation for the given motif against self.

        We use the Pearson's correlation of the respective probabilities.
        z.Cannot compare motifs with different alphabetsr   r   )r   rp   r   r   dist_pearson_at)r   otherZmax_poffsetrk   Zmax_or   r   r   dist_pearson  s    
z*PositionSpecificScoringMatrix.dist_pearsonc                s8  j }d}d}d}d}d}tj j t| }	xttj  jD ] fdd|D }
fdd|D }|t|
7 }|t|7 }|tdd |
D 7 }|tdd t|
|D 7 }|tdd |D 7 }qLW ||	 }||	 }||	 }||	 }||	 }|||  }t	|||  |||   }|| S )	zMReturn the similarity score based on pearson correlation at the given offset.g        c                s   g | ]}|  f qS r   r   )r   r   )r   posr   r   r   r     s    zAPositionSpecificScoringMatrix.dist_pearson_at.<locals>.<listcomp>c                s   g | ]} |f qS r   r   )r   r   )r   r   r   r   r     s    c             s   s   | ]}|| V  qd S )Nr   )r   xr   r   r   r!     s    z@PositionSpecificScoringMatrix.dist_pearson_at.<locals>.<genexpr>c             s   s   | ]\}}|| V  qd S )Nr   )r   r   yr   r   r   r!     s    c             s   s   | ]}|| V  qd S )Nr   )r   r   r   r   r   r!     s    )
r   r   r   r   r   r   rT   r}   r5   r   )r   r   r   r3   r   Zsyr   ZsxyZsyyZnormZxiZyi	numeratordenominatorr   )r   r   r   r   r   r     s.    z-PositionSpecificScoringMatrix.dist_pearson_at  c             C   sf   ddl m} |dkr$t| jd}nt|}t| }x| jD ]}||  |  < q@W ||| |dS )z@Calculate the distribution of the scores at the given precision.r   )ScoreDistributionNg      ?)	precisionrm   ri   )Z
thresholdsr   r*   re   r   rT   r   )r   ri   r   r   rW   r   r   r   r   distribution  s    z*PositionSpecificScoringMatrix.distribution)ry   Trz   )N)N)Nr   )r\   r]   r^   r_   rv   r   r`   r   r   rX   r   r   r   r   r   r   r   r   r   rh   F  s   5
!

rh   )r_   r5   Znumpyrt   ImportErrorZBior   ZBio.Seqr   r4   r   r*   r   ra   rc   rh   r   r   r   r   <module>   s    V 3