B
    b.                 @   s  d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 G dd deZd	d
 Zd3ddZdd Zd4ddZdd Zdd Zdd Zdd Zdd Zdd Zd5d!d"Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zed1krdd2lm Z  e   dS )6zCode for dealing with coding sequence.

CodonSeq class is inherited from Seq class. This is the core class to
deal with sequences in CodonAlignment in biopython.

    )permutations)log)Seq)	SeqRecord)
CodonTablec               @   sf   e Zd ZdZdddZdd Zd	d
 ZdddZdd Zdd Z	dddZ
dddZedddZdS )CodonSeqaK  CodonSeq is designed to be within the SeqRecords of a CodonAlignment class.

    CodonSeq is useful as it allows the user to specify
    reading frame when translate CodonSeq

    CodonSeq also accepts codon style slice by calling
    get_codon() method.

    **Important:** Ungapped CodonSeq can be any length if you
    specify the rf_table. Gapped CodonSeq should be a
    multiple of three.

    >>> codonseq = CodonSeq("AAATTTGGGCCAAATTT", rf_table=(0,3,6,8,11,14))
    >>> print(codonseq.translate())
    KFGAKF

    test get_full_rf_table method

    >>> p = CodonSeq('AAATTTCCCGG-TGGGTTTAA', rf_table=(0, 3, 6, 9, 11, 14, 17))
    >>> full_rf_table = p.get_full_rf_table()
    >>> print(full_rf_table)
    [0, 3, 6, 9, 12, 15, 18]
    >>> print(p.translate(rf_table=full_rf_table, ungap_seq=False))
    KFPPWV*
    >>> p = CodonSeq('AAATTTCCCGGGAA-TTTTAA', rf_table=(0, 3, 6, 9, 14, 17))
    >>> print(p.get_full_rf_table())
    [0, 3, 6, 9, 12.0, 15, 18]
    >>> p = CodonSeq('AAA------------TAA', rf_table=(0, 3))
    >>> print(p.get_full_rf_table())
    [0, 3.0, 6.0, 9.0, 12.0, 15]

     -Nc             C   s   t | |  || _|dkrXt| }|d dkr:tdttd|| | d| _	n6t
|ttfsntdtdd |D std|| _	dS )	zInitialize the class.N   r   zJSequence length is not a multiple of three (i.e. a whole number of codons)z)rf_table should be a tuple or list objectc             s   s   | ]}t |tV  qd S )N)
isinstanceint).0i r   6lib/python3.7/site-packages/Bio/codonalign/codonseq.py	<genexpr>V   s    z$CodonSeq.__init__.<locals>.<genexpr>zSElements in rf_table should be int that specify the codon positions of the sequence)r   __init__uppergap_charlen
ValueErrorlistrangecountrf_tabler   tuple	TypeErrorall)selfdatar   r   lengthr   r   r   r   6   s    zCodonSeq.__init__c                s   t dd jD dkr tdt|trd|dkrNt|d |d d  S t|d d S n.tt d   fdd	}||}t|S dS )
z&Get the index codon from the sequence.c             S   s   h | ]}|d  qS )r
   r   )r   r   r   r   r   	<setcomp>`   s    z%CodonSeq.get_codon.<locals>.<setcomp>   z}frameshift detected. CodonSeq object is not able to deal with codon sequence with frameshift. Please use normal slice option.r
   Nc                s>    |  }d}x(|D ] }||d |d d  7 }qW t |S )Nr   r
   )str)pZaa_slicecodon_slicer   )aa_indexr   r   r   cslices   s
    
 z"CodonSeq.get_codon.<locals>.cslice)r   r   RuntimeErrorr   r   r$   r   r   )r   indexr(   r&   r   )r'   r   r   	get_codon^   s    
zCodonSeq.get_codonc             C   s
   t | jS )z,Return the number of codons in the CodonSeq.)r   r   )r   r   r   r   get_codon_num}   s    zCodonSeq.get_codon_num*Tc       
   	   C   sP  |dkrt jd }g }|r.t| | jd}nt| }|dkrD| j}d}x|D ]}t|trj|d qNnd|||d  kr|dks|| dkr|}|||d  dddd }	q|| dkr|||d  }	|}n|||d  }	|}|	|j	kr|| qNy||j
|	  W qN tk
r@   td|	 d	Y qNX qNW d|S )
a1  Translate the CodonSeq based on the reading frame in rf_table.

        It is possible for the user to specify
        a rf_table at this point. If you want to include
        gaps in the translated sequence, this is the only
        way. ungap_seq should be set to true for this
        purpose.
        Nr"   r   r#   r	   r
      zUnknown codon detected (z4). Did you forget to specify the ungap_seq argument?)r   generic_by_idr$   replacer   r   r   floatappendstop_codonsforward_tableKeyErrorr)   join)
r   codon_tablestop_symbolr   	ungap_seqZamino_acidsZtr_seqr%   r   codonr   r   r   	translate   s>    



"
zCodonSeq.translatec             C   s   t t| S )zConvert DNA to seq object.)r   r$   )r   r   r   r   toSeq   s    zCodonSeq.toSeqc          	   C   s  t | dd}| jd g}xBtdt| jdd d D ]"}|| j| | j|d    q:W g }d}xPtdt| dD ]:}| ||d  | jd kr||d  n|| dkr|| |d7 }n|| dkrZd| d|d | }|dkr||||   nB|d	kr0||d ||   n |dkrP||d	 ||   |d7 }n|| dkrv||d  y*d| d||d  }||  |8  < W q| tk
r   Y q|X q|W |S )
zReturn full rf_table of the CodonSeq records.

        A full rf_table is different from a normal rf_table in that
        it translate gaps in CodonSeq. It is helpful to construct
        alignment containing frameshift.
        r	   r   r   r"   Nr
   g        )r#      )	r$   r0   r   r   r   r2   r   r   	Exception)r   r9   Zrelative_posr   full_rf_table	codon_numZgap_statZthis_lenr   r   r   get_full_rf_table   s:    ""






zCodonSeq.get_full_rf_tablec             C   s,   |dkrt jd }|  }| j|||ddS )z,Apply full translation with gaps considered.Nr"   F)r7   r8   r   r9   )r   r/   rB   r;   )r   r7   r8   r@   r   r   r   full_translate   s    
zCodonSeq.full_translatec             C   s@   t |dkst|ts&tdt| tt| |d| jdS )z;Return a copy of the sequence without the gap character(s).r"   zUnexpected gap character, %sr   )r   )r   r   r$   r   reprr   r0   r   )r   Zgapr   r   r   ungap   s    zCodonSeq.ungapc             C   s(   |dkr| t |S | t ||dS dS )z&Get codon sequence from sequence data.N)r   )r$   )clsseqr   r   r   r   from_seq   s    zCodonSeq.from_seq)r   r	   N)Nr-   NT)Nr-   )r	   )N)__name__
__module____qualname____doc__r   r+   r,   r;   r<   rB   rC   rE   classmethodrH   r   r   r   r   r      s    
(
3'

r   c          	   C   s   |   }g }xt|D ]\}}t|tr|}yt||d  }W n tk
r\   |d }Y nX t| || }t|dkr|| q|t|  qt| t|t|d  dkr|d q|| t|t|d   qW |S )zAList of codons according to full_rf_table for counting (PRIVATE).r"   r
   z---)	rB   	enumerater   r   
IndexErrorr$   r   r2   rE   )Zcodonseqr@   	codon_lstr   kstartend
this_codonr   r   r   _get_codon_list   s"    
 "rU   NG86Nr"   c             C   sv  t | trt |trn*t | tr8t |tr8| j} |j}ntdt|  t| kr~tdt|   dt|  d|dkrd}n|dk	r|dkrtd|d	krd
dl}|	d| d d}|dkrt
jd }t| }t|}g }	g }
x@t||D ]2\}}d|kr d|kr |	| |
| q W ttttd}|dkr`|| |	|
||S || |	|
||S dS )a  Calculate dN and dS of the given two sequences.

    Available methods:
        - NG86  - `Nei and Gojobori (1986)`_ (PMID 3444411).
        - LWL85 - `Li et al. (1985)`_ (PMID 3916709).
        - ML    - `Goldman and Yang (1994)`_ (PMID 7968486).
        - YN00  - `Yang and Nielsen (2000)`_ (PMID 10666704).

    .. _`Nei and Gojobori (1986)`: http://www.ncbi.nlm.nih.gov/pubmed/3444411
    .. _`Li et al. (1985)`: http://www.ncbi.nlm.nih.gov/pubmed/3916709
    .. _`Goldman and Yang (1994)`: http://mbe.oxfordjournals.org/content/11/5/725
    .. _`Yang and Nielsen (2000)`: https://doi.org/10.1093/oxfordjournals.molbev.a026236

    Arguments:
     - codon_seq1 - CodonSeq or or SeqRecord that contains a CodonSeq
     - codon_seq2 - CodonSeq or or SeqRecord that contains a CodonSeq
     - w  - transition/transversion ratio
     - cfreq - Current codon frequency vector can only be specified
       when you are using ML method. Possible ways of
       getting cfreq are: F1x4, F3x4 and F61.

    zVcal_dn_ds accepts two CodonSeq objects or SeqRecord that contains CodonSeq as its seq!zfull_rf_table length of seq1 (z) and seq2 (z) are not the sameNF3x4MLz8cfreq can only be specified when you are using ML method)F1x4rW   F61r   zUnknown cfreq (zF). Only F1x4, F3x4 and F61 are acceptable. Used F3x4 in the following.r"   r	   )rX   rV   ZLWL85ZYN00)r   r   r   rG   r   r   rB   r)   warningswarnr   r/   rU   zipr2   _ml_ng86_lwl85_yn00)Z
codon_seq1Z
codon_seq2methodr7   rQ   Zcfreqr[   Zseq1_codon_lstZseq2_codon_lstseq1seq2r   jZ	dnds_funcr   r   r   	cal_dn_ds  sB    $


rf   c          	   C   s   t | ||d\}}t |||d\}}|| d }|| d }	ddg}
x4t| |D ]&\}}dd t|
t|||dD }
qPW |
d | }|
d |	 }|dk rtd	tdd
|   }nd}|dk rtd	tdd
|   }nd}||fS )z$NG86 method main function (PRIVATE).)r7   rQ   g       @r   c             S   s   g | ]\}}|| qS r   r   )r   mnr   r   r   
<listcomp>k  s    z_ng86.<locals>.<listcomp>)r7   r"   g      ?g      gUUUUUU?r#   )_count_site_NG86r]   _count_diff_NG86absr   )rc   rd   rQ   r7   S_sites1N_sites1S_sites2N_sites2S_sitesN_sitesSNr   re   pspndSdNr   r   r   r_   b  s     "r_   c             C   s  d}d}d}d}d}x| D ]}g g d}	| dd}|dkrBqxt|D ]\}
}x|D ]}||krhqZ||kr||krt|}|||
< d	|}|	d
 | qZ||kr||krt|}|||
< d	|}|	d
 | qZt|}|||
< d	|}|	d | qZW qLW |j| }d }}xJ|	d
 D ]>}||jkrB|d7 }n"|j| |kr\|d7 }n|d7 }q(W xJ|	d D ]>}||jkr||7 }n"|j| |kr||7 }n||7 }qtW || d }||| 7 }||| 7 }qW ||fS )a  Count synonymous and non-synonymous sites of a list of codons (PRIVATE).

    Arguments:
     - codon_lst - A three letter codon list from a CodonSeq object.
       This can be returned from _get_codon_list method.
     - k - transition/transversion rate ratio.

    r   )AG)TC)rx   rz   r{   ry   )
transitiontransversionUrz   z---r   r|   r}   r"   r
   )r0   rN   r   r6   r2   r4   r3   )rP   r7   rQ   ZS_siteZN_sitepurine
pyrimidine
base_tupler:   neighbor_codonrh   r   re   Zcodon_charsrT   aaZthis_codon_N_siteZthis_codon_S_siteZneighbor
norm_constr   r   r   rj   {  s\    	









rj   c          
      s  t | trt |ts2tdt|  dt| dt| dksJt|dkrhtdt|  dt| dddg}| dks|dkr|S d t fd	d
| D std|  dt fdd
|D std| d| |kr|S g }x4tt| |D ]"\}}|d |d kr|	| qW d dd}t|dkrTdd t||| ||dD }nt|dkrxr|D ]j}| d| ||  | |d d  }dd t||| ||ddD }dd t|||||ddD }qhW nt|dkrt
tdddgd}	g }
x|	D ]}| d|d  ||d   | |d d d  }|d|d  ||d   ||d d d  }|
	||f dd t||| ||ddD }dd t|||||ddD }dd t|||||ddD }qW |S )!zCount differences between two codons, three-letter string (PRIVATE).

    The function will take multiple pathways from codon1 to codon2
    into account.
    z;_count_diff_NG86 accepts string object to represent codon (z, z
 detected)r
   z%codon should be three letter string (r   z---)rx   r{   ry   rz   c             3   s   | ]}| kV  qd S )Nr   )r   r   )r   r   r   r     s    z#_count_diff_NG86.<locals>.<genexpr>z*Unrecognized character detected in codon1 z! (Codons consist of A, T, C or G)c             3   s   | ]}| kV  qd S )Nr   )r   r   )r   r   r   r     s    z*Unrecognized character detected in codon2 r"   c             S   s@   d }}t tt|jj| |gdkr0||7 }n||7 }||fS )z4Compare two codon accounting for different pathways.r   r"   )r   setmapr4   get)codon1codon2r7   weightZsdZndr   r   r   compare_codon  s
    
z'_count_diff_NG86.<locals>.compare_codonc             S   s   g | ]\}}|| qS r   r   )r   r   re   r   r   r   ri     s   z$_count_diff_NG86.<locals>.<listcomp>)r7   r>   Nc             S   s   g | ]\}}|| qS r   r   )r   r   re   r   r   r   ri     s   g      ?)r7   r   c             S   s   g | ]\}}|| qS r   r   )r   r   re   r   r   r   ri     s   c             S   s   g | ]\}}|| qS r   r   )r   r   re   r   r   r   ri     s   gUUUUUU?)r   c             S   s   g | ]\}}|| qS r   r   )r   r   re   r   r   r   ri     s   c             S   s   g | ]\}}|| qS r   r   )r   r   re   r   r   r   ri     s   )r"   )r   r$   r   typer   r)   r   rN   r]   r2   r   r   )r   r   r7   rs   diff_posr   rQ   r   
temp_codonpaths	tmp_codonr%   tmp1tmp2r   )r   r   rk     sr    
	
$
00rk   c          	   C   s  t |}ddg}ddg}ddg}xr| | D ]f}|| }	xX|	D ]P}
|
dkrZ|d  d7  < q<|
dkrt|d  d7  < q<|
dkr<|d  d7  < q<W q*W t|d t|d t|d g}dgd }xPt| |D ]B\}}|dks|dks||krqqd	d
 t|t|||dD }qW dd
 t||d D }|dd }|dd }dd
 t||D }dd
 |D }d|d |d  |d |d |d     |d d|d    }d|d |d  |d |d |d     d|d  d|d    }||fS )zlLWL85 method main function (PRIVATE).

    Nomenclature is according to Li et al. (1985), PMID 3916709.
    r   0r"   24g       @r.   z---c             S   s   g | ]\}}|| qS r   r   )r   r   re   r   r   r   ri   @  s   z_lwl85.<locals>.<listcomp>)	fold_dictc             S   s   g | ]\}}|| qS r   r   )r   r   re   r   r   r   ri   E  s    r>   Nr
   c          	   S   sD   g | ]<\}}d t ddd|  |   dt ddd|     qS )g      ?g      ?r"   r>   g      ?)r   )r   r   re   r   r   r   ri   I  s   c             S   s$   g | ]}d t ddd|    qS )g      ?g      ?r"   r>   )r   )r   r   r   r   r   ri   L  s    )_get_codon_foldsumr]   _diff_codon)rc   rd   rQ   r7   codon_fold_dictZfold0Zfold2Zfold4r:   fold_numfLZPQr   r   PQrx   Brv   rw   r   r   r   r`   %  s<    
"
@Dr`   c             C   s@   dd }i }x&| j D ]}d|kr||| j ||< qW d|d< |S )zFClassify different position in a codon into different folds (PRIVATE).c       
   
   S   s   ddddh}d}t | }xt|D ]\}}|t| }g }xL|D ]D}	|	||< y||d|  W q@ tk
r   |d Y q@X q@W |||  dkr|d7 }n@|||  d	kr|d
7 }n$|||  dkr|d7 }ntd|||< q"W |S )Nrx   rz   r{   ry   r   stopr   r   )r"   r>   r   r
   r   z3Unknown Error, cannot assign the position to a fold)r   rN   r   r2   r6   r5   r   r)   )
r:   r4   baseZfoldZcodon_base_lstr   bZ
other_baser   re   r   r   r   find_fold_classU  s,    



z(_get_codon_fold.<locals>.find_fold_classr~   z---)r4   )r7   r   Z
fold_tabler:   r   r   r   r   R  s    r   c             C   s  d } } } } }}||  }	d}
d}xt t| |D ]l\}\}}||kr||
kr||
kr|	| dkrv|d7 }n<|	| dkr|d7 }n&|	| dkr|d7 }ntd|	|  ||kr$||kr$||kr$|	| dkr|d7 }n>|	| dkr|d7 }n(|	| dkr|d7 }ntd|	|  ||kr:||
kr@||ksP||kr:||
kr:|	| dkrh|d7 }q:|	| dkr|d7 }q:|	| dkr|d7 }q:td|	|  q:W ||||||fS )	zCount number of different substitution types between two codons (PRIVATE).

    returns tuple (P0, P2, P4, Q0, Q2, Q4)

    Nomenclature is according to Li et al. (1958), PMID 3916709.
    r   )rx   ry   )rz   r{   r   r"   r   r   zUnexpected fold_num %d)rN   r]   r)   )r   r   r   ZP0ZP2ZP4ZQ0ZQ2ZQ4r   r   r   rh   r   re   r   r   r   r   w  s>     





$


r   c       +   	      sr  ddl m} ddlm} dddddddddddddddg}t|}|t}|t}	x| | D ]}
|
dkr|d |
d   d7  < |d |
d   d7  < |d |
d   d7  < ||
 }xNt|D ]B\}|dkr||
   d7  < q|d	kr|	|
   d7  < qW qbW t| }t|	 }x8t	||	D ]*\}| | |< |	 | |	< q6W t
| ||d
}t||t|	|f}||d  ||d   ||  }x@tdD ]4t|  fdd|  D |< qW |t}x0t|j |j D ]dkrd|< qW x"| | D ]|  d7  < q,W t| ||||d\}}}t|| |||d\}}}|| d }|| d }ddddddddddg}xFtdD ]:x2dD ]*}| | | |  d | |< qW qW ddg}x6t	| |D ](\}dd t	|t||d
D }qW |d | }|d | } t|||  }!tdd|   tdd|   }"dtdd|!   }#dddg}$xtdD ]}%dd t|j |j D }&t|||"|&|}'||'|# }(ddddg}i  xPt	| |D ]B\}dkr|dkr |fd  |f  d7  < qW x@ D ]8td d |(|&|}) fddt	||)D }qNW |d | |d | f|d | |d | ff}g }*x,t	||D ]\}})|*t||)dd qW |*d d | ||  |*d d | ||   }#|*d |*d  }"ttfdddd t	|*|$D rd|*d |*d fS |*}$qW dS ) zsYN00 method main function (PRIVATE).

    Nomenclature is according to Yang and Nielsen (2000), PMID 10666704.
    r   )defaultdict)expm)rx   ry   r{   rz   z---r"   r>   r   r   )r7   r
   c                s   i | ]\}}|  |qS r   r   )r   re   rQ   )totr   r   
<dictcomp>  s    z_yn00.<locals>.<dictcomp>r~   )rQ   r7   )rx   rz   r{   ry   c             S   s   g | ]\}}|| qS r   r   )r   rg   rh   r   r   r   ri     s    z_yn00.<locals>.<listcomp>gUUUUUU?g      gh㈵>   c             S   s   g | ]}d |kr|qS )r~   r   )r   r   r   r   r   ri     s   c                s    g | ]\}}||    qS r   r   )r   rg   rh   )codon_npathr   r   r   ri     s    T)tc                s   |  k S )Nr   )x)	tolerancer   r   <lambda>      z_yn00.<locals>.<lambda>c             s   s   | ]\}}t || V  qd S )N)rl   )r   r   re   r   r   r   r     s    z_yn00.<locals>.<genexpr>N)collectionsr   scipy.linalgr   r   r   rN   r   valuesr]   _get_TV_get_kappa_tr   itemsr   r4   keysr3   _count_site_YN00rk   r   _get_Q
setdefault_count_diff_YN00r2   r   r   )+rc   rd   rQ   r7   r   r   fcodonr   Z	fold0_cntZ	fold4_cntr:   r   r   Zf0_totalZf4_totalre   TVZk04Zkappapirm   rn   ZbfreqSN1ro   rp   ZbfreqSN2rr   rq   ZbfreqSNr   rs   rt   ru   r%   wr   ZdSdN_preZtemprP   r   r   ZtvZdSdNr   )r   r   r   r   r   ra     s     $

0$ 
 0$&ra   c             C   s   d}d}ddg}d}xt | |D ]\}}d||fkr x|t ||D ]n\}	}
|	|
krRnT|	|krt|
|krt|d  d7  < n2|	|kr|
|kr|d  d7  < n|d  d7  < |d7 }q@W q W |d | |d | fS )zGet TV (PRIVATE).

    Arguments:
     - T - proportions of transitional differences
     - V - proportions of transversional differences

    )rx   ry   )r{   rz   r   z---r"   )r]   )
codon_lst1
codon_lst2r7   r   r   r   Zsitesr   r   r   re   r   r   r   r     s     r   Fc       	      C   s  | d | d  | d< | d | d  | d< d| d | d  | d | d    d| d | d  | d  | d  | d | d  | d  | d    d|d d| d  | d      |d	  d| d | d  | d  | d | d  | d     }d|d d| d  | d    }d
t | }d
t | }|| d }|dkrd| d | d  | d  | d | d  | d   | | d | d  | d | d     }|S d| d  | d  d|| d    d| d  | d  d|| d     d| d  | d   | }|S dS )zmCalculate kappa (PRIVATE).

    The following formula and variable names are according to PMID: 10666704
    rz   r{   Yrx   ry   Rr>   r"   r   g      F   N)r   )	r   r   r   rx   r   ar   ZkappaF84Z
kappaHKY85r   r   r   r   3  s    	6 
VZr   c             C   s`  t | t |kr*tdt | t |f nt | }d}d}d}|j}	|j}
i }xJt| |D ]<\}}|dkrZ|dkrZ|||fd |||f  d7  < qZW d }}ddddddddddg}x8| D ]*\}}|d }d }}xtdD ]}x|D ]}|| |krq|d	| | ||d d	  }||
kr8q|| }|| |krb||krb||9 }n || |kr||kr||9 }|	| |	| kr||7 }|d |  || 7  < q||7 }|d |  || 7  < qW qW ||| 7 }||| 7 }qW d| ||  }||9 }||9 }x:|D ]2}t|	 }x|D ]}||  |  < q6W q W |||fS )
a  Site counting method from Ina / Yang and Nielsen (PRIVATE).

    Method from `Ina (1995)`_ as modified by `Yang and Nielsen (2000)`_.
    This will return the total number of synonymous and nonsynonymous sites
    and base frequencies in each category. The function is equivalent to
    the ``CountSites()`` function in ``yn00.c`` of PAML.

    .. _`Ina (1995)`: https://doi.org/10.1007/BF00167113
    .. _`Yang and Nielsen (2000)`: https://doi.org/10.1093/oxfordjournals.molbev.a026236

    z?Length of two codon_lst should be the same (%d and %d detected))rx   ry   )rz   r{   )rx   rz   r{   ry   z---r   r"   r
   N)
r   r)   r4   r3   r]   r   r   r   r   r   )r   r   r   rQ   r7   r    r   r   r   Z
codon_dictr   r   r   re   rq   rr   ZfreqSNZ
codon_pairZnpathr:   SNposr   r   r   r   r   r   r   r   r   V  s`    
 

 

r   c                sf  t trt ts2tdt dt dtdksJtdkrhtdt dt dddddg}dksdkr|S d t fd	d
D std dt fdd
D std dkr|S g }x4ttD ]"\}}|d |d kr|	| qW ddd}	t|dkr\dd t||	|d |D }nt|dkrfdd|D }
g xb|
D ]Z}t
t|j|g}||d |d f ||d |d f f}	|d |d   qW fddD xt|D ]\}}d| |  |d d  }dd t||	|||| d dD }dd t||	|||| d dD }qW nt|dkrbt
tdddgd}g g }
x|D ]}d|d  |d   |d d d  }|d|d  |d   ||d d d  }|
	||f t
t|j||g}||d |d f ||d |d f ||d |d f f}	|d |d  |d   qW fddD xt|
|D ]\}}}dd t||	|d |d ||d dD }dd t||	|d |d |d ||d dD }dd t||	|d |d ||d dD }qW |S ) a&  Count differences between two codons (three-letter string; PRIVATE).

    The function will weighted multiple pathways from codon1 to codon2
    according to P matrix of codon substitution. The proportion
    of transition and transversion (TV) will also be calculated in
    the function.
    z;_count_diff_YN00 accepts string object to represent codon (z, z
 detected)r
   z%codon should be three letter string (r   z---)rx   r{   ry   rz   c             3   s   | ]}| kV  qd S )Nr   )r   r   )r   r   r   r     s    z#_count_diff_YN00.<locals>.<genexpr>z*Unrecognized character detected in codon1 z! (Codons consist of A, T, C or G)c             3   s   | ]}| kV  qd S )Nr   )r   r   )r   r   r   r     s    z*Unrecognized character detected in codon2 r"   c       	      S   s@  d}d}|j }|j}| |ks$||krz| | |krH|| |krHdd|dgS | | |krl|| |krldd|dgS ddd|gS n||  || kr| | |kr|| |kr|dddgS | | |kr|| |kr|dddgS d|ddgS n\| | |kr|| |krdd|dgS | | |kr0|| |kr0dd|dgS ddd|gS d S )N)rx   ry   )rz   r{   r   )r4   r3   )	r   r   diffr7   r   r   r   Zdicr   r   r   r   count_TV  s*    z"_count_diff_YN00.<locals>.count_TVc             S   s   g | ]\}}|| qS r   r   )r   r%   qr   r   r   ri     s   z$_count_diff_YN00.<locals>.<listcomp>r>   c                s0   g | ](} d | |   |d d   qS )Nr"   r   )r   r   )r   r   r   r   ri     s    c                s   g | ]}d | t   qS )r>   )r   )r   r   )	path_probr   r   ri     s    Nc             S   s   g | ]\}}|| qS r   r   )r   r%   r   r   r   r   ri     s   )r   c             S   s   g | ]\}}|| qS r   r   )r   r%   r   r   r   r   ri     s   c                s   g | ]}d | t   qS )r
   )r   )r   r   )r   r   r   ri     s    c             S   s   g | ]\}}|| qS r   r   )r   r%   r   r   r   r   ri     s   c             S   s   g | ]\}}|| qS r   r   )r   r%   r   r   r   r   ri     s   c             S   s   g | ]\}}|| qS r   r   )r   r%   r   r   r   r   ri   !  s   )r"   )r   r$   r   r   r   r)   r   rN   r]   r2   r   r   r*   r   )r   r   r   rP   r7   r   r   r   rQ   r   r   Z	codon_idxZprobrh   r   r   r%   r   r   re   r   )r   r   r   r   r   r     s    
 
($(
00$&*,r   c          
   C   s  ddl m} ddlm} | }t| |||d}x6t| |D ](\}}	d||	fkr:|||	f  d7  < q:W dd t|j |j	 D }
|||
|fd	d
}||dddgdddd}|j
\}}}t||||
|}d }}xt|
D ]\}}xt|
D ]t\}	}||	kryL|j| |j| kr0||| |||	f  7 }n||| |||	f  7 }W q tk
r`   Y qX qW qW ||9 }||9 }|||
|fdd}||ddgdddd}|j
\}}d}t||||
|}d }}xt|
D ]\}}xt|
D ]x\}	}||	kryL|j| |j| kr(||| |||	f  7 }n||| |||	f  7 }W n tk
rX   Y nX qW qW |d9 }|d9 }|| }|| }||fS )z"ML method main function (PRIVATE).r   )Counter)minimize)r7   z---r"   c             S   s   g | ]}d |kr|qS )r~   r   )r   r   r   r   r   ri   :  s   z_ml.<locals>.<listcomp>c          	   S   s$   t | d | d | d ||||d S )z'Temporary function, params = [t, k, w].r   r"   r>   )rP   r7   )_likelihood_func)paramsr   	codon_cntrP   r7   r   r   r   func@  s    z_ml.<locals>.funcg?r>   zL-BFGS-B))g|=r   )g|=r   )g|=
   gh㈵>)rb   ZboundsZtolc          	   S   s    t | d | d d||||d S )z5Temporary function, params = [t, k]. w is fixed to 1.r   r"   g      ?)rP   r7   )r   )r   r   r   rP   r7   r   r   r   func_w1j  s    z_ml.<locals>.func_w1))g|=r   )g|=r   g      ?r
   )r   r   Zscipy.optimizer   _get_pir]   r   r4   r   r3   r   r   rN   r5   )rc   rd   cmethodr7   r   r   r   r   r   re   rP   r   Zopt_resr   rQ   r   r   ZSdZNdc1c2r   ZrhoSZrhoNrw   rv   r   r   r   r^   .  sn    

r^   c                s  i }|dkrddddd}x6| | D ]*}|dkr$x|D ]}||  d7  < q6W q$W t |   fdd| D }xH|j |j D ]4}d|kr||d  ||d   ||d	   ||< qW n|d
krdddddddddddddddg}x`| | D ]T}|dkr|d |d   d7  < |d |d   d7  < |d	 |d	   d7  < qW x@tdD ]4}t ||    fdd||  D ||< q`W xt|j |j D ]D}d|kr|d |d  |d |d   |d	 |d	   ||< qW n|dkrx,|j |j D ]}d|krd||< qW x,| | D ] }|dkr:||  d7  < q:W t |   fdd| D }|S )zObtain codon frequency dict (pi) from two codon list (PRIVATE).

    This function is designed for ML method. Available counting methods
    (cfreq) are F1x4, F3x4 and F64.
    rY   r   )rx   ry   r{   rz   z---r"   c                s   i | ]\}}|  |qS r   r   )r   re   rQ   )r   r   r   r     s    z_get_pi.<locals>.<dictcomp>r~   r>   rW   r
   c                s   i | ]\}}|  |qS r   r   )r   re   rQ   )r   r   r   r     s    rZ   g?c                s   i | ]\}}|  |qS r   r   )r   re   rQ   )r   r   r   r     s    )r   r   r   r4   r   r3   r   r   )rc   rd   r   r7   r   r   r   cr   )r   r   r     sJ    	
0
$
<


r   c             C   s|  | |krdS | |j ks ||j kr$dS | |ks4||kr8dS d}d}g }x6tt| |D ]$\}	\}
}|
|krT||	|
|f qTW t|dkrdS |j|  |j| kr|d d |kr|d d |kr|||  S |d d |kr|d d |kr|||  S || S nt|d d |kr8|d d |kr8|| ||  S |d d |krl|d d |krl|| ||  S |||  S dS )a   Q matrix for codon substitution (PRIVATE).

    Arguments:
     - i, j  : three letter codon string
     - pi    : expected codon frequency
     - k     : transition/transversion ratio
     - w     : nonsynonymous/synonymous rate ratio
     - codon_table: Bio.Data.CodonTable object

    r   )rx   ry   )rz   r{   r>   r"   N)r3   rN   r]   r2   r   r4   )r   re   r   rQ   r   r7   r   r   r   rh   r   r   r   r   r   _q  s2      
$$r   c          
   C   s   ddl }t|}|||f}xNt|D ]B}x<t|D ]0}	||	kr6t|| ||	 | |||d|||	f< q6W q(W d}
xft|D ]Z}t||ddf  |||f< y"|
| ||  |||f   7 }
W q| tk
r   Y q|X q|W ||
 }|S )z*Q matrix for codon substitution (PRIVATE).r   N)r7   )Znumpyr   Zzerosr   r   r   r5   )r   rQ   r   rP   r7   ZnprA   r   r   re   Znucl_substitutionsr   r   r   r     s"    *"
r   c          	   C   s   ddl m} t|||||}|||  }	d}
xt|D ]\}}xvt|D ]j\}}||f|krH|	||f ||  dkr|
|||f d 7 }
qH|
|||f t|| |	||f   7 }
qHW q6W |
S )z,Likelihood function for ML method (PRIVATE).r   )r   )r   r   r   rN   r   )r   rQ   r   r   r   rP   r7   r   r   r   Z
likelihoodr   r   re   r   r   r   r   r     s    0r   __main__)run_doctest)rV   Nr"   N)r"   )F)!rL   	itertoolsr   Zmathr   ZBio.Seqr   ZBio.SeqRecordr   ZBio.Datar   r   rU   rf   r_   rj   rk   r`   r   r   ra   r   r   r   r   r^   r   r   r   r   rI   Z
Bio._utilsr   r   r   r   r   <module>   s:    h
J
>l-%1o
#C i32
