B
    ‰°b„  ã               @   s„   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ d
ZG dd„ deƒZddd„ZG dd„ dƒZdS )aS  Bio.AlignIO support for the "maf" multiple alignment format.

The Multiple Alignment Format, described by UCSC, stores a series of
multiple alignments in a single file. It is suitable for whole-genome
to whole-genome alignments, metadata such as source chromosome, start
position, size, and strand can be stored.

See http://genome.ucsc.edu/FAQ/FAQformat.html#format5

You are expected to use this module via the Bio.AlignIO functions(or the
Bio.SeqIO functions if you want to work directly with the gapped sequences).

Coordinates in the MAF format are defined in terms of zero-based start
positions (like Python) and aligning region sizes.

A minimal aligned region of length one and starting at first position in the
source sequence would have ``start == 0`` and ``size == 1``.

As we can see on this example, ``start + size`` will give one more than the
zero-based end position. We can therefore manipulate ``start`` and
``start + size`` as python list slice boundaries.

For an inclusive end coordinate, we need to use ``end = start + size - 1``.
A 1-column wide alignment would have ``start == end``.
é    N)Úislice)Údbapi2)ÚMultipleSeqAlignment)ÚSeq)Ú	SeqRecordé   )ÚSequentialAlignmentWriteré   c               @   s(   e Zd ZdZdd„ Zdd„ Zdd„ ZdS )	Ú	MafWriterz9Accepts a MultipleSeqAlignment object, writes a MAF file.c             C   s   | j  d¡ | j  d¡ dS )zWrite the MAF header.z##maf version=1 scoring=none
z# generated by Biopython

N)ÚhandleÚwrite)Úself© r   ú0lib/python3.7/site-packages/Bio/AlignIO/MafIO.pyÚwrite_header4   s    zMafWriter.write_headerc             C   s¬   |j  d¡dkrd}n|j  d¡dkr,d}nd}dd|j dd	¡ d
|j  dd¡ d|j  dtt|jƒ dd¡ƒ¡ |d
|j  dd¡ t|jƒg}| j dd 	|¡ ¡ dS )zHWrite a single SeqRecord object to an 's' line in a MAF block (PRIVATE).Ústrandr   ú+éÿÿÿÿú-Úsz%-40sú Ú_z%15sÚstartr   z%5sÚsizeÚ ÚsrcSizez%s
N)
ÚannotationsÚgetÚidÚreplaceÚlenÚstrÚseqr   r   Újoin)r   Úrecordr   Zfieldsr   r   r   Ú_write_record9   s     zMafWriter._write_recordc             C   s®   t |tƒstdƒ‚tdd„ |D ƒƒdkr0tdƒ‚yd dd„ |j ¡ D ƒ¡}W n tk
rf   d	}Y nX | j	 
d
|f ¡ d}x|D ]}|  |¡ |d7 }q„W | j	 
d¡ |S )zÅWrite a complete alignment to a MAF block.

        Writes every SeqRecord in a MultipleSeqAlignment object to its own
        MAF block (beginning with an 'a' line, containing 's' lines).
        zExpected an alignment objectc             S   s   h | ]}t |ƒ’qS r   )r    )Ú.0Úxr   r   r   ú	<setcomp>Z   s    z,MafWriter.write_alignment.<locals>.<setcomp>r   z%Sequences must all be the same lengthr   c             S   s$   g | ]\}}|d krd||f ‘qS ))ZscoreÚpassz%s=%sr   )r&   r'   Úyr   r   r   ú
<listcomp>e   s   z-MafWriter.write_alignment.<locals>.<listcomp>z
score=0.00za %s
r   Ú
)Ú
isinstancer   Ú	TypeErrorr    Ú
ValueErrorr#   Ú_annotationsÚitemsÚAttributeErrorr   r   r%   )r   Ú	alignmentÚannoZrecs_outr$   r   r   r   Úwrite_alignmentQ   s"    



zMafWriter.write_alignmentN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r%   r5   r   r   r   r   r
   1   s   r
   c          	   c   sB  d}g }g }x.yt | ƒ}W n tk
r4   d}Y nX |rÒ| d¡rB| ¡  ¡ }t|ƒdkrhtdƒ‚|d dkrzd}n|d d	krŒd
}nd}t|d ƒt|d ƒ|t|d ƒdœ}|d }	d|	kr|sÔtdƒ‚|d j}
g }x.t	|	|
ƒD ] \}}| 
|dkr|n|¡ qîW d |¡}	| 
tt|	ƒ|d |d d|d¡ nŽ| d¡rPn€| d¡r^nr| d¡rlnd| d¡rznV| ¡ sÂ|dk	r t|ƒ|ks t‚t|ƒ}||_|V  d}g }g }ntd|f ƒ‚q| d¡r&d}| ¡  ¡ dd… }t|ƒ| d¡krtdƒ‚tdd„ |D ƒƒ}q| d¡r4q|sP qW dS ) zåIterate over a MAF file handle as MultipleSeqAlignment objects.

    Iterates over lines in a MAF file-like object (handle), yielding
    MultipleSeqAlignment objects. SeqRecord IDs generally correspond to
    species names.
    Fr   r   é   z5Error parsing alignment - 's' line must have 7 fieldsé   r   r   r   r   r	   é   é   )r   r   r   r   é   Ú.z/Found dot/period in first sequence of alignmentr   )r   ÚnameÚdescriptionr   ÚiÚeÚqú#Nz-Error parsing alignment - unexpected line:
%sÚaTú=z1Error parsing alignment - invalid key in 'a' linec             s   s   | ]}|  d ¡V  qdS )rG   N)Úsplit)r&   Za_stringr   r   r   ú	<genexpr>ó   s    zMafIterator.<locals>.<genexpr>)ÚnextÚStopIterationÚ
startswithÚstriprH   r    r/   Úintr"   ÚzipÚappendr#   r   r   ÚAssertionErrorr   r0   ÚcountÚdict)r   Z	seq_countZin_a_bundler   ZrecordsÚlineÚ
line_splitr   r4   ÚsequenceÚrefÚnewZletterZ
ref_letterr3   Zannot_stringsr   r   r   ÚMafIterator}   sŠ    







rY   c               @   sr   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zed
d„ ƒZ	edd„ ƒZ
dd„ Zdd„ Zddd„Zdd„ Zdd„ ZdS )ÚMafIndexz¼Index for a MAF file.

    The index is a sqlite3 database that is built upon creation of the object
    if necessary, and queried when methods *search* or *get_spliced* are
    used.
    c             C   s~   || _ || _tj tj |¡¡| _|| _t| jƒ| _	tj 
|¡rXt |¡| _|  ¡ | _nt |¡| _|  ¡ | _t| j	ƒ| _dS )z)Indexes or loads the index of a MAF file.N)Ú_target_seqnameÚ_index_filenameÚosÚpathÚabspathÚdirnameÚ_relative_pathÚ	_maf_fileÚopenÚ_maf_fpÚisfiler   ZconnectÚ_conÚ_MafIndex__check_existing_dbÚ_record_countÚ_MafIndex__make_new_indexrY   Ú_mafiter)r   Zsqlite_fileZmaf_fileZtarget_seqnamer   r   r   Ú__init__  s    
zMafIndex.__init__c       	   
   C   sx  y6t | j d¡ ¡ d ƒ}|tkrDd d| d| j g¡}t|ƒ‚| j d¡ ¡ d }tj	 
|¡rj|}ntj	 | j| dtj	j¡¡}|tj	 | j¡krªtd|| jf ƒ‚| j d	¡ ¡ d }|| jkrÚtd
|| jf ƒ‚t | j d¡ ¡ d ƒ}|dkrtdƒ‚t | j d¡ ¡ d ƒ}||kr6td||f ƒ‚|S  tjtjfk
rr } ztd| ƒd‚W dd}~X Y nX dS )zEPerform basic sanity checks upon loading an existing index (PRIVATE).z1SELECT value FROM meta_data WHERE key = 'version'r   r,   z=Index version (%s) incompatible with this version of MafIndexz;You might erase the existing index %s for it to be rebuilt.z2SELECT value FROM meta_data WHERE key = 'filename'ú/z&Index uses a different file (%s != %s)z8SELECT value FROM meta_data WHERE key = 'target_seqname'z-Provided database indexed for %s, expected %sz6SELECT value FROM meta_data WHERE key = 'record_count'r   z$Unfinished/partial database providedz SELECT COUNT(*) FROM offset_dataz.Expected %s records, found %s.  Corrupt index?z Problem with SQLite database: %sN)rN   rf   ÚexecuteZfetchoneÚMAFINDEX_VERSIONr#   r\   r/   r]   r^   Úisabsra   r   Úsepr_   rb   r[   r   ZOperationalErrorZDatabaseError)	r   Zidx_versionÚmsgÚfilenameZtmp_mafpathZ	db_targetZrecord_countZrecords_foundÚerrr   r   r   Z__check_existing_db  sZ    


zMafIndex.__check_existing_dbc             C   s„  | j  d¡ | j  dt ¡ | j  d¡ | j  d| jf ¡ tj | j¡sxtj | j¡sxtj 	| j| j
¡ tjjd¡}n\tj tj | j¡¡tjj  | j
tjj ¡rÆtj 	| j| j
¡ tjjd¡}ntj | j¡}| j  d|f ¡ | j  d¡ d}|  ¡ }x@tt|d	ƒƒ}|sP | j  d
|¡ | j  ¡  |t|ƒ7 }q W | j  d¡ | j  d¡ | j  d¡ | j  d|f ¡ | j  ¡  |S )z2Read MAF file and generate SQLite index (PRIVATE).z.CREATE TABLE meta_data (key TEXT, value TEXT);z:INSERT INTO meta_data (key, value) VALUES ('version', %s);z?INSERT INTO meta_data (key, value) VALUES ('record_count', -1);zCINSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');rl   z=INSERT INTO meta_data (key, value) VALUES ('filename', '%s');zSCREATE TABLE offset_data (bin INTEGER, start INTEGER, end INTEGER, offset INTEGER);r   éd   zCINSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);z9CREATE INDEX IF NOT EXISTS bin_index ON offset_data(bin);z=CREATE INDEX IF NOT EXISTS start_index ON offset_data(start);z9CREATE INDEX IF NOT EXISTS end_index ON offset_data(end);z<UPDATE meta_data SET value = '%s' WHERE key = 'record_count')rf   rm   rn   r[   r]   r^   ro   rb   r\   Úrelpathra   r   rp   r`   r_   rL   Ú_MafIndex__maf_indexerÚlistr   ZexecutemanyZcommitr    )r   ZmafpathZinsert_countZmafindex_funcZbatchr   r   r   Z__make_new_indexa  sT    


zMafIndex.__make_new_indexc             c   s  | j  ¡ }x|r| d¡r| j  ¡ t|ƒ }xÎ| j  ¡ }| ¡ rP| d¡rbtd| jf ƒ‚q4| d¡r4| ¡  ¡ }|d | jkr4t	|d ƒ}t	|d ƒ}|t|d  
dd	¡ƒkrÖtd
|t|d  
dd	¡ƒf ƒ‚|| d }|  ||d ¡|||fV  P q4W | j  ¡ }qW dS )zåReturn index information for each bundle (PRIVATE).

        Yields index information for each bundle in the form of
        (bin, start, end, offset) tuples where start and end are
        0-based inclusive coordinates.
        rF   z1Target for indexing (%s) not found in this bundler   r   r	   r<   r>   r   r   z=Invalid length for target coordinates (expected %s, found %s)N)rd   ÚreadlinerL   Útellr    rM   r/   r[   rH   rN   r   Ú_ucscbin)r   rT   ÚoffsetrU   r   r   Úendr   r   r   Z__maf_indexer³  s.    



zMafIndex.__maf_indexerc             C   s    ddg}|  td| d?  d|d d?  ƒ¡ |  td| d?  d|d d?  ƒ¡ |  td| d	?  d
|d d	?  ƒ¡ |  td| d?  d|d d?  ƒ¡ t|ƒS )zªFind bins that a region may belong to (PRIVATE).

        Converts a region to a list of bins that it may belong to, including largest
        and smallest bins.
        r   r   é   r	   é	   é   é
   éI   é   éJ   iI  é   iJ  )ÚextendÚrangeÚset)r   r|   Zbinsr   r   r   Ú_region2binä  s    $$$$zMafIndex._region2binc             C   sd   dddddg}d}d}| }|d }||L }||L }x,|D ]$}||krL|| S ||L }||L }q8W dS )z—Return the smallest bin a given region will fit into (PRIVATE).

        Adapted from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
        iI  r   r~   r   r   r„   r<   r   )r   r|   Zbin_offsetsZ_bin_first_shiftZ_bin_next_shiftZ	start_binZend_binZ
bin_offsetr   r   r   rz   ô  s    
zMafIndex._ucscbinc             C   s   | j  |¡ t| jƒS )zFRetrieve a single MAF record located at the offset provided (PRIVATE).)rd   ÚseekrJ   rj   )r   r{   r   r   r   Ú_get_record  s    zMafIndex._get_recordc          
   c   s”  t |ƒt |ƒkrtdƒ‚x8t||ƒD ]*\}}|| }|dk r$td|||f ƒ‚q$W | j}tƒ }x.t||ƒD ]\}}yd tt|  ||¡ƒ¡}W n& t	k
r¸   t	d||f ƒd‚Y nX | 
d|||d |d f ¡}	|	 ¡ }
xª|
D ]¢\}}}||f|krqæn| ||f¡ |  t|ƒ¡}xb|D ]Z}|j| jkr$|jd }||jd	  d }||krf||ks$td
|||||f ƒ‚q$W |V  qæW qlW dS )aä  Search index database for MAF records overlapping ranges provided.

        Returns *MultipleSeqAlignment* results in order by start, then end, then
        internal offset field.

        *starts* should be a list of 0-based start coordinates of segments in the reference.
        *ends* should be the list of the corresponding segment ends
        (in the half-open UCSC convention:
        http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).
        z2Every position in starts must have a match in endsr   z7Exon coordinates (%d, %d) invalid: exon length (%d) < 1z, z4Exon coordinates must be integers (start=%d, end=%d)NzžSELECT DISTINCT start, end, offset FROM offset_data WHERE bin IN (%s) AND (end BETWEEN %s AND %s OR %s BETWEEN start AND end) ORDER BY start, end, offset ASC;r   r   z'Expected %s-%s @ offset %s, found %s-%s)r    r/   rO   rf   r‡   r#   Úmapr!   rˆ   r.   rm   ZfetchallÚaddrŠ   rN   r   r[   r   )r   ÚstartsÚendsÚ	exonstartÚexonendZexonlenZconZyielded_rec_coordsZpossible_binsÚresultZrowsÚ	rec_startÚrec_endr{   Úfetchedr$   r   r|   r   r   r   Úsearch  sJ    



zMafIndex.searchr   c           
   C   s   |dkrt d| ƒ‚t|  ||¡ƒ}tdd„ t||ƒD ƒƒ}t|ƒdkrdtttd| ƒ| j	dgƒS dd	„ |D ƒ}d
d„ |D ƒ}d}d}	x’|D ]ˆ}
x|
D ]î}|j
| j	kržyL|	dkrÔ|jd }	|	dkrøt dƒ‚n$|	|jd krøt d|jd |	f ƒ‚W n& tk
r    t d| j	 ƒd‚Y nX t|ƒ}|jd }|jd }|| d }||7 }x6|
D ].}x&t||d ƒD ]}d||j
 |< qlW qXW P qžW t d| j	f ƒ‚|}xttd|ƒD ]f}xB|
D ]:}|j
| j	krÖ|j| }||j
 |  |j| 7  < qºW |dkr°||k r°|d7 }q°W qW t|| j	 ƒ|krPt d| j	t|| j	 ƒ|f ƒ‚dd„ || j	  ¡ D ƒ}i }x®|D ]¦}|| }g }|| j	kr’dnd}|j}xjt||ƒD ]\\}}xPt||ƒD ]B}||krØ||| ƒ n$||krô||||  ƒ n||ƒ q¼W q¨W d |¡||< qrW t|| j	  dd¡ƒ|kr^t dt|| j	  dd¡ƒ| j	|f ƒ‚t|| j	 ƒ}x:| ¡ D ].\}}t|ƒ|krvt dt|ƒ||f ƒ‚qvW g }xJ| ¡ D ]>\}}t|ƒ}||	krÔ|n| ¡ }| t|||dd¡ q¶W t|ƒS )aé  Return a multiple alignment of the exact sequence range provided.

        Accepts two lists of start and end positions on target_seqname, representing
        exons to be spliced in silico.  Returns a *MultipleSeqAlignment* of the
        desired sequences spliced together.

        *starts* should be a list of 0-based start coordinates of segments in the reference.
        *ends* should be the list of the corresponding segment ends
        (in the half-open UCSC convention:
        http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/).

        To ask for the alignment portion corresponding to the first 100
        nucleotides of the reference sequence, you would use
        ``search([0], [100])``
        )r   r   zStrand must be 1 or -1, got %sc             s   s   | ]\}}|| V  qd S )Nr   )r&   r   r|   r   r   r   rI     s    z'MafIndex.get_spliced.<locals>.<genexpr>r   ÚN)r   c             S   s   h | ]}|D ]
}|j ’qqS r   )r   )r&   ÚmultiseqrV   r   r   r   r(   ˜  s    z'MafIndex.get_spliced.<locals>.<setcomp>c             S   s   i | ]
}i |“qS r   r   )r&   Zseq_namer   r   r   ú
<dictcomp>¡  s    z(MafIndex.get_spliced.<locals>.<dictcomp>Nr   zStrand must be 1 or -1z8Encountered strand='%s' on target seqname, expected '%s'z-No strand information for target seqname (%s)r   r   r   r   z#Did not find %s in alignment bundler   z/Target seqname (%s) has %s records, expected %sc             S   s&   i | ]\}}t |ƒd krt |ƒ|“qS )r   )r    )r&   ÚposZgapped_fragmentr   r   r   r˜   õ  s   z9Returning %s letters for target seqname (%s), expected %sz'Returning length %s for %s, expected %s)r   r@   rA   )r/   rw   r•   ÚsumrO   r    r   r   r   r[   r   r   ÚKeyErrorr†   r"   r1   rP   r#   r   Zreverse_complement) r   r   rŽ   r   r”   Zexpected_lettersZall_seqnamesZsplit_by_positionZtotal_rec_lengthZref_first_strandr—   ZseqrecZ
rec_lengthr’   Zungapped_lengthr“   r™   Zreal_posZ
gapped_posZ	track_valZrealpos_to_lenZsubseqZseqidZ	seq_splitZ
seq_spliceZfiller_charrP   r   r   Zref_subseq_lenr"   Zresult_multiseqr   r   r   Úget_splicedu  s®    	







"




zMafIndex.get_splicedc             C   s   d| j j| jf S )z,Return a string representation of the index.z%MafIO.MafIndex(%r, target_seqname=%r))rd   r@   r[   )r   r   r   r   Ú__repr__8  s    zMafIndex.__repr__c             C   s   | j S )z*Return the number of records in the index.)rh   )r   r   r   r   Ú__len__?  s    zMafIndex.__len__N)r   )r6   r7   r8   r9   rk   rg   ri   rv   Ústaticmethodrˆ   rz   rŠ   r•   rœ   r   rž   r   r   r   r   rZ   û   s   GR1c
 DrZ   )N)r9   r]   Ú	itertoolsr   Zsqlite3r   Z	Bio.Alignr   ZBio.Seqr   ZBio.SeqRecordr   Z
Interfacesr   rn   r
   rY   rZ   r   r   r   r   Ú<module>"   s   L
~