B
    ‰°bçP  ã               @   sÀ   d Z G dd„ dƒZG dd„ dƒZG dd„ dƒZG dd„ dƒZG d	d
„ d
ƒZG dd„ dƒZG dd„ dƒZG dd„ dƒZG dd„ dƒZ	G dd„ dƒZ
G dd„ dƒZdd„ ZG dd„ dƒZdd„ ZdS )a“  Parser for ACE files output by PHRAP.

Written by Frank Kauff (fkauff@duke.edu) and
Cymon J. Cox (cymon@duke.edu)

Usage:

There are two ways of reading an ace file:

1. The function 'read' reads the whole file at once;
2. The function 'parse' reads the file contig after contig.

First option, parse whole ace file at once::

        from Bio.Sequencing import Ace
        acefilerecord = Ace.read(open('my_ace_file.ace'))

This gives you:
 - acefilerecord.ncontigs (the number of contigs in the ace file)
 - acefilerecord.nreads (the number of reads in the ace file)
 - acefilerecord.contigs[] (one instance of the Contig class for each contig)

The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used
for this contig in a list of instances of the Read class, e.g.::

        contig3 = acefilerecord.contigs[2]
        read4 = contig3.reads[3]
        RD_of_read4 = read4.rd
        DS_of_read4 = read4.ds

CT, WA, RT tags from the end of the file can appear anywhere are automatically
sorted into the right place.

see _RecordConsumer for details.

The second option is to  iterate over the contigs of an ace file one by one
in the ususal way::

    from Bio.Sequencing import Ace
    contigs = Ace.parse(open('my_ace_file.ace'))
    for contig in contigs:
        print(contig.name)
        ...

Please note that for memory efficiency, when using the iterator approach, only one
contig is kept in memory at once.  However, there can be a footer to the ACE file
containing WA, CT, RT or WR tags which contain additional meta-data on the contigs.
Because the parser doesn't see this data until the final record, it cannot be added to
the appropriate records.  Instead these tags will be returned with the last contig record.
Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags
are needed, the 'read' function rather than the 'parse' function might be more appropriate.
c               @   s   e Zd ZdZdd„ ZdS )Úrdz‡RD (reads), store a read with its name, sequence etc.

    The location and strand each read is mapped to is held in the AF lines.
    c             C   s"   d| _ d| _d| _d| _d| _dS )zInitialize the class.Ú N)ÚnameÚpadded_basesÚ
info_itemsÚ	read_tagsÚsequence)Úself© r	   ú1lib/python3.7/site-packages/Bio/Sequencing/Ace.pyÚ__init__C   s
    zrd.__init__N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r	   r	   r	   r
   r   =   s   r   c               @   s   e Zd ZdZddd„ZdS )ÚqazIQA (read quality), including which part if any was used as the consensus.Nc             C   s`   d| _ d| _d| _d| _|r\| ¡ }t|d ƒ| _ t|d ƒ| _t|d ƒ| _t|d ƒ| _dS )zInitialize the class.Né   é   é   é   )Zqual_clipping_startZqual_clipping_endZalign_clipping_startZalign_clipping_endÚsplitÚint)r   ÚlineÚheaderr	   r	   r
   r   O   s    zqa.__init__)N)r   r   r   r   r   r	   r	   r	   r
   r   L   s   r   c               @   s   e Zd ZdZddd„ZdS )Údsz:DS lines, include file name of a read's chromatogram file.Nc          	      sØ   d| _ d| _d| _d| _d| _d| _d| _ˆ rÔdddddddg}‡ fd	d
„|D ƒ}tt||ƒƒ}d|krn|d= t	|ƒ}x\t||dd… t
ˆ ƒd g ƒD ]8\}}t| ||  ¡ ˆ |t
|| ƒ d |…  ¡ ƒ q˜W dS )zInitialize the class.r   ZCHROMAT_FILEZPHD_FILEZTIMEZCHEMZDYEÚTEMPLATEZ	DIRECTIONc                s   g | ]}ˆ   |¡‘qS r	   )Úfind)Ú.0Úx)r   r	   r
   ú
<listcomp>s   s    zds.__init__.<locals>.<listcomp>éÿÿÿÿr   N)Zchromat_fileZphd_fileZtimeZchemZdyeÚtemplateÚ	directionÚdictÚzipÚsortedÚlenÚsetattrÚlowerÚstrip)r   r   ZtagsZpossZtagposZpsZp1Zp2r	   )r   r
   r   `   s2    *
zds.__init__)N)r   r   r   r   r   r	   r	   r	   r
   r   ]   s   r   c               @   s   e Zd ZdZddd„ZdS )ÚafzðAF lines, define the location of the read within the contig.

    Note attribute coru is short for complemented (C) or uncomplemented (U),
    since the strand information is stored in an ACE file using either the
    C or U character.
    Nc             C   sD   d| _ d| _d| _|r@| ¡ }|d | _ |d | _t|d ƒ| _dS )zInitialize the class.r   Nr   r   r   )r   ZcoruÚpadded_startr   r   )r   r   r   r	   r	   r
   r   ˆ   s    

zaf.__init__)N)r   r   r   r   r   r	   r	   r	   r
   r)   €   s   r)   c               @   s   e Zd ZdZddd„ZdS )ÚbszKBS (base segment), which read was chosen as the consensus at each position.Nc             C   sH   d| _ d| _d| _|rD| ¡ }t|d ƒ| _t|d ƒ| _|d | _ dS )zInitialize the class.r   Nr   r   r   )r   r*   Ú
padded_endr   r   )r   r   r   r	   r	   r
   r   —   s    zbs.__init__)N)r   r   r   r   r   r	   r	   r	   r
   r+   ”   s   r+   c               @   s   e Zd ZdZddd„ZdS )Úrtz<RT (transient read tags), generated by crossmatch and phrap.Nc             C   s~   d| _ d| _d| _d| _d| _d| _g | _|rz| ¡ }|d | _ |d | _|d | _t|d ƒ| _t|d ƒ| _|d | _dS )	zInitialize the class.r   Né    r   r   r   r   é   )	r   Útag_typeÚprogramr*   r,   ÚdateÚcommentr   r   )r   r   r   r	   r	   r
   r   ¦   s    


zrt.__init__)N)r   r   r   r   r   r	   r	   r	   r
   r-   £   s   r-   c               @   s   e Zd ZdZddd„ZdS )ÚctzCT (consensus tags).Nc             C   s    d| _ d| _d| _d| _d| _d| _d| _g | _g | _|rœ| 	¡ }|d | _ |d | _|d | _t
|d ƒ| _t
|d ƒ| _|d | _t|ƒd	krœ|d
 | _dS )zInitialize the class.r   Nr.   r   r   r   r   r/   é   é   )r   r0   r1   r*   r,   r2   ZnotransÚinfor3   r   r   r%   )r   r   r   r	   r	   r
   r   ¼   s&    



zct.__init__)N)r   r   r   r   r   r	   r	   r	   r
   r4   ¹   s   r4   c               @   s   e Zd ZdZddd„ZdS )ÚwazGWA (whole assembly tag), holds the assembly program name, version, etc.Nc             C   sF   d| _ d| _d| _g | _|rB| ¡ }|d | _ |d | _|d | _dS )zInitialize the class.r   r.   r   r   N)r0   r1   r2   r7   r   )r   r   r   r	   r	   r
   r   Ö   s    

zwa.__init__)N)r   r   r   r   r   r	   r	   r	   r
   r8   Ó   s   r8   c               @   s   e Zd ZdZddd„ZdS )Úwrz	WR lines.Nc             C   sP   d| _ d| _d| _g | _|rL| ¡ }|d | _ |d | _|d | _|d | _dS )zInitialize the class.r   r.   r   r   r   N)r   Zalignedr1   r2   r   )r   r   r   r	   r	   r
   r   æ   s    


zwr.__init__)N)r   r   r   r   r   r	   r	   r	   r
   r9   ã   s   r9   c               @   s   e Zd ZdZddd„ZdS )ÚReadsz8Holds information about a read supporting an ACE contig.Nc             C   sr   d| _ d| _d| _d| _d| _|rnt ƒ | _ | ¡ }|d | j _t|d ƒ| j _t|d ƒ| j _	t|d ƒ| j _
dS )zInitialize the class.Nr   r   r   r   )r   r   r   r-   r9   r   r   r   r   r   r   )r   r   r   r	   r	   r
   r   ÷   s    zReads.__init__)N)r   r   r   r   r   r	   r	   r	   r
   r:   ô   s   r:   c               @   s   e Zd ZdZddd„ZdS )ÚContigz4Holds information about a contig from an ACE record.Nc             C   s–   d| _ d| _d| _d| _d| _d| _g | _g | _g | _g | _	d| _
d| _|r’| ¡ }|d | _ t|d ƒ| _t|d ƒ| _t|d ƒ| _|d | _dS )zInitialize the class.r   Nr   r   r   r   r/   )r   ZnbasesÚnreadsZ	nsegmentsZuorcr   Úqualityr)   r+   Úreadsr4   r8   r   r   )r   r   r   r	   r	   r
   r   
  s&    
zContig.__init__)N)r   r   r   r   r   r	   r	   r	   r
   r;     s   r;   c             c   sl  yt | ƒ}W n0 tk
r<   | }| d¡dkr8tdƒd‚Y nX zd}xyx| d¡rZP t|ƒ}qNW W n tk
r|   dS X t|ƒ}x(|D ] }| ¡ }|sžP | j	|7  _	qŒW x|D ]}| ¡ r¶P q¶W | d¡sÚtdƒ‚x0|D ](}| ¡ sîP |j
 dd	„ | ¡ D ƒ¡ qàW x|D ]}| ¡ rP qW xR| d
¡s8P |j t|ƒ¡ yt|ƒ}W n  tk
rt   tdƒd‚Y nX q*W x@| ¡ rŠP yt|ƒ}W n  tk
r¶   tdƒd‚Y nX q~W xR| d¡sÎP |j t|ƒ¡ yt|ƒ}W n  tk
r
   tdƒd‚Y nX qÀW x0y"x| d¡r(P t|ƒ}qW W n  tk
rX   tdƒd‚Y nX |j t|ƒ¡ x4|D ],}| ¡ }|s„P |jd j j	|7  _	qpW x|D ]}| ¡ r¦P q¦W | d¡sÐtdƒ‚t|ƒ|jd _x|D ]}| ¡ ræP qæW P | d¡rt|ƒ|jd _d}xy x| ¡ r2P t|ƒ}q&W W n tk
rZ   P Y nX | d¡rH|jd jdkr†g |jd _x¸|D ]°}| ¡ }| d¡r|dd…  ¡ rÖ|jd jd j |dd… ¡ xb|D ]4}| ¡ }| d¡röP |jd jd j |¡ qÜW n$|dkr$P n|jd j t|ƒ¡ qŒW d}q"| d¡r´|jd jdkrrg |jd _x8|D ]0}| ¡ }|dkrP |jd j t|ƒ¡ qxW d}q"| d¡rL|jdkrÒg |_yt|ƒ}W n  tk
rþ   tdƒd‚Y nX |j t|ƒ¡ x4|D ],}| ¡ }|dkr.P |jd j |¡ qW d}nà| d¡r*|jdkrjg |_yt|ƒ}W n  tk
r–   tdƒd‚Y nX |j t|ƒ¡ xz|D ]r}| ¡ }|dkrþxX|D ].}| ¡ }| d¡räP |jd j |¡ qÊW n |dkrP n|jd j |¡ q®W d}nP q"W | d¡sP qW |V  qJW W d|| k	rf| ¡  X dS ) ak  Iterate of ACE file contig by contig.

    Argument source is a file-like object or a path to a file.

    This function returns an iterator that allows you to iterate
    over the ACE file record by record::

        records = parse(source)
        for record in records:
            # do something with the record

    where each record is a Contig object.
    r.   r   z&Ace files must be opened in text mode.NZCOZBQzFailed to find BQ linec             s   s   | ]}t |ƒV  qd S )N)r   )r   r   r	   r	   r
   ú	<genexpr>S  s    zparse.<locals>.<genexpr>zAF zUnexpected end of AF blockzUnexpected end of filezBS zFailed to find end of BS blockzRD zFailed to find RD liner   zQA zFailed to find QA linezDS zRT{zCOMMENT{é   zC}Ú}zWR{zWA{zFailed to read WA blockzCT{zFailed to read CT blockZRD)ÚopenÚ	TypeErrorÚreadÚ
ValueErrorÚ
startswithÚnextÚStopIterationr;   r(   r   r=   Úextendr   r)   Úappendr+   r>   r:   r   r   r   r-   r3   Úendswithr9   r8   r7   r4   Úclose)ÚsourceÚhandler   Úrecordr	   r	   r
   Úparse!  s<   







	






 
 









rP   c               @   s    e Zd ZdZdd„ Zdd„ ZdS )ÚACEFileRecordzHolds data of an ACE file.c             C   s   d| _ d| _g | _d| _dS )zInitialize the class.N)Úncontigsr<   Úcontigsr8   )r   r	   r	   r
   r   ÿ  s    zACEFileRecord.__init__c                s”  g }g }g }x*t | jƒD ]\}‰ ˆ jrD| js6g | _| j ˆ j¡ ˆ jrˆ‡ fdd„ˆ jD ƒ}x|D ]}| j| j |¡ qdW | |¡ x¬t ˆ jƒD ]ž\}‰ˆjræ‡fdd„ˆjD ƒ}x$|D ]}| j| j| j |¡ q¼W | |¡ ˆjr”‡fdd„ˆjD ƒ}	x&|	D ]}| j| j| j |¡ qW | |	¡ q”W qW xRt | jƒD ]B\}‰ xL|D ]D}
|
j	ˆ j	krX| j| jdkrˆg | j| _| j| j 
|
¡ qXW |s¬|rHxÜt ˆ jƒD ]Î\}‰x`|D ]X}|j	ˆjj	krÆ| j| j| jdkrg | j| j| _| j| j| j 
|¡ qÆW x`|D ]X}|j	ˆjj	kr(| j| j| jdkrfg | j| j| _| j| j| j 
|¡ q(W q¸W qHW dS )zRSorts wr, rt and ct tags into the appropriate contig / read instance, if possible.c                s   g | ]}|j ˆ j kr|‘qS r	   )r   )r   Úct_tag)Úcr	   r
   r     s    z&ACEFileRecord.sort.<locals>.<listcomp>c                s   g | ]}|j ˆ jj kr|‘qS r	   )r   r   )r   Úrt_tag)Úrr	   r
   r     s    c                s   g | ]}|j ˆ jj kr|‘qS r	   )r   r   )r   Úwr_tag)rW   r	   r
   r     s    N)Ú	enumeraterS   r8   rI   r4   Úremover>   r-   r9   r   rJ   r   )r   r4   r-   r9   ÚiZnewctsr   ÚjZnewrtsZnewwrsrT   rV   rX   r	   )rU   rW   r
   Úsort  sT    







zACEFileRecord.sortN)r   r   r   r   r   r]   r	   r	   r	   r
   rQ   ü  s   rQ   c             C   sŠ   t | ƒ} tƒ }yt| ƒ}W n tk
r8   tdƒd‚Y nX | d¡sLtdƒ‚| ¡ }t|d ƒ|_t|d ƒ|_	t
t| ƒƒ|_| ¡  |S )z-Parse a full ACE file into a list of contigs.zPremature end of fileNZASzFile does not start with 'AS'.r   r   )ÚiterrQ   rG   rH   rE   rF   r   r   rR   r<   ÚlistrP   rS   r]   )rN   rO   r   Zwordsr	   r	   r
   rD   6  s    
rD   N)r   r   r   r   r)   r+   r-   r4   r8   r9   r:   r;   rP   rQ   rD   r	   r	   r	   r
   Ú<module>:   s   # \: