B
    bd7                 @   sH   d Z ddlmZ ddlmZ ddlmZ ddlmZ G dd deZ	d	S )
ai  Bio.AlignIO support for GCG MSF format.

The file format was produced by the GCG PileUp and and LocalPileUp tools,
and later tools such as T-COFFEE and MUSCLE support it as an optional
output format.

The original GCG tool would write gaps at ends of each sequence which could
be missing data as tildes (``~``), whereas internal gaps were periods (``.``)
instead. This parser replaces both with minus signs (``-``) for consistency
with the rest of ``Bio.AlignIO``.

You are expected to use this module via the Bio.AlignIO functions (or the
Bio.SeqIO functions if you want to work directly with the gapped sequences).
    )MultipleSeqAlignment)Seq)	SeqRecord   )AlignmentIteratorc               @   s   e Zd ZdZdZdd ZdS )MsfIteratorzGCG MSF alignment iterator.Nc          	      s  | j }| jdkr| }n| j}d| _|s.tdddg}|  d |krntd|  d d|f x|rd|kr| }qpW |std	|d
 }|d}||d  dks|d dks|d dkrtd| yt	||d  }W n tk
r   d}Y nX |dk r.td||d   ||d  }|dkrPtd| g }g }	g }
g }| }x.|r| dkr| }| 
drld|krd|krd|kr||dd d  }|d\}}|d\}}|d\}}| }|dr|dd }||kr0td| d |krFtd!| || |	t	|  |
t	|  |t|  ntd"| qlW |std#|t|	krt|	 t fd$d%|	D }td&||t| f | }|std'| rtd(d)d* |D }d}xR||k rzxt|D ]
\}}| }|dkr| sx|r| s| }qhW |std+|  }|dkr|r|d |kryt	|d }W n tk
r   d}Y nX ||d krtd,|d |f t|dkrt|dkr.d}n,yt	|d }W n tk
rX   d}Y nX ||d- |k rr|d- n|krtd.|d |d- |k r|d- n||f | }|  }|s|	| |k rtd/|| |	| krntd0||f nL|d |kr:t|dks"t||| |dd  ntd1||f q@W |d-7 }| }| r*td2| q*W xP| }|sP n8| sn,|  d |kr|| _P ntd3| q~W d4d* |D }d5}xXtt|	|D ]F\}\}}t||k rt||krd6}|d7|t|   ||< qW |r`ddl}dd8lm} |d9| d:d% t|||D }t|}| |krtd;|| f |S )<z)Parse the next alignment from the handle.Nz!!NA_MULTIPLE_ALIGNMENTz!!AA_MULTIPLE_ALIGNMENTZPileUpr   z$%s is not a known GCG MSF header: %sz, z MSF: z6Reached end of file without MSF/Type/Check header line
zMSF:   zType:)zCheck:z
CompCheck:z..zsGCG MSF header line should be '<optional text> MSF: <int> Type: <letter> <optional date> Check: <int> ..',  not: %rr   zCGCG MSF header line should have MDF: <int> for column count, not %r   )PNz]GCG MSF header line should have 'Type: P' (protein) or 'Type: N' (nucleotide), not 'Type: %s'z//zName: z Len: z Check: z	 Weight:    z oozDuplicated ID of %r zSpace in ID %rzMalformed GCG MSF name line: %rz4End of file while looking for end of header // line.c             3   s   | ]}| krd V  qdS )r   N ).0_)
max_lengthr   0lib/python3.7/site-packages/Bio/AlignIO/MsfIO.py	<genexpr>   s    z'MsfIterator.__next__.<locals>.<genexpr>zLGCG MSF header said alignment length %i, but %s of %i sequences said Len: %sz.End of file after // line, expected sequences.z4After // line, expected blank line before sequences.c             S   s   g | ]}g qS r   r   )r   r   r   r   r   
<listcomp>   s    z(MsfIterator.__next__.<locals>.<listcomp>z*End of file where expecting sequence data.z5Expected GCG MSF coordinate line starting %i, got: %r2   z2Expected GCG MSF coordinate line %i to %i, got: %r z!Expected sequence for %s, got: %rz!Expected sequence for %r, got: %rzExpected blank line, got: %rz+Unexpected line after GCG MSF alignment: %rc             S   s&   g | ]}d  |ddddqS )r   ~-.)joinreplace)r   sr   r   r   r   -  s    FTr   )BiopythonParserWarningzGOne of more alignment sequences were truncated and have been gap paddedc             s   s.   | ]&\}}}t t||||d |idV  qdS )weight)idnameZdescriptionZannotationsN)r   r   )r   ir   wr   r   r   r   ?  s   z5GCG MSF headers said alignment length %i, but have %i)handle_headerreadlineStopIterationstripsplit
ValueErrorr   indexint
startswithendswithNotImplementedErrorappendfloatmaxsumlen	enumerateAssertionErrorextendzipwarningsZBior    warnr   Zget_alignment_length)selfr&   lineZknown_headerspartsoffsetZ
aln_lengthZseq_typeZidsZlengthsZchecksZweightsrestr#   lengthZcheckr!   Z	max_countZseqsZcompleted_lengthidxZwordsr$   Zpaddedr   r;   r    ZrecordsZalignr   )r   r   __next__"   s:   


'












zMsfIterator.__next__)__name__
__module____qualname____doc__r'   rD   r   r   r   r   r      s   r   N)
rH   Z	Bio.Alignr   ZBio.Seqr   ZBio.SeqRecordr   Z
Interfacesr   r   r   r   r   r   <module>   s
   