B
    b[4                 @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddlm
Z
 ed	Zed
ZdZdd ZG dd de
ZG dd de	ZdS )aU  Bio.AlignIO support for "xmfa" output from Mauve/ProgressiveMauve.

You are expected to use this module via the Bio.AlignIO functions (or the
Bio.SeqIO functions if you want to work directly with the gapped sequences).

For example, consider a progressiveMauve alignment file containing the following::

    #FormatVersion Mauve1
    #Sequence1File	a.fa
    #Sequence1Entry	1
    #Sequence1Format	FastA
    #Sequence2File	b.fa
    #Sequence2Entry	2
    #Sequence2Format	FastA
    #Sequence3File	c.fa
    #Sequence3Entry	3
    #Sequence3Format	FastA
    #BackboneFile	three.xmfa.bbcols
    > 1:0-0 + a.fa
    --------------------------------------------------------------------------------
    --------------------------------------------------------------------------------
    --------------------------------------------------------------------------------
    > 2:5417-5968 + b.fa
    TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACGTGAGAGGAGCGCCCTAAGCTTTGGGAAATTCAAGC-
    --------------------------------------------------------------------------------
    CTGGAACGTACTTGCTGGTTTCGCTACTATTTCAAACAAGTTAGAGGCCGTTACCTCGGGCGAACGTATAAACCATTCTG
    > 3:9476-10076 - c.fa
    TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-GGGAGGAGATCGCCCCAAACGTATGGTGAGTCGGGCG
    TTTCCTATAGCTATAGGACCAATCCACTTACCATACGCCCGGCGTCGCCCAGTCCGGTTCGGTACCCTCCATGACCCACG
    ---------------------------------------------------------AAATGAGGGCCCAGGGTATGCTT
    =
    > 2:5969-6015 + b.fa
    -----------------------
    GGGCGAACGTATAAACCATTCTG
    > 3:9429-9476 - c.fa
    TTCGGTACCCTCCATGACCCACG
    AAATGAGGGCCCAGGGTATGCTT

This is a multiple sequence alignment with multiple aligned sections, so you
would probably load this using the Bio.AlignIO.parse() function:

    >>> from Bio import AlignIO
    >>> align = AlignIO.parse("Mauve/simple_short.xmfa", "mauve")
    >>> alignments = list(align)
    >>> for aln in alignments:
    ...     print(aln)
    ...
    Alignment with 3 rows and 240 columns
    --------------------------------------------...--- a.fa
    TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACG...CTG b.fa/5416-5968
    TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-G...CTT c.fa/9475-10076
    Alignment with 2 rows and 46 columns
    -----------------------GGGCGAACGTATAAACCATTCTG b.fa/5968-6015
    TTCGGTACCCTCCATGACCCACGAAATGAGGGCCCAGGGTATGCTT c.fa/9428-9476

Additional information is extracted from the XMFA file and available through
the annotation attribute of each record::

    >>> for record in alignments[0]:
    ...     print(record.id, len(record))
    ...     print("  start: %d, end: %d, strand: %d" %(
    ...         record.annotations['start'], record.annotations['end'],
    ...         record.annotations['strand']))
    ...
    a.fa 240
      start: 0, end: 0, strand: 1
    b.fa/5416-5968 240
      start: 5416, end: 5968, strand: 1
    c.fa/9475-10076 240
      start: 9475, end: 10076, strand: -1

    N)MultipleSeqAlignment)Seq)	SeqRecord   )AlignmentIterator)SequentialAlignmentWriterzG> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>.*)z]> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>[^#]*) # (?P<realname>.*)z<> {seq_name}:{start}-{end} {strand} {filename} # {ugly_hack}c             C   s8   |  d\}}}tt| d\}}|d8 }||||fS )zDReturn (name, start, end) string tuple from an identifier (PRIVATE).:-r   )splitmapint)Z
identifieridZlocstrandstartend r   2lib/python3.7/site-packages/Bio/AlignIO/MauveIO.py_identifier_splitb   s    r   c                   s2   e Zd ZdZ fddZdd Zd	ddZ  ZS )
MauveWriterzMauve/XMFA alignment writer.c                s   t  j|| d| _d| _dS )zInitialize the class.FN)super__init___wrote_header_wrote_first)selfargskwargs)	__class__r   r   r   m   s    zMauveWriter.__init__c             C   s   t |}| | _|dkr"td| jdkr4td| jsxd| _| jd x*td|d D ]}| jd||f  q\W x"t|D ]\}}| j	||d qW | jd	 d
S )zUse this to write (another) single alignment to an open file.

        Note that sequences and their annotation are recorded
        together (rather than having a block of annotation followed
        by a block of aligned sequences).
        r   zMust have at least one sequencez Non-empty sequences are requiredTz#FormatVersion Mauve1
r   z#Sequence%sEntry	%s
)
record_idxz=
N)
lenZget_alignment_length_length_of_sequences
ValueErrorr   handlewriterange	enumerate_write_record)r   Z	alignmentcountiidxrecordr   r   r   write_alignments   s    

zMauveWriter.write_alignmentr   c       	      C   sH  | j t|jkrtd|j}ytt|j}W n  tk
rP   t|d }Y nX d|jkrd|jkrd|jd |jd f }d|jd d |jd f }|t| d |kr|dt|  }|t| d |kr|dt|  }d|jkrVd|jkrVd|jkrVtj	||jd d |jd |jd dkr<dnd	|jd
 |j
d}d}n"tj	|ddd|jd
 |j
d}d}d|ksd|kr|s| jsDd| _tj	|ddd|jd
 |j
d}|dddd}| j|d  n`|dddd}| j|d  x:tdt|jdD ]$}| jd|j||d    qW dS )z/Write a single SeqRecord to the file (PRIVATE).z%Sequences must all be the same lengthr   r   r   z/%s-%sNr   +r	   z.fa)seq_namer   r   r   filenameZ	ugly_hackFr   Tz:0-0 z:1-0 
 z

P   z%s
)r   r   seqr    namestrr   annotationsID_LINE_FMTformatr   r   replacer!   r"   r#   )	r   r)   r   r,   Zsuffix0Zsuffix1Zid_lineZlacking_annotationsr'   r   r   r   r%      sj    


zMauveWriter._write_record)r   )__name__
__module____qualname____doc__r   r*   r%   __classcell__r   r   )r   r   r   j   s   !r   c               @   s   e Zd ZdZg Zdd ZdS )MauveIteratorzMauve xmfa alignment iterator.c          	   C   sT  | j }| }|stx|r4| dr4| }qW i }i }d}d}x4|sPP | }|drhP n
|drFt|}|st|}|std||	d}i }	xjdD ]b}
yF|	|
}|
d	krt
|}|d
kr|d8 }|
dkrt
|}||	|
< W q tk
r   Y qX qW |	||< || jkr4| j| ||d |}n,|rPt|dkrbtd||  |7  < | }qJW t|t| jkst| j| _|| _| jrL|rLtttt| }g }xt| jD ]h}||ks
t|| d
ks
t|| d
krd| }n|| }|t|kr2td||kr@q|| d	 d
ksd|| d d
krdjf || }d|| kr|| d }n|| d }||d
kr||7 }n(d|| kr|| d }n|| d }tt|||d}|| d	 |jd	< || d |jd< || d dkr*dnd|jd< || qW t|S tdS )z)Parse the next alignment from the handle.#FN=>zMalformed header line: %sr   )r   r   r   r   r3   realnamer   r   r   r    z#Saw sequence before definition liner	   z8Sequences have different lengths, or repeated identifierz/{start}-{end}rB   r3   )r   r3   r   r+   )r!   readlineStopIterationstrip
startswithXMFA_HEADER_REGEX_BIOPYTHONmatchXMFA_HEADER_REGEXr    groupr   
IndexError_idsappend
setdefaultAssertionErrorr   Zids	sequencesmaxr   listvaluesr7   r&   r   r   r5   r   )r   r!   lineZseqsZseq_regionsZpassed_end_alignmentZ	latest_idmZ	parsed_idZparsed_datakeyvalueZalignment_lengthZrecordsr   r2   suffixZcorrected_idr)   r   r   r   __next__   s    









.

$
 zMauveIterator.__next__N)r9   r:   r;   r<   rN   r[   r   r   r   r   r>      s   r>   )r<   reZ	Bio.Alignr   ZBio.Seqr   ZBio.SeqRecordr   Z
Interfacesr   r   compilerK   rI   r6   r   r   r>   r   r   r   r   <module>N   s   {