B
    ‰°bh  ã               @   sŠ   d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 ddl	mZ G d	d
„ d
eƒZG dd„ de
ƒZedkr†ddlmZ eƒ  dS )a  Bio.AlignIO support for "stockholm" format (used in the PFAM database).

You are expected to use this module via the Bio.AlignIO functions (or the
Bio.SeqIO functions if you want to work directly with the gapped sequences).

For example, consider a Stockholm alignment file containing the following::

    # STOCKHOLM 1.0
    #=GC SS_cons       .................<<<<<<<<...<<<<<<<........>>>>>>>..
    AP001509.1         UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
    #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
    AE007476.1         AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
    #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----

    #=GC SS_cons       ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
    AP001509.1         CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
    #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
    AE007476.1         UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
    #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
    //

This is a single multiple sequence alignment, so you would probably load this
using the Bio.AlignIO.read() function:

    >>> from Bio import AlignIO
    >>> align = AlignIO.read("Stockholm/simple.sth", "stockholm")
    >>> print(align)
    Alignment with 2 rows and 104 columns
    UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
    AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
    >>> for record in align:
    ...     print("%s %i" % (record.id, len(record)))
    AP001509.1 104
    AE007476.1 104

In addition to the sequences themselves, this example alignment also includes
some GR lines for the secondary structure of the sequences.  These are
strings, with one character for each letter in the associated sequence:

    >>> for record in align:
    ...     print(record.id)
    ...     print(record.seq)
    ...     print(record.letter_annotations['secondary_structure'])
    AP001509.1
    UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
    -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
    AE007476.1
    AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
    -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------

Any general annotation for each row is recorded in the SeqRecord's annotations
dictionary.  Any per-column annotation for the entire alignment in in the
alignment's column annotations dictionary, such as the secondary structure
consensus in this example:

    >>> sorted(align.column_annotations.keys())
    ['secondary_structure']
    >>> align.column_annotations["secondary_structure"]
    '.................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............'

You can output this alignment in many different file formats
using Bio.AlignIO.write(), or the MultipleSeqAlignment object's format method:

    >>> print(format(align, "fasta"))
    >AP001509.1
    UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-A
    GGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
    >AE007476.1
    AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAA
    GGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
    <BLANKLINE>

Most output formats won't be able to hold the annotation possible in a
Stockholm file:

    >>> print(format(align, "stockholm"))
    # STOCKHOLM 1.0
    #=GF SQ 2
    AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
    #=GS AP001509.1 AC AP001509.1
    #=GS AP001509.1 DE AP001509.1
    #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
    AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
    #=GS AE007476.1 AC AE007476.1
    #=GS AE007476.1 DE AE007476.1
    #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
    #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............
    //
    <BLANKLINE>

Note that when writing Stockholm files, AlignIO does not break long sequences
up and interleave them (as in the input file shown above).  The standard
allows this simpler layout, and it is more likely to be understood by other
tools.

Finally, as an aside, it can sometimes be useful to use Bio.SeqIO.parse() to
iterate over the alignment rows as SeqRecord objects - rather than working
with Alignnment objects.

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("Stockholm/simple.sth", "stockholm"):
    ...     print(record.id)
    ...     print(record.seq)
    ...     print(record.letter_annotations['secondary_structure'])
    AP001509.1
    UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
    -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
    AE007476.1
    AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
    -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------

Remember that if you slice a SeqRecord, the per-letter-annotations like the
secondary structure string here, are also sliced:

    >>> sub_record = record[10:20]
    >>> print(sub_record.seq)
    AUCGUUUUAC
    >>> print(sub_record.letter_annotations['secondary_structure'])
    -------<<<

Likewise with the alignment object, as long as you are not dropping any rows,
slicing specific columns of an alignment will slice any per-column-annotations:

    >>> align.column_annotations["secondary_structure"]
    '.................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............'
    >>> part_align = align[:,10:20]
    >>> part_align.column_annotations["secondary_structure"]
    '.......<<<'

You can also see this in the Stockholm output of this partial-alignment:

    >>> print(format(part_align, "stockholm"))
    # STOCKHOLM 1.0
    #=GF SQ 2
    AP001509.1 UCAACACUCU
    #=GS AP001509.1 AC AP001509.1
    #=GS AP001509.1 DE AP001509.1
    #=GR AP001509.1 SS -------<<<
    AE007476.1 AUCGUUUUAC
    #=GS AE007476.1 AC AE007476.1
    #=GS AE007476.1 DE AE007476.1
    #=GR AE007476.1 SS -------<<<
    #=GC SS_cons .......<<<
    //
    <BLANKLINE>

é    )ÚOrderedDict)ÚMultipleSeqAlignment)ÚSeq)Ú	SeqRecordé   )ÚAlignmentIterator)ÚSequentialAlignmentWriterc               @   sJ   e Zd ZdZdddddddd	œZd
ddœZddddœZdd„ Zdd„ ZdS )ÚStockholmWriterz Stockholm/PFAM alignment writer.ÚSSÚSAÚTMÚPPÚLIÚASÚIN)Úsecondary_structureÚsurface_accessibilityÚtransmembraneÚposterior_probabilityÚligand_bindingÚactive_siteÚintronÚRFÚMM)Úreference_annotationÚ
model_maskÚOSÚOCÚLO)ÚorganismÚorganism_classificationÚlookc             C   sð   t |ƒ}| ¡ | _g | _|dkr(tdƒ‚| jdkr:tdƒ‚| j d¡ | j d| ¡ x|D ]}|  |¡ q\W |jràxjt	|j 
¡ ƒD ]X\}}|| jkr²| j d| j| |f ¡ q„|| jkr„| j d| j| d |f ¡ q„q„W | j d¡ d	S )
zóUse this to write (another) single alignment to an open file.

        Note that sequences and their annotation are recorded
        together (rather than having a block of annotation followed
        by a block of aligned sequences).
        r   zMust have at least one sequencez Non-empty sequences are requiredz# STOCKHOLM 1.0
z#=GF SQ %i
z#=GC %s %s
Ú_consz//
N)ÚlenZget_alignment_lengthÚ_length_of_sequencesÚ_ids_writtenÚ
ValueErrorÚhandleÚwriteÚ_write_recordÚcolumn_annotationsÚsortedÚitemsÚpfam_gc_mappingÚpfam_gr_mapping)ÚselfÚ	alignmentÚcountÚrecordÚkÚv© r5   ú6lib/python3.7/site-packages/Bio/AlignIO/StockholmIO.pyÚwrite_alignment¸   s(    




zStockholmWriter.write_alignmentc          	   C   sb  | j t|jƒkrtdƒ‚|j}|jdk	rHd|jkrH|j|jd krH|j}| dd¡}d|jkr°d|jkr°d|jd |jd f }|t|ƒ d… |kr°d	||jd |jd f }|| jkrÆtd
| ƒ‚| j 	|¡ | j
 d||jf ¡ d|jkr| j
 d||  |jd ¡f ¡ n$|jr:| j
 d||  |j¡f ¡ |jr^| j
 d||  |j¡f ¡ x*|jD ] }| j
 d||  |¡f ¡ qfW x\|j ¡ D ]N\}}|| jkr–|  t|ƒ¡}|râ| j
 d||  | j| ¡|f ¡ n q–W xt|j ¡ D ]f\}}|| jkrôtt|ƒƒt|jƒkrô|  t|ƒ¡}|rX| j
 d||  | j| ¡|f ¡ n qôW dS )z/Write a single SeqRecord to the file (PRIVATE).z%Sequences must all be the same lengthNÚ	accessionú Ú_ÚstartÚendz/%s-%sz%s/%s-%szDuplicate record identifier: %sz%s %s
z#=GS %s AC %s
z#=GS %s DE %s
z#=GS %s DR %s
z#=GS %s %s %s
z#=GR %s %s %s
)r$   r#   Úseqr&   ÚidÚnameÚannotationsÚreplacer%   Úappendr'   r(   ZcleanÚdescriptionÚdbxrefsr,   Úpfam_gs_mappingÚstrÚletter_annotationsr.   )r/   r2   Zseq_nameÚsuffixZxrefÚkeyÚvalueÚdatar5   r5   r6   r)   Ü   s`    


 $zStockholmWriter._write_recordN)	Ú__name__Ú
__module__Ú__qualname__Ú__doc__r.   r-   rE   r7   r)   r5   r5   r5   r6   r	   ¥   s   
$r	   c               @   s^   e Zd ZdZdddddddd	œZd
ddœZddddœZdZdd„ Zdd„ Z	dd„ Z
dd„ ZdS )ÚStockholmIteratoraá  Loads a Stockholm file from PFAM into MultipleSeqAlignment objects.

    The file may contain multiple concatenated alignments, which are loaded
    and returned incrementally.

    This parser will detect if the Stockholm file follows the PFAM
    conventions for sequence specific meta-data (lines starting #=GS
    and #=GR) and populates the SeqRecord fields accordingly.

    Any annotation which does not follow the PFAM conventions is currently
    ignored.

    If an accession is provided for an entry in the meta data, IT WILL NOT
    be used as the record.id (it will be recorded in the record's
    annotations).  This is because some files have (sub) sequences from
    different parts of the same accession (differentiated by different
    start-end positions).

    Wrap-around alignments are not supported - each sequences must be on
    a single line.  However, interlaced sequences should work.

    For more information on the file format, please see:
    http://sonnhammer.sbc.su.se/Stockholm.html
    https://en.wikipedia.org/wiki/Stockholm_format
    http://bioperl.org/formats/alignment_formats/Stockholm_multiple_alignment_format.html

    For consistency with BioPerl and EMBOSS we call this the "stockholm"
    format.
    r   r   r   r   r   r   r   )r
   r   r   r   r   r   r   r   r   )r   r   r   r    r!   )r   r   r   Nc             C   s  | j }| jdkr| ¡ }n| j}d| _|s.t‚| ¡ dkrBtdƒ‚i }tƒ }i }i }i }i }d}	x | ¡ }|srP | ¡ }|dkrŒ|| _P qd|dkršd}	qd|dkr¤qd|d d	kr(|	rºt‚d
d„ | dd¡D ƒ}
t	|
ƒdkrètd| ƒ‚|
\}}||krd||< | 
|d¡ ||  | dd¡7  < qdt	|ƒdkrd|dd… dkrˆ|dd…  ¡  dd¡\}}||krx|g||< n||  |¡ qd|dd… dkrÞ|dd…  ¡  dd¡\}}||krÈd||< ||  | ¡ 7  < qd|dd… dkrŽy"|dd…  ¡  dd¡\}}}W n6 tk
rH   |dd…  ¡  dd¡\}}d}Y nX ||kr\i ||< ||| krz|g|| |< n|| |  |¡ qd|dd… dkrd|dd…  ¡  dd¡\}}}||krÎi ||< ||| krèd|| |< || |  | ¡ 7  < qdW t	|ƒt	|ƒkst‚| ¡ | _|| _|| _|| _|rü|rü| jdk	rt| jt	|ƒkrttdt	|ƒ| jf ƒ‚t	t| ¡ ƒd ƒ}g }xž|D ]–}|| }|t	|ƒkr´tdƒ‚|  |¡\}}}tt|ƒ|||d|id}||jd< |dk	rü||jd< |dk	r||jd< |  ||¡ | |¡ q’W x:| ¡ D ].\}}t	|ƒ|kr6td|t	|ƒ|f ƒ‚q6W t|ƒ}x€t| ¡ ƒD ]p\}}|| jkr¤||j| j| < nH| d¡rÞ|dd… | j krÞ||j| j |dd…  < n||jd | < q~W ||_!|S t‚dS )!z)Parse the next alignment from the handle.Nz# STOCKHOLM 1.0zDid not find STOCKHOLM headerFz//TÚ r   ú#c             S   s   g | ]}|  ¡ ‘qS r5   )Ústrip)Ú.0Úxr5   r5   r6   ú
<listcomp>™  s    z.StockholmIterator.__next__.<locals>.<listcomp>r9   r   é   z3Could not split line into identifier and sequence:
Ú.ú-é   z#=GF z#=GC z#=GS z#=GR z5Found %i records in this alignment, told to expect %iz8Sequences have different lengths, or repeated identifierr8   )r>   r?   rC   r@   r;   r<   z%s length %i, expected %ir"   éûÿÿÿzGC:)"r'   Ú_headerÚreadlineÚStopIterationrS   r&   r   ÚAssertionErrorÚsplitr#   Ú
setdefaultrA   rB   ÚkeysÚidsÚ	sequencesÚseq_annotationÚseq_col_annotationZrecords_per_alignmentÚlistÚvaluesÚ_identifier_splitr   r   r@   Ú_populate_meta_datar,   r   r+   r-   r*   Úendswithr.   Z_annotations)r/   r'   ÚlineZseqsrc   ZgsZgrZgfÚgcZpassed_end_alignmentÚpartsZseq_idr=   ÚfeatureÚtextZalignment_lengthZrecordsr?   r;   r<   r2   r3   r4   r0   r5   r5   r6   Ú__next__h  sä    





"









 zStockholmIterator.__next__c             C   sf   d|kr\|  dd¡\}}| d¡dkr\y | d¡\}}|t|ƒt|ƒfS  tk
rZ   Y nX |ddfS )zDReturn (name, start, end) string tuple from an identifier (PRIVATE).ú/r   rY   N)Úrsplitr1   r`   Úintr&   )r/   Ú
identifierr?   Z	start_endr;   r<   r5   r5   r6   ri     s    z#StockholmIterator._identifier_splitc       
   	   C   sz   |   |¡\}}}||kr |g}n||g}i }xH|D ]@}y&x || D ]}	|| |	 ||	< qBW W q2 tk
rp   Y q2X q2W |S )aÓ  Take an itentifier and returns dict of all meta-data matching it (PRIVATE).

        For example, given "Q9PN73_CAMJE/149-220" will return all matches to
        this or "Q9PN73_CAMJE" which the identifier without its /start-end
        suffix.

        In the example below, the suffix is required to match the AC, but must
        be removed to match the OS and OC meta-data::

            # STOCKHOLM 1.0
            #=GS Q9PN73_CAMJE/149-220  AC Q9PN73
            ...
            Q9PN73_CAMJE/149-220               NKA...
            ...
            #=GS Q9PN73_CAMJE OS Campylobacter jejuni
            #=GS Q9PN73_CAMJE OC Bacteria

        This function will return an empty dictionary if no data is found.
        )ri   ÚKeyError)
r/   ru   Z	meta_dictr?   r;   r<   Zidentifier_keysZanswerZidentifier_keyZfeature_keyr5   r5   r6   Ú_get_meta_data,  s    

z StockholmIterator._get_meta_datac             C   s
  |   || j¡}x¨|D ] }|dkrHt|| ƒdks4t‚|| d |jd< q|dkrbd || ¡|_q|dkrv|| |_q|| jkrœd || ¡|j| j| < qd || ¡|jd	| < qW |   || j	¡}x>|D ]6}|| j
krð|| |j| j
| < qÌ|| |jd
| < qÌW dS )z~Add meta-date to a SecRecord's annotations dictionary (PRIVATE).

        This function applies the PFAM conventions.
        ZACr   r   r8   ZDEÚ
ZDRz, zGS:zGR:N)rw   re   r#   r_   r@   ÚjoinrC   rD   rE   rf   r.   rG   )r/   ru   r2   Zseq_dataro   Zseq_col_datar5   r5   r6   rj   N  s&    



z%StockholmIterator._populate_meta_data)rL   rM   rN   rO   r.   r-   rE   r\   rq   ri   rw   rj   r5   r5   r5   r6   rP   7  s    
 8"rP   Ú__main__)Úrun_doctestN)rO   Úcollectionsr   Z	Bio.Alignr   ZBio.Seqr   ZBio.SeqRecordr   Z
Interfacesr   r   r	   rP   rL   Z
Bio._utilsr{   r5   r5   r5   r6   Ú<module>š   s      >