B
    ‰°bà  ã               @   sz  d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 dd	lm
Z
 dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ G dd„ deƒZG dd„ deƒZG dd„ deƒZG dd„ deƒZG dd„ deƒZd/dd„Zdd„ Zdd„ ZG d d!„ d!eƒZG d"d#„ d#eƒZG d$d%„ d%eƒZG d&d'„ d'eƒZd(d)„ Zd*d+„ Ze d,krvdd-l!m"Z" e"dd. dS )0aÊ  Bio.SeqIO support for the "genbank" and "embl" file formats.

You are expected to use this module via the Bio.SeqIO functions.
Note that internally this module calls Bio.GenBank to do the actual
parsing of GenBank, EMBL and IMGT files.

See Also:
International Nucleotide Sequence Database Collaboration
http://www.insdc.org/

GenBank
http://www.ncbi.nlm.nih.gov/Genbank/

EMBL Nucleotide Sequence Database
http://www.ebi.ac.uk/embl/

DDBJ (DNA Data Bank of Japan)
http://www.ddbj.nig.ac.jp/

IMGT (use a variant of EMBL format with longer feature indents)
http://imgt.cines.fr/download/LIGM-DB/userman_doc.html
http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html
http://www.ebi.ac.uk/imgt/hla/docs/manual.html

é    N)Údatetime)ÚBiopythonWarning)Ú
SeqFeature)ÚSeqIO)Ú_ImgtScanner)ÚEmblScanner)ÚGenBankScanner)ÚUndefinedSequenceError)Ú
UnknownSeqé   )Ú_get_seq_string)ÚSequenceIterator)ÚSequenceWriterc                   s(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚGenBankIteratorzParser for GenBank files.c                s   t ƒ j|ddd dS )aã  Break up a Genbank file into SeqRecord objects.

        Argument source is a file-like object opened in text mode or a path to a file.
        Every section from the LOCUS line to the terminating // becomes
        a single SeqRecord with associated annotation and features.

        Note that for genomes or chromosomes, there is typically only
        one record.

        This gets called internally by Bio.SeqIO for the GenBank file format:

        >>> from Bio import SeqIO
        >>> for record in SeqIO.parse("GenBank/cor6_6.gb", "gb"):
        ...     print(record.id)
        ...
        X55053.1
        X62281.1
        M81224.1
        AJ237582.1
        L31939.1
        AF297471.1

        Equivalently,

        >>> with open("GenBank/cor6_6.gb") as handle:
        ...     for record in GenBankIterator(handle):
        ...         print(record.id)
        ...
        X55053.1
        X62281.1
        M81224.1
        AJ237582.1
        L31939.1
        AF297471.1

        ÚtÚGenBank)ÚmodeÚfmtN)ÚsuperÚ__init__)ÚselfÚsource)Ú	__class__© ú0lib/python3.7/site-packages/Bio/SeqIO/InsdcIO.pyr   =   s    %zGenBankIterator.__init__c             C   s   t dd |¡}|S )z9Start parsing the file, and return a SeqRecord generator.r   )Údebug)r   Úparse_records)r   ÚhandleÚrecordsr   r   r   Úparsed   s    zGenBankIterator.parse)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   Ú__classcell__r   r   )r   r   r   :   s   'r   c                   s(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚEmblIteratorzParser for EMBL files.c                s   t ƒ j|ddd dS )aS  Break up an EMBL file into SeqRecord objects.

        Argument source is a file-like object opened in text mode or a path to a file.
        Every section from the LOCUS line to the terminating // becomes
        a single SeqRecord with associated annotation and features.

        Note that for genomes or chromosomes, there is typically only
        one record.

        This gets called internally by Bio.SeqIO for the EMBL file format:

        >>> from Bio import SeqIO
        >>> for record in SeqIO.parse("EMBL/epo_prt_selection.embl", "embl"):
        ...     print(record.id)
        ...
        A00022.1
        A00028.1
        A00031.1
        A00034.1
        A00060.1
        A00071.1
        A00072.1
        A00078.1
        CQ797900.1

        Equivalently,

        >>> with open("EMBL/epo_prt_selection.embl") as handle:
        ...     for record in EmblIterator(handle):
        ...         print(record.id)
        ...
        A00022.1
        A00028.1
        A00031.1
        A00034.1
        A00060.1
        A00071.1
        A00072.1
        A00078.1
        CQ797900.1

        r   ÚEMBL)r   r   N)r   r   )r   r   )r   r   r   r   m   s    +zEmblIterator.__init__c             C   s   t dd |¡}|S )z9Start parsing the file, and return a SeqRecord generator.r   )r   )r   r   )r   r   r   r   r   r   r   š   s    zEmblIterator.parse)r    r!   r"   r#   r   r   r$   r   r   )r   r   r%   j   s   -r%   c                   s(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚImgtIteratorzParser for IMGT files.c                s   t ƒ j|ddd dS )au  Break up an IMGT file into SeqRecord objects.

        Argument source is a file-like object opened in text mode or a path to a file.
        Every section from the LOCUS line to the terminating // becomes
        a single SeqRecord with associated annotation and features.

        Note that for genomes or chromosomes, there is typically only
        one record.
        r   ZIMGT)r   r   N)r   r   )r   r   )r   r   r   r   £   s    
zImgtIterator.__init__c             C   s   t dd |¡}|S )z9Start parsing the file, and return a SeqRecord generator.r   )r   )r   r   )r   r   r   r   r   r   r   ¯   s    zImgtIterator.parse)r    r!   r"   r#   r   r   r$   r   r   )r   r   r'       s   r'   c                   s(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚGenBankCdsFeatureIteratorzDParser for GenBank files, creating a SeqRecord for each CDS feature.c                s   t ƒ j|ddd dS )ah  Break up a Genbank file into SeqRecord objects for each CDS feature.

        Argument source is a file-like object opened in text mode or a path to a file.

        Every section from the LOCUS line to the terminating // can contain
        many CDS features.  These are returned as with the stated amino acid
        translation sequence (if given).
        r   r   )r   r   N)r   r   )r   r   )r   r   r   r   ¸   s    	z"GenBankCdsFeatureIterator.__init__c             C   s   t dd |¡S )z9Start parsing the file, and return a SeqRecord generator.r   )r   )r   Úparse_cds_features)r   r   r   r   r   r   Ã   s    zGenBankCdsFeatureIterator.parse)r    r!   r"   r#   r   r   r$   r   r   )r   r   r(   µ   s   r(   c                   s(   e Zd ZdZ‡ fdd„Zdd„ Z‡  ZS )ÚEmblCdsFeatureIteratorzAParser for EMBL files, creating a SeqRecord for each CDS feature.c                s   t ƒ j|ddd dS )ae  Break up a EMBL file into SeqRecord objects for each CDS feature.

        Argument source is a file-like object opened in text mode or a path to a file.

        Every section from the LOCUS line to the terminating // can contain
        many CDS features.  These are returned as with the stated amino acid
        translation sequence (if given).
        r   r&   )r   r   N)r   r   )r   r   )r   r   r   r   Ë   s    	zEmblCdsFeatureIterator.__init__c             C   s   t dd |¡S )z9Start parsing the file, and return a SeqRecord generator.r   )r   )r   r)   )r   r   r   r   r   r   Ö   s    zEmblCdsFeatureIterator.parse)r    r!   r"   r#   r   r   r$   r   r   )r   r   r*   È   s   r*   c                sî   t | tjƒrd| jˆ   S t | tjƒrDd| jˆ  | j| j ˆ  f S t | tjƒrnd| jˆ  | j| j ˆ  f S t | tjƒrˆd| jˆ   S t | tjƒr¢d| jˆ   S t | tj	ƒrÌdd 
‡ fdd	„| jD ƒ¡ S t | tjƒrâtd
ƒ‚ntdƒ‚dS )zƒBuild a GenBank/EMBL position string (PRIVATE).

    Use offset=1 to add one to convert a start position from python counting.
    z%iz(%i.%i)z(%i^%i)z<%iz>%iz
one-of(%s)ú,c             3   s   | ]}t |ˆ ƒV  qd S )N)Ú_insdc_feature_position_string)Ú.0Úp)Úoffsetr   r   ú	<genexpr>ò   s    z1_insdc_feature_position_string.<locals>.<genexpr>z)Please report this as a bug in Biopython.z&Expected a SeqFeature position object.N)Ú
isinstancer   ÚExactPositionÚpositionZWithinPositionÚ	extensionZBetweenPositionZBeforePositionZAfterPositionZOneOfPositionÚjoinZposition_choicesZAbstractPositionÚNotImplementedErrorÚ
ValueError)Úposr/   r   )r/   r   r,   Û   s(    
r,   c             C   sl  | j rd| j  }nd}| jr t‚t| jtjƒr~t| jtjƒr~| jj| jjkr~| jj|krdd||f S d|| jj| jjd f S t| jtjƒr¾t| jtjƒr¾| jjd | jjkr¾d|| jjf S t| jtj	ƒsÜt| jtj	ƒrJt| jtj	ƒrt| jtj	ƒrt
dƒ‚nBt| jtj	ƒr,d|| jt| jƒf S d	|t| jdƒ| jd f S n|t| jdƒ d
 t| jƒ S d S )Nz%s:Ú z%s%i^1z%s%i^%ir   z%s%izFeature with unknown locationz	%s<%i..%sz	%s%s..>%iz..)ÚrefZref_dbÚAssertionErrorr1   Ústartr   r2   Úendr3   ZUnknownPositionr7   Únofuzzy_endr,   Únofuzzy_start)ÚlocationÚ
rec_lengthr:   r   r   r   Ú6_insdc_location_string_ignoring_strand_and_subfeaturesú   s:    


rB   c          	      sœ   yb| j }| jdkr>d| jd ‡ fdd„|ddd… D ƒ¡f S d| jd ‡ fdd„|D ƒ¡f S W n4 tk
r–   t| ˆ ƒ}| jdkrŽd	| S |S Y nX dS )
aY  Build a GenBank/EMBL location from a (Compound) FeatureLocation (PRIVATE).

    There is a choice of how to show joins on the reverse complement strand,
    GenBank used "complement(join(1,10),(20,100))" while EMBL used to use
    "join(complement(20,100),complement(1,10))" instead (but appears to have
    now adopted the GenBank convention). Notice that the order of the entries
    is reversed! This function therefore uses the first form. In this situation
    we expect the CompoundFeatureLocation and its parts to all be marked as
    strand == -1, and to be in the order 19:100 then 0:10.
    éÿÿÿÿzcomplement(%s(%s))r+   c             3   s   | ]}t |ˆ ƒV  qd S )N)rB   )r-   r.   )rA   r   r   r0   K  s   z)_insdc_location_string.<locals>.<genexpr>Nz%s(%s)c             3   s   | ]}t |ˆ ƒV  qd S )N)Ú_insdc_location_string)r-   r.   )rA   r   r   r0   T  s    zcomplement(%s))ÚpartsZstrandÚoperatorr5   ÚAttributeErrorrB   )r@   rA   rE   Zlocr   )rA   r   rD   8  s"    

 
rD   c               @   sd   e Zd ZdZdZdZde ZdZdZddd	„Z	d
d„ Z
dd„ Zeddd„ƒZedd„ ƒZdd„ ZdS )Ú_InsdcWriterz2Base class for GenBank and EMBL writers (PRIVATE).éP   é   ú z     %s                )Z	anticodonZcitationZcodon_startZcompareÚ	directionZestimated_lengthZmod_baseÚnumberZrpt_typeZrpt_unit_rangeZtag_peptideZtransl_exceptZtransl_tableNc             C   sj  |d kr"| j  d| j|f ¡ d S t|ƒtkr:| dd¡}|d kr`t|tƒsV|| jkr\d}nd}|rvd| j||f }nd| j||f }t	|ƒ| j
kr¨| j  |d ¡ d S x¼| ¡ rdt	|ƒ| j
krÖ| j  |d ¡ d S x8ttt	|ƒd	 | j
ƒ| jd	 d
ƒD ]}|| dkrúP qúW || dkr$| j
}|| j
ks4t‚| j  |d |… d ¡ | j||d …  ¡  }qªW d S )Nz%s/%s
ú"z""FTz
%s/%s="%s"z%s/%s=%sÚ
r   rC   rK   )r   ÚwriteÚQUALIFIER_INDENT_STRÚtypeÚstrÚreplacer1   ÚintÚFTQUAL_NO_QUOTEÚlenÚ	MAX_WIDTHÚlstripÚrangeÚminÚQUALIFIER_INDENTr;   )r   ÚkeyÚvalueZquoteÚlineÚindexr   r   r   Ú_write_feature_qualifierx  s:    $z%_InsdcWriter._write_feature_qualifierc             C   sz   | j | j }t|ƒ|kr|S |d|…  d¡}|dkrJt d| t¡ |S |d|d … d | j |  ||d d… ¡ S )z@Split a feature location into lines (break at commas) (PRIVATE).Nr+   rC   zCouldn't split location:
%sr   rO   )	rX   r\   rW   ÚrfindÚwarningsÚwarnr   rQ   Ú_wrap_location)r   r@   Úlengthr`   r   r   r   re   £  s    z_InsdcWriter._wrap_locationc       	      C   s¨   |j st|ƒ‚t|j|ƒ}|j  dd¡}| j| d| j… |  |¡ d }| j 	|¡ xL|j
 ¡ D ]>\}}t|ttfƒr”x&|D ]}|  ||¡ q~W qb|  ||¡ qbW dS )z=Write a single SeqFeature object to features table (PRIVATE).rK   Ú_NrO   )rR   r;   rD   r@   rT   ÚQUALIFIER_INDENT_TMPr\   re   r   rP   Z
qualifiersÚitemsr1   ÚlistÚtuplera   )	r   ÚfeatureZrecord_lengthr@   Zf_typer_   r]   Úvaluesr^   r   r   r   Ú_write_featureµ  s    "
z_InsdcWriter._write_featureÚ.Fc             C   sZ   y| j | }W n tk
r"   |S X t|tƒrN|sBt|ƒdksBt‚t|d ƒS t|ƒS dS )a  Get an annotation dictionary entry (as a string) (PRIVATE).

        Some entries are lists, in which case if just_first=True the first entry
        is returned.  If just_first=False (default) this verifies there is only
        one entry before returning it.
        r   r   N)ÚannotationsÚKeyErrorr1   rj   rW   r;   rS   )Úrecordr]   ÚdefaultÚ
just_firstÚanswerr   r   r   Ú_get_annotation_strÊ  s    
z _InsdcWriter._get_annotation_strc             C   sÖ   |   ¡ } t| ƒ|kr| gS |  ¡ }d} x>|rdt| ƒd t|d ƒ |krd| d| d¡ 7 } |   ¡ } q(W | g}x\|rÈ| d¡} x>|rºt| ƒd t|d ƒ |krº| d| d¡ 7 } |   ¡ } q~W | | ¡ qnW |rÒt‚|S )z«Return a list of strings (PRIVATE).

        Any single words which are too long get returned as a whole line
        (e.g. URLs) without an exception or warning.
        r9   r   r   rK   )ÚstriprW   ÚsplitÚpopÚappendr;   )ÚtextÚmax_lenÚwordsru   r   r   r   Ú_split_multi_lineÝ  s"    "
"z_InsdcWriter._split_multi_linec             C   s¬   |j  dd¡}t|ttfƒr&d |¡}|  |¡}g }xr|r¦t|ƒ|kr|d|d …  d¡}|dkrlt	dƒ‚|d|d … ||d d…  }}n
|d }}| 
|¡ q6W |S )z5Return a list of strings, splits on commas (PRIVATE).Úcontigr9   Nr   r+   rC   zCould not break up CONTIG)rp   Úgetr1   rj   rk   r5   ZcleanrW   rb   r7   rz   )r   rr   r|   r   ru   r8   r{   r   r   r   Ú_split_contigú  s    

$
z_InsdcWriter._split_contig)NN)ro   F)r    r!   r"   r#   rX   r\   rQ   rh   rV   ra   re   rn   Ústaticmethodrv   r~   r   r   r   r   r   rH   a  s   
+rH   c               @   s”   e Zd ZdZdZdZdZdZdZdZ	dZ
d	d
„ Zdd„ Zdd„ Zedd„ ƒZedd„ ƒZdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd „ Zd!S )"ÚGenBankWriterzGenBank writer.é   rJ   z-START##z-END##z :: é<   é	   c             C   sx   t |ƒ| jk st‚t |ƒ| j| j krP|r@t d||f t¡ nt d| t¡ | j d| 	| j¡| 
dd¡f ¡ dS )zrWrite single line in each GenBank record (PRIVATE).

        Used in the 'header' of each GenBank record.
        z"Annotation %r too long for %r linezAnnotation %r too longz%s%s
rO   rK   N)rW   ÚHEADER_WIDTHr;   rX   rc   rd   r   r   rP   ÚljustrT   )r   Útagr{   r   r   r   Ú_write_single_line  s    z GenBankWriter._write_single_linec             C   sN   | j | j }|  ||¡}|  ||d ¡ x |dd… D ]}|  d|¡ q6W dS )zuWrite multiple lines in each GenBank record (PRIVATE).

        Used in the 'header' of each GenBank record.
        r   r   Nr9   )rX   r‡   r~   rŠ   )r   r‰   r{   r|   Úlinesr_   r   r   r   Ú_write_multi_line.  s
    zGenBankWriter._write_multi_linec             C   s<   x6t |ƒD ]*\}}|dkr(|  ||¡ q
|  d|¡ q
W d S )Nr   r9   )Ú	enumeraterŠ   )r   r‰   Z	text_listÚir{   r   r   r   Ú_write_multi_entries:  s    z"GenBankWriter._write_multi_entriesc             C   sì   d}y| j d }W n tk
r&   |S X t|tƒrFt|ƒdkrF|d }t|tƒr^| d¡ ¡ }dddd	d
dddddddg}t|tƒrt|ƒdkr”|S y:tt	|dd … ƒ| 
|dd… ¡d t	|dd… ƒƒ W n tk
ræ   |}Y nX |S )Nz01-JAN-1980Údater   r   z%d-%b-%YZJANZFEBZMARZAPRZMAYZJUNZJULZAUGZSEPZOCTZNOVZDECé   éüÿÿÿé   é   é   )rp   rq   r1   rj   rW   r   ZstrftimeÚupperrS   rU   r`   r7   )rr   rs   r   Zmonthsr   r   r   Ú	_get_dateC  s8    
:
zGenBankWriter._get_datec             C   s~   y| j d }W n tk
r&   d}Y nX |dkr2n8dddddddœ}y|| }W n tk
rh   d}Y nX t|ƒd	kszt‚|S )
NÚdata_file_divisionÚUNK)ÚPRIÚRODÚMAMÚVRTÚINVÚPLNÚBCTÚVRLÚPHGÚSYNZUNAZESTZPATZSTSZGSSZHTGZHTCÚENVZCONZTSArŸ   rš   r›   r    )ÚFUNÚHUMÚMUSÚPROÚUNCÚXXXr“   )rp   rq   rW   r;   )rr   ÚdivisionZembl_to_gbkr   r   r   Ú_get_data_divisionf  s$    
-
z GenBankWriter._get_data_divisionc             C   s>   t dƒ}| j|ddd}|r2t |ƒ|kr2| |¡S d| S dS )z>Set the topology to 'circular', 'linear' if defined (PRIVATE).ÚcircularÚtopologyr9   )rs   rK   N)rW   rv   rˆ   )r   rr   Zmax_topology_lenr®   r   r   r   Ú_get_topology¾  s
    
zGenBankWriter._get_topologyc       
   	   C   s>  |j }|r|dkr|j}|r$|dkr4| j|ddd}t|ƒdkrlt|ƒd ttt|ƒƒƒ dkrlt d	t¡ t| ¡ ƒdkrˆt	d
| ƒ‚t|ƒdkr t dt¡ |  |dd¡}|dkr¾t	dƒ‚|rt|ƒdkr| 
dd¡ 
dd¡}t|ƒdkrt d| t¡ d}|dkrd}|dkr&d}nd}|  |¡}|  |¡}t|ƒdkr„ttt|ƒƒƒdt|ƒd  kr„|d tt|ƒƒ }nNtt|ƒƒ d¡}||t|ƒd…  }t|ƒdksÀt|ƒ‚d|ksÒt|ƒ‚t|ƒdksät‚t|ƒdksöt‚d||| d¡|||  |¡f }t|ƒdkr | ¡ }	|	d dkrFt	d | ƒ‚|	d!  ¡ dksd|	d!  ¡  ¡ ksd"|	d!  ¡  ¡ kst	d#| ƒ‚| j |¡ nšt|ƒdksºtt|ƒƒ‚|d$d%…  ¡ |tt|ƒƒgksät|ƒ‚|d%d&… d'krt	d | ƒ‚|d&d(… d)kr t	d*| ƒ‚|d(d+…  ¡ dksvd|d(d+…  ¡  ¡ ksvd"|d(d+…  ¡  ¡ ksvt	d#| ƒ‚|d+d,… dkr”t	d-| ƒ‚|d,d.…  ¡ d/kr¶t	d0| ƒ‚|d.d1… dkrÔt	d2| ƒ‚|d3d4… dkròt	d5| ƒ‚|d6d7… d8krt	d9| ƒ‚|d:d;… d8kr.t	d<| ƒ‚| j |¡ dS )=zWrite the LOCUS line (PRIVATE).z<unknown name>z<unknown id>Ú	accessionT)rt   é   r   é   zoIncreasing length of locus line to allow long name. This will result in fields that are not in usual positions.z'Invalid whitespace in %r for LOCUS linel   ÿgí] zThe sequence length is very long. The LOCUS line will be increased in length to compensate. This may cause unexpected behavior.Úmolecule_typeNz$missing molecule_type in annotationsé   zunassigned r9   zgenomic zMolecule type %r too longÚDNA)ÚproteinÚPROTEINÚaaÚbpr‘   rK   r•   r“   z!LOCUS       %s %s    %s %s %s %s
rI   )r¹   r¸   z=LOCUS line does not contain size units at expected position:
é   ÚRNAzALOCUS line does not contain valid sequence type (DNA, RNA, ...):
r„   é(   é,   )z bp z aa é/   )z   zss-zds-zms-zCLOCUS line does not have valid strand type (Single stranded, ...):
é6   é7   z2LOCUS line does not contain space at position 55:
é?   )r9   Zlinearr­   zALOCUS line does not contain valid entry (linear, circular, ...):
é@   z2LOCUS line does not contain space at position 64:
éC   éD   z2LOCUS line does not contain space at position 68:
éF   éG   ú-z6LOCUS line does not contain - at position 71 in date:
éJ   éK   z6LOCUS line does not contain - at position 75 in date:
)ÚnameÚidrv   rW   rS   rc   rd   r   rx   r7   rT   r¯   r¬   Úrjustr;   rˆ   r—   rw   r–   r   rP   Úrepr)
r   rr   ZlocusÚmol_typeÚunitsr®   r«   Zname_lengthr_   Z	splitliner   r   r   Ú_write_the_first_lineÈ  s¾     



0	*




z#GenBankWriter._write_the_first_linec             C   s6  d}x*|j d D ]}t|tjƒs&q|d7 }t|ƒ}|jr’t|jƒdkr’|j  d¡}|rhd|krhd}nd}|d||jd jd |jd j	f 7 }|  
d	|¡ |jr²|  d
|j¡ |jrÆ|  d|j¡ |jrÚ|  d|j¡ |jrî|  d|j¡ |jr|  d|j¡ |jr|  d|j¡ |jr|  d|j¡ qW d S )Nr   Ú
referencesr   r³   r¶   ZresiduesÚbasesz  (%s %i to %i)Z	REFERENCEz	  AUTHORSz	  CONSRTMz  TITLEz	  JOURNALz	  MEDLINEz	   PUBMEDz  REMARK)rp   r1   r   Ú	ReferencerS   r@   rW   r€   r?   r>   rŠ   ÚauthorsrŒ   ÚconsrtmÚtitleÚjournalZ
medline_idÚ	pubmed_idÚcomment)r   rr   rM   r:   Údatar³   rÏ   r   r   r   Ú_write_referencesg  s<    zGenBankWriter._write_referencesc             C   st  g }d|j krä|j d }d}xB| ¡ D ]6\}}x,| ¡ D ] \}}t|ƒ|krTt|ƒn|}q8W q&W x‚| ¡ D ]v\}}| d|› | j› ¡ x@| ¡ D ]4\}}d|t|ƒ  }	| |› |	› | j› |› ¡ q’W | d|› | j› ¡ qjW d|j kr<|j d }t|tƒr|| 	d¡7 }n&t|t
tfƒr4|t
|ƒ7 }ntdƒ‚|  d|d ¡ x"|d	d … D ]}
|  d
|
¡ qZW d S )NÚstructured_commentr   z##rK   rÙ   rO   z'Could not understand comment annotationÚCOMMENTr   r9   )rp   ri   rW   rz   ÚSTRUCTURED_COMMENT_STARTÚSTRUCTURED_COMMENT_DELIMÚSTRUCTURED_COMMENT_ENDr1   rS   rx   rj   rk   r7   rŒ   )r   rr   r‹   rÙ   Zpaddingr]   rÚ   ZsubkeyZsubdataZspacesr_   r   r   r   Ú_write_comment“  s0    

 
zGenBankWriter._write_commentc             C   sN   | j | j }|  ||¡}|  d|d ¡ x |dd … D ]}|  d|¡ q6W d S )NZCONTIGr   r   r9   )rX   r‡   r   rŠ   )r   rr   r|   r‹   r{   r   r   r   Ú_write_contig·  s
    zGenBankWriter._write_contigc          	   C   s  t |jtƒrd }n&yt|ƒ}W n tk
r6   d }Y nX |d krfd|jkrV|  |¡ n| j d¡ d S | 	¡ }t
|ƒ}| j d¡ x|td|| jƒD ]j}| j t|d ƒ | j¡¡ x<t|t|| j |ƒdƒD ] }| j d|||d …  ¡ qÌW | j d¡ q’W d S )Nr   zORIGIN
r   r   é
   z %srO   )r1   Úseqr
   r   r	   rp   râ   r   rP   ÚlowerrW   rZ   ÚLETTERS_PER_LINErS   rÌ   ÚSEQUENCE_INDENTr[   )r   rr   rÚ   Úseq_lenÚline_numberr}   r   r   r   Ú_write_sequence¾  s(    

 zGenBankWriter._write_sequencec             C   sd  | j }|  |¡ |j}| d¡dkrP|| d¡d d…  ¡ rP|j dd¡d }| j|d|dd}|}|j |d ¡r®y"d|t	|j dd¡d ƒf }W n t
k
r¬   Y nX | j|d	dd}|j}|d
krÐd}|d7 }|  d|¡ |  d|¡ |dkr|  dd||f ¡ n|  dd| ¡ g }x0|jD ]&}	d|	krF|	 dd¡}	| |	¡ q,W |  d|¡ ~y(d |jd ¡}
|
 d¡sŠ|
d7 }
W n tk
r¦   d}
Y nX |  d|
¡ d|jkr |jd }t|tƒrôt|ƒdksìt|ƒ‚|d }|  d|¡ |  d|  |d¡¡ |  |d¡}t|ƒ| j| j krR|d| j| j d … d }|  d|¡ y(d |jd ¡}| d¡s„|d7 }W n tk
r    d}Y nX |  d|¡ d |jkrä|jd  }t|tƒrØ|d }|  d!|¡ d"|jkrú|  |¡ d#|jksd$|jkr|  |¡ | d%¡ t|ƒ}x|jD ]}|   ||¡ q6W |  !|¡ | d&¡ dS )'z)Write a single record to the output file.ro   r   Nr   r°   T)rt   z%s.%iÚgiz<unknown description>r9   Z
DEFINITIONZ	ACCESSIONZVERSIONz	%s  GI:%sz%sz: ú:ZDBLINKz; ÚkeywordsZKEYWORDSÚsegmentZSEGMENTZSOURCEr   Úorganismrº   z...z
  ORGANISMÚtaxonomyÚ	db_sourceZDBSOURCErÑ   rÙ   rÜ   z)FEATURES             Location/Qualifiers
z//
)"r   rÐ   rË   Úcountr`   Úisdigitrx   rv   Ú
startswithrU   r7   ÚdescriptionrŒ   rŠ   ÚdbxrefsrT   rz   r   r5   rp   Úendswithrq   r1   rj   rW   r;   rX   r‡   rÛ   rá   rP   Úfeaturesrn   rê   )r   rr   r   rs   r°   Zacc_with_versionrë   ÚdescrZdbxrefs_with_spaceÚxrí   rî   Zorgrð   rñ   rA   rl   r   r   r   Úwrite_recordß  s’    
(









zGenBankWriter.write_recordN)r    r!   r"   r#   r‡   r\   rÞ   rà   rß   ræ   rç   rŠ   rŒ   r   r‚   r—   r¬   r¯   rÐ   rÛ   rá   râ   rê   rû   r   r   r   r   rƒ     s*   	#X
  ,$!rƒ   c               @   s˜   e Zd ZdZdZdZdded   ZdZdZd	Z	d
Z
e	e
 Zd	Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zedd„ ƒZdd„ Zdd„ Zdd„ Zdd„ ZdS ) Ú
EmblWriterzEMBL writer.é   rJ   ÚFTrK   r•   zFT   %s                z,FH   Key             Location/Qualifiers
FH
rã   r”   c             C   s6   | j | j }|  ||¡}x|D ]}|  d|¡ qW d S )NZCO)rX   r‡   r   rŠ   )r   rr   r|   r‹   r{   r   r   r   râ   t  s    
zEmblWriter._write_contigc          	   C   sJ  | j }t|jtƒrd }n&yt|ƒ}W n tk
r<   d }Y nX |d krjd|jkr\|  |¡ n
| d¡ d S | 	¡ }t
|ƒ}|j d¡}|d k	rd|kr| d¡| d¡ }| d¡| d¡ }| d	¡| d
¡ }| d¡| d¡ }	||| | |	  }
| d|||||	|
f ¡ n
| d¡ x”td|| j ƒD ]€}| d¡ xDt| jƒD ]6}| j| | j|  }| d|||| j …  ¡ qPW | t|d | j ƒ | j¡¡ | d¡ q6W || j rF|| j }| d¡ xJt| jƒD ]<}| j| | j|  }| d|||| j …   d¡¡ qæW | t|ƒ | j¡¡ | d¡ d S )Nr   zSQ   
r³   rµ   ÚAÚaÚCÚcÚGÚgÚTr   z7SQ   Sequence %i BP; %i A; %i C; %i G; %i T; %i other;
r   z    z %sr   rO   r‘   )r   r1   rä   r
   r   r	   rp   râ   rP   rå   rW   r€   rò   rZ   ræ   ÚBLOCKS_PER_LINEÚLETTERS_PER_BLOCKrS   rÌ   ÚPOSITION_PADDINGrˆ   )r   rr   r   rÚ   rè   r³   Za_countZc_countZg_countZt_countÚotherré   Úblockr`   r   r   r   rê   z  sV    




"


$zEmblWriter._write_sequencec             C   sN   t |ƒdkst‚|d | }t |ƒ| jkr:t d| t¡ | j |d ¡ d S )Nr•   z   zLine %r too longrO   )rW   r;   rX   rc   rd   r   r   rP   )r   r‰   r{   r_   r   r   r   rŠ   ½  s
    zEmblWriter._write_single_linec             C   s6   | j | j }|  ||¡}x|D ]}|  ||¡ qW d S )N)rX   r‡   r~   rŠ   )r   r‰   r{   r|   r‹   r_   r   r   r   rŒ   Ä  s    
zEmblWriter._write_multi_linec       
      C   s|  d|j krX|j  dd¡d  ¡ rXd|j  dd¡d  }| j|d|j  dd¡d dd}nd}| j|d|j dd}d	|kr„td
| ƒ‚d|kr˜td| ƒ‚| j|ddd}|j d¡}|dkrÄtdƒ‚|dkrÜt d| t	¡ | 
¡ }d|kròd}n0d|krd}n d|krd}d}ntd| ƒ‚|  |¡}| j}	|  dd|||||t|ƒ|f ¡ |	 d¡ |  d|d	 ¡ |	 d¡ dS )z$Write the ID and AC lines (PRIVATE).ro   r   zSV r°   r   T)rt   r9   ú;z.Cannot have semi-colon in EMBL accession, '%s'rK   z*Cannot have spaces in EMBL accession, '%s'r®   )rs   r³   Nz$missing molecule_type in annotations)rµ   r»   r¶   zNon-standard molecule type: %srµ   ZBPr»   r·   ZAAz'failed to understand molecule_type '%s'ZIDz%s; %s; %s; %s; ; %s; %i %s.zXX
ZAC)rË   Úrsplitró   rv   r7   rp   r€   rc   rd   r   r–   r¬   r   rŠ   rW   rP   )
r   rr   Úversionr°   r®   rÎ   Zmol_type_upperrÏ   r«   r   r   r   r   Ú_write_the_first_linesÊ  sH     



	
z!EmblWriter._write_the_first_linesc             C   sv   y| j d }W n tk
r&   d}Y nX |dkr2n0dddœ}y|| }W n tk
r`   d}Y nX t|ƒdksrt‚|S )Nr˜   r©   )r¢   r¤   r¥   r¦   rž   rœ   r   r§   rŸ   r¨   r›   r£   ZTGNr©   r¡   rª   r¨   )r    r™   r“   )rp   rq   rW   r;   )rr   r«   Zgbk_to_emblr   r   r   r¬   
  s    
&

zEmblWriter._get_data_divisionc             C   s0   x|j d D ]}|  d|¡ qW | j d¡ d S )Nrí   ZKWzXX
)rp   rŠ   r   rP   )r   rr   Úkeywordr   r   r   Ú_write_keywordsE  s    zEmblWriter._write_keywordsc             C   sþ   d}xô|j d D ]æ}t|tjƒs"q|d7 }|  dd| ¡ |jrvt|jƒdkrv|  dd|jd jd |jd jf ¡ |j	rŽ|  dd	|j	 ¡ |j
r¦|  d
d|j
 ¡ |jr¾|  d|jd ¡ |jrÖ|  dd|j ¡ |jrê|  d|j¡ | j d¡ qW d S )Nr   rÑ   r   ZRNz[%i]ZRPz%i-%iZRXzPUBMED; %s.ZRGz%sZRAr  ZRTz"%s";ZRLzXX
)rp   r1   r   rÓ   rŠ   r@   rW   r?   r>   rØ   rÕ   rÔ   rŒ   rÖ   r×   r   rP   )r   rr   rM   r:   r   r   r   rÛ   N  s,     zEmblWriter._write_referencesc             C   sn   |j d }t|tƒr | d¡}nt|ttfƒr4|}ntdƒ‚|sDd S x|D ]}|  d|¡ qJW | j 	d¡ d S )NrÙ   rO   z'Could not understand comment annotationZCCzXX
)
rp   r1   rS   rx   rj   rk   r7   rŒ   r   rP   )r   rr   rÙ   r‹   r_   r   r   r   rá   o  s    


zEmblWriter._write_commentc             C   s  | j }|  |¡ xft|jƒD ]X}| d¡rN|  d|dd… d ¡ | d¡ P | d¡r|  d|d ¡ | d¡ P qW |j}|dkrŠd	}|  d
|¡ | d¡ d|j	kr´|  
|¡ |  d|  |d¡¡ yd |j	d ¡d	 }W n tk
rø   d	}Y nX |  d|¡ | d¡ d|j	kr&|  |¡ d|j	kr<|  |¡ | | j¡ t|ƒ}x|jD ]}|  ||¡ qXW | d¡ |  |¡ | d¡ dS )z)Write a single record to the output file.zBioProject:ZPRr“   Nr  zXX
zProject:z<unknown description>ro   ZDErí   ZOSrï   z; rð   ZOCrÑ   rÙ   z//
)r   r  Úsortedrö   rô   rŠ   rP   rõ   rŒ   rp   r  rv   r5   rq   rÛ   rá   ÚFEATURE_HEADERrW   rø   rn   rê   )r   rr   r   Zxrefrù   rð   rA   rl   r   r   r   rû   „  sH    













zEmblWriter.write_recordN)r    r!   r"   r#   r‡   r\   rQ   rh   r  r  r  ræ   r  râ   rê   rŠ   rŒ   r  r‚   r¬   r  rÛ   rá   rû   r   r   r   r   rü   e  s(   C@;	!rü   c               @   s0   e Zd ZdZdZdZdded   ZdZdZd	S )
Ú
ImgtWriterz"IMGT writer (EMBL format variant).rý   é   rþ   rK   r•   zFT   %s                    z0FH   Key                 Location/Qualifiers
FH
N)	r    r!   r"   r#   r‡   r\   rQ   rh   r  r   r   r   r   r  Ì  s   r  c             C   s   t ƒ j| dd}t ||d¡S )z Fast GenBank to FASTA (PRIVATE).F)Údo_featuresÚfasta)r   r   r   rP   )Úin_fileÚout_filer   r   r   r   Ú_genbank_convert_fastaÖ  s    r  c             C   s   t ƒ j| dd}t ||d¡S )zFast EMBL to FASTA (PRIVATE).F)r  r  )r   r   r   rP   )r  r  r   r   r   r   Ú_embl_convert_fastaÝ  s    r  Ú__main__)Úrun_doctest)Úverbose)r   )#r#   rc   r   ZBior   r   r   ZBio.GenBank.Scannerr   r   r   ZBio.Seqr	   r
   Z
Interfacesr   r   r   r   r%   r'   r(   r*   r,   rB   rD   rH   rƒ   rü   r  r  r  r    Z
Bio._utilsr  r   r   r   r   Ú<module>   sJ   06
>) 1    X  i

