B
    ‰°bä:  ã               @   s€  d Z ddlZddddddd	d
dddddddddgZddddddd	d
dddddddgZddddddd
dddddgZddddddd
dddddgZddddddddddd gZddddddd d!d"g	Zdddddddd d!d"g
Zd#d$„ Z	d%d&„ Z
d'd(„ Zd)d*„ Zd+d,„ Zd-d.„ Zd/d0„ Zd1d2„ Zd3d4„ Zd5d6„ Zd7d8„ Zd9d:„ Zd;d<„ Zefd=d>„Zefd?d@„ZdAdB„ ZedCkr|ddDlmZ eddE dS )Fa…  Parsers for the GAF, GPA and GPI formats from UniProt-GOA.

Uniprot-GOA README + GAF format description:
ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README

Gene Association File, GAF formats:
http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/
http://geneontology.org/docs/go-annotation-file-gaf-format-2.0/

Gene Product Association Data  (GPA format) README:
http://geneontology.org/docs/gene-product-association-data-gpad-format/

Gene Product Information (GPI format) README:
http://geneontology.org/docs/gene-product-information-gpi-format/

Go Annotation files are located here:
ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/
é    NZDBÚDB_Object_IDZDB_Object_SymbolZ	QualifierZGO_IDzDB:ReferenceZEvidenceZWithZAspectZDB_Object_NameZSynonymZDB_Object_TypeZTaxon_IDZDateZAssigned_ByZAnnotation_ExtensionZGene_Product_Form_IDzEvidence codeZInteracting_taxon_IDZAssigned_byZSpliceform_IDZECO_Evidence_codezAnnotation ExtensionZAnnotation_PropertiesZ	DB_subsetZDB_Object_SynonymZTaxonZAnnotation_Target_SetZAnnotation_CompletedZParent_Object_IDZDB_XrefZGene_Product_Propertiesc             c   sr   xl| D ]d}|d dkrq|  d¡ d¡}t|ƒdkr6q|d  d¡|d< |d  d¡|d< ttt|ƒƒV  qW d	S )
z’Read GPI 1.0 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.0 format.
    r   ú!Ú
ú	é   é   ú|é   N)ÚrstripÚsplitÚlenÚdictÚzipÚGPI10FIELDS)ÚhandleÚinlineÚinrec© r   ú.lib/python3.7/site-packages/Bio/UniProt/GOA.pyÚ_gpi10iterator•   s    
r   c             c   s–   x| D ]ˆ}|d dkrq|  d¡ d¡}t|ƒdkr6q|d  d¡|d< |d  d¡|d< |d	  d¡|d	< |d
  d¡|d
< ttt|ƒƒV  qW dS )z’Read GPI 1.1 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.1 format.
    r   r   r   r   r   é   r   é   é   r	   N)r
   r   r   r   r   ÚGPI11FIELDS)r   r   r   r   r   r   Ú_gpi11iterator¦   s    
r   c             c   s–   x| D ]ˆ}|d dkrq|  d¡ d¡}t|ƒdkr6q|d  d¡|d< |d  d¡|d< |d	  d¡|d	< |d
  d¡|d
< ttt|ƒƒV  qW dS )z’Read GPI 1.2 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.2 format.
    r   r   r   r   r   r   r   é   r	   é	   N)r
   r   r   r   r   ÚGPI12FIELDS)r   r   r   r   r   r   Ú_gpi12iterator¹   s    
r   c             C   sn   |   ¡ }| ¡ dkrt| ƒS | ¡ dkr0t| ƒS | ¡ dkrDt| ƒS | ¡ dkrZtdƒ‚ntd|› dƒ‚dS )	zèRead GPI format files.

    This function should be called to read a
    gp_information.goa_uniprot file. At the moment, there is
    only one format, but this may change, so
    this function is a placeholder a future wrapper.
    z!gpi-version: 1.2z!gpi-version: 1.1z!gpi-version: 1.0z!gpi-version: 2.1z1Sorry, parsing GPI version 2 not implemented yet.zUnknown GPI version r   N)ÚreadlineÚstripr   r   r   ÚNotImplementedErrorÚ
ValueError)r   r   r   r   r   Úgpi_iteratorÌ   s    
r#   c             c   s–   x| D ]ˆ}|d dkrq|  d¡ d¡}t|ƒdkr6q|d  d¡|d< |d  d¡|d< |d	  d¡|d	< |d
  d¡|d
< ttt|ƒƒV  qW dS )zÈRead GPA 1.0 format files (PRIVATE).

    This iterator is used to read a gp_association.*
    file which is in the GPA 1.0 format. Do not call directly. Rather,
    use the gpaiterator function.
    r   r   r   r   r   r   r   r   é   é
   N)r
   r   r   r   r   ÚGPA10FIELDS)r   r   r   r   r   r   Ú_gpa10iteratorå   s    
r'   c             c   s–   x| D ]ˆ}|d dkrq|  d¡ d¡}t|ƒdkr6q|d  d¡|d< |d  d¡|d< |d	  d¡|d	< |d
  d¡|d
< ttt|ƒƒV  qW dS )zÑRead GPA 1.1 format files (PRIVATE).

    This iterator is used to read a gp_association.goa_uniprot
    file which is in the GPA 1.1 format. Do not call directly. Rather
    use the gpa_iterator function
    r   r   r   r   r   r   r   r   r$   r%   N)r
   r   r   r   r   ÚGPA11FIELDS)r   r   r   r   r   r   Ú_gpa11iteratorù   s    
r)   c             C   sD   |   ¡ }| ¡ dkrt| ƒS | ¡ dkr0t| ƒS td|› dƒ‚dS )zÁRead GPA format files.

    This function should be called to read a
    gene_association.goa_uniprot file. Reads the first record and
    returns a gpa 1.1 or a gpa 1.0 iterator as needed
    z!gpa-version: 1.1z!gpa-version: 1.0zUnknown GPA version r   N)r   r    r)   r'   r"   )r   r   r   r   r   Úgpa_iterator  s    r*   c             c   s¨   x¢| D ]š}|d dkrq|  d¡ d¡}t|ƒdkr6q|d  d¡|d< |d  d¡|d< |d	  d¡|d	< |d
  d¡|d
< |d  d¡|d< ttt|ƒƒV  qW d S )Nr   r   r   r   r   r   r   r   r   r%   é   )r
   r   r   r   r   ÚGAF20FIELDS)r   r   r   r   r   r   Ú_gaf20iterator  s    
r-   c             c   s¨   x¢| D ]š}|d dkrq|  d¡ d¡}t|ƒdkr6q|d  d¡|d< |d  d¡|d< |d	  d¡|d	< |d
  d¡|d
< |d  d¡|d< ttt|ƒƒV  qW d S )Nr   r   r   r   r   r   r   r   r   r%   r+   )r
   r   r   r   r   ÚGAF10FIELDS)r   r   r   r   r   r   Ú_gaf10iterator.  s    
r/   c             c   sð   d }g }xâ| D ]Ú}|d dkr q|  d¡ d¡}t|ƒdkr>q|d  d¡|d< |d  d¡|d< |d	  d¡|d	< |d
  d¡|d
< |d  d¡|d< ttt|ƒƒ}|d |krÖ|rÖt |¡}|g}|d }|V  q|d }| |¡ qW d S )Nr   r   r   r   r   r   r   r   r   r%   r+   r   )r
   r   r   r   r   r.   ÚcopyÚappend)r   Úcur_idÚid_rec_listr   r   Úcur_recÚret_listr   r   r   Ú_gaf10byproteiniterator=  s*    

r6   c             c   sð   d }g }xâ| D ]Ú}|d dkr q|  d¡ d¡}t|ƒdkr>q|d  d¡|d< |d  d¡|d< |d	  d¡|d	< |d
  d¡|d
< |d  d¡|d< ttt|ƒƒ}|d |krÖ|rÖt |¡}|g}|d }|V  q|d }| |¡ qW d S )Nr   r   r   r   r   r   r   r   r   r%   r+   r   )r
   r   r   r   r   r,   r0   r1   )r   r2   r3   r   r   r4   r5   r   r   r   Ú_gaf20byproteiniteratorV  s*    

r7   c             C   sX   |   ¡ }| ¡ dkrt| ƒS | ¡ dkr0t| ƒS | ¡ dkrDt| ƒS td|› dƒ‚dS )a¤  Iterate over records in a gene association file.

    Returns a list of all consecutive records with the same DB_Object_ID
    This function should be called to read a
    gene_association.goa_uniprot file. Reads the first record and
    returns a gaf 2.0 or a gaf 1.0 iterator as needed
    2016-04-09: added GAF 2.1 iterator & fixed bug in iterator assignment
    In the meantime GAF 2.1 uses the GAF 2.0 iterator
    z!gaf-version: 2.0z!gaf-version: 1.0z!gaf-version: 2.1zUnknown GAF version r   N)r   r    r7   r6   r"   )r   r   r   r   r   Úgafbyproteiniteratoro  s    
r8   c             C   sX   |   ¡ }| ¡ dkrt| ƒS | ¡ dkr0t| ƒS | ¡ dkrDt| ƒS td|› dƒ‚dS )a  Iterate over a GAF 1.0 or 2.0 file.

    This function should be called to read a
    gene_association.goa_uniprot file. Reads the first record and
    returns a gaf 2.0 or a gaf 1.0 iterator as needed

    Example: open, read, interat and filter results.

    Original data file has been trimed to ~600 rows.

    Original source ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/YEAST/goa_yeast.gaf.gz

    >>> from Bio.UniProt.GOA import gafiterator, record_has
    >>> Evidence = {'Evidence': set(['ND'])}
    >>> Synonym = {'Synonym': set(['YA19A_YEAST', 'YAL019W-A'])}
    >>> Taxon_ID = {'Taxon_ID': set(['taxon:559292'])}
    >>> with open('UniProt/goa_yeast.gaf', 'r') as handle:
    ...     for rec in gafiterator(handle):
    ...         if record_has(rec, Taxon_ID) and record_has(rec, Evidence) and record_has(rec, Synonym):
    ...             for key in ('DB_Object_Name', 'Evidence', 'Synonym', 'Taxon_ID'):
    ...                 print(rec[key])
    ...
    Putative uncharacterized protein YAL019W-A
    ND
    ['YA19A_YEAST', 'YAL019W-A']
    ['taxon:559292']
    Putative uncharacterized protein YAL019W-A
    ND
    ['YA19A_YEAST', 'YAL019W-A']
    ['taxon:559292']
    Putative uncharacterized protein YAL019W-A
    ND
    ['YA19A_YEAST', 'YAL019W-A']
    ['taxon:559292']

    z!gaf-version: 2.0z!gaf-version: 2.1z!gaf-version: 1.0zUnknown GAF version r   N)r   r    r-   r/   r"   )r   r   r   r   r   Úgafiteratorˆ  s    %r9   c             C   sŠ   d}xb|dd… D ]R}t | | tƒrTx| | D ]}||d 7 }q.W |dd… d }q|| | d 7 }qW || |d  d 7 }| |¡ dS )zÚWrite a single UniProt-GOA record to an output stream.

    Caller should know the  format version. Default: gaf-2.0
    If header has a value, then it is assumed this is the first record,
    a header is written.
    Ú Néÿÿÿÿr   r   r   )Ú
isinstanceÚlistÚwrite)Úoutrecr   ÚfieldsZoutstrÚfieldZsubfieldr   r   r   Úwriterec¼  s    rB   c             C   s    x| D ]}t |||d qW dS )aO  Write a list of GAF records to an output stream.

    Caller should know the  format version. Default: gaf-2.0
    If header has a value, then it is assumed this is the first record,
    a header is written. Typically the list is the one read by fafbyproteinrec, which
    contains all consecutive lines with the same DB_Object_ID
    )r@   N)rB   )Z
outprotrecr   r@   r?   r   r   r   ÚwritebyproteinrecÏ  s    
rC   c             C   sN   d}xD|D ]<}t | | tƒr(| | h}nt| | ƒ}||| @ r
d}P q
W |S )zÜAccept a record, and a dictionary of field values.

    The format is {'field_name': set([val1, val2])}.
    If any field in the record has  a matching value, the function returns
    True. Otherwise, returns False.
    FT)r<   ÚstrÚset)r   Z	fieldvalsZretvalrA   Zset1r   r   r   Ú
record_hasÛ  s    
rF   Ú__main__)Úrun_doctest)Úverbose)Ú__doc__r0   r,   r.   r&   r(   r   r   r   r   r   r   r#   r'   r)   r*   r-   r/   r6   r7   r8   r9   rB   rC   rF   Ú__name__Z
Bio._utilsrH   r   r   r   r   Ú<module>   sÔ   4
