B
    bc                 @   sd   d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 dZdZdddZG dd dZd	S )zBio.SeqIO support for the "uniprot-xml" file format.

See Also:
http://www.uniprot.org

The UniProt XML format essentially replaces the old plain text file format
originally introduced by SwissProt ("swiss" format in Bio.SeqIO).

    )ElementTree)errors)
SeqFeature)Seq)	SeqRecordz{http://uniprot.org/uniprot}z4%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)NFc          
   c   s   |dk	rt dyPxJtj| ddD ]8\}}|dkr"|jtd kr"t||d V  |  q"W W nR tjk
r } z2t	j
|j t	jkr|jdkstt d	dn W dd}~X Y nX dS )
a  Iterate over UniProt XML as SeqRecord objects.

    parses an XML entry at a time from any UniProt XML file
    returns a SeqRecord for each iteration

    This generator can be used in Bio.SeqIO

    Argument source is a file-like object or a path to a file.

    Optional argument alphabet should not be used anymore.

    return_raw_comments = True --> comment fields are returned as complete XML to allow further processing
    skip_parsing_errors = True --> if parsing errors are found, skip to next entry
    Nz,The alphabet argument is no longer supported)startend)Zeventsr   entry)return_raw_comments)   r   zEmpty file.)
ValueErrorr   Z	iterparsetagNSParserparseclearZ
ParseErrorr   ZmessagescodeZXML_ERROR_NO_ELEMENTSpositionAssertionError)sourcealphabetr
   ZeventelemZ	exception r   2lib/python3.7/site-packages/Bio/SeqIO/UniprotIO.pyUniprotIterator   s    r   c               @   s"   e Zd ZdZdddZdd ZdS )	r   zParse a UniProt XML entry to a SeqRecord.

    Optional argument alphabet is no longer used.

    return_raw_comments=True to get back the complete comment field in XML format
    NFc             C   s    |dk	rt d|| _|| _dS )zInitialize the class.Nz,The alphabet argument is no longer supported)r   r	   r
   )selfr   r   r
   r   r   r   __init__D   s    zParser.__init__c                s.  j jtd kstfddfdd}fdd}fdd	}fd
d}fdd}fdd}fdd}fdd}fdd}	fdd}
fdd}d:dd  fdd}fdd }fd!d"}fd#d$}td%d%d&_j jd'd(_xBj j	 D ]2\}}|d)kr>t
|jj|< n|jj|< qW xj D ]}|jtd* kr||| qZ|jtd+ kr|| qZ|jtd, kr|| qZ|jtd- kr|| qZ|jtd. kr|| qZ|jtd/ kr|| n|jtd0 kr || n|jtd1 kr:|| n|jtd2 krT|	| n|jtd3 krn|
| n|jtd4 kr|| nh|jtd5 kr|| nN|jtd6 kr|| n4|jtd7 kr|| n|jtd8 krZ|| n qZW ttjjj_jjs(jjd9 d j_jS );zParse the input.r	   c                s>   |  j jkrg  j j| < | j j|  kr: j j|  | d S )N)ParsedSeqRecordannotationsappend)keyvalue)r   r   r   append_to_annotationsO   s    z+Parser.parse.<locals>.append_to_annotationsc                s(   | j  j_ jj jd | j   d S )N:)textr   namedbxrefsr   dbname)element)r   r   r   _parse_nameU   s    
z!Parser.parse.<locals>._parse_namec                s*    d| j  jjjd | j   d S )N
accessionsr#   )r$   r   r&   r   r'   )r(   )r"   r   r   r   _parse_accessionY   s    
z&Parser.parse.<locals>._parse_accessionc                s   d}x| D ]}|j td td td gkrxx|D ]P}d|j td|j tdf } ||j |j td kr0|s0|jj_d}q0W q
|j td	 krq
|j td
 kr
q
W dS )zParse protein names (PRIVATE).FZrecommendedNameZsubmittedNameZalternativeNamez%s_%s ZfullNameTZ	componentdomainN)r   r   replacer$   r   Zdescription)r(   Z	descr_setZprotein_elementZrec_nameann_key)r"   r   r   r   _parse_protein_   s$    



z$Parser.parse.<locals>._parse_proteinc                sb   x\| D ]T}d|j krd|jtd|j d f }|j d dkrN|jjj|< q ||j qW d S )Ntypez
gene_%s_%sr,   Zprimary)attribr   r.   r   r$   r   r   )r(   Zgenename_elementr/   )r"   r   r   r   _parse_genew   s    

z!Parser.parse.<locals>._parse_genec                s    d| j d  d S )NgeneLocationr1   )r2   )r(   )r"   r   r   _parse_geneLocation   s    z)Parser.parse.<locals>._parse_geneLocationc                s  d } }}x| D ]}|j td krd|jr|jd dkr@|j}q|jd dkrV|j}q̈ d|j q|j td krjj|jd d |jd	   q|j td
 krx&|D ]}|j td kr d|j qW qW |r|rd||f }n|r|}n|r|}|jjd< d S )Nr,   r%   r1   Z
scientificcommonorganism_namedbReferencer#   idZlineageZtaxonZtaxonomyz%s (%s)organism)r   r   r$   r2   r   r&   r   r   )r(   r7   Zcom_nameZsci_nameorganism_elementZtaxon_element)r"   r   r   r   _parse_organism   s0    

z%Parser.parse.<locals>._parse_organismc                s,   x&| D ]}|j td kr d|j qW d S )Nr%   Zorganism_host)r   r   r$   )r(   r;   )r"   r   r   _parse_organismHost   s    
z)Parser.parse.<locals>._parse_organismHostc                s    d| j  d S )Nkeywords)r$   )r(   )r"   r   r   _parse_keyword   s    z$Parser.parse.<locals>._parse_keywordc                s  ddddddddd	d
dddddddddddddg}| j d |krd| j d dd }x(| td D ]}|jrf ||j qfW n| j d dkrxZ| td D ]H}xB|D ]:}|jrd| j d dd|jtdf } ||j qW qW nx| j d d kr>x6| td! D ]$}d"| j d  } ||j d#  qW n,| j d d$krxT| td% D ]B}d&| j d dd }x$|td' D ]} ||j qW q^W n| j d d(krd| j d dd }d) }	}
x| td* D ]}t|td+ }yf|r t|d) j d+ }
|
d, }	n@tt|td- j d+ }	|	d,8 }	tt|td. j d+ }
W n t	t
fk
r|   Y nX qW | j d/ }| j d0 }|	|
  krd)krn n |d1||f  n |d2|	|
||f  n| j d d3krnx| j d d4krjxf| td5 D ]T}d| j d dd }x6|td5 D ]$} |d6| j d7 |j d8 f  q<W qW jrd9| j d dd } |t|  d:S );ac  Parse comments (PRIVATE).

            Comment fields are very heterogeneus. each type has his own (frequently mutated) schema.
            To store all the contained data, more complex data structures are needed, such as
            annotated dictionaries. This is left to end user, by optionally setting:

            return_raw_comments=True

            The original XML is returned in the annotation fields.

            Available comment types at december 2009:
             - "allergen"
             - "alternative products"
             - "biotechnology"
             - "biophysicochemical properties"
             - "catalytic activity"
             - "caution"
             - "cofactor"
             - "developmental stage"
             - "disease"
             - "domain"
             - "disruption phenotype"
             - "enzyme regulation"
             - "function"
             - "induction"
             - "miscellaneous"
             - "pathway"
             - "pharmaceutical"
             - "polymorphism"
             - "PTM"
             - "RNA editing"
             - "similarity"
             - "subcellular location"
             - "sequence caution"
             - "subunit"
             - "tissue specificity"
             - "toxic dose"
             - "online information"
             - "mass spectrometry"
             - "interaction"

            ZallergenZbiotechnologyzbiophysicochemical propertieszcatalytic activityZcautionZcofactorzdevelopmental stageZdiseaser-   zdisruption phenotypezenzyme regulationZfunctionZ	inductionZmiscellaneousZpathwayZpharmaceuticalZpolymorphismZPTMzRNA editingZ
similarityZsubunitztissue specificityz
toxic doser1   z
comment_%s r,   r$   zsubcellular locationZsubcellularLocationzcomment_%s_%sZinteractionZinteractantzcomment_%s_intactIdZintactIdzalternative productsZisoformzcomment_%s_isoformr9   zmass spectrometryr   locationr   r   beginr   massmethodzundefined:%s|%sz%s..%s:%s|%szsequence cautionzonline informationlinkz%s@%sr%   Zurizcomment_%s_xmlN)r2   r.   iterr   r$   r   listintnextr   KeyErrorr
   r   Ztostring)r(   Zsimple_commentsr/   Ztext_elementZsubloc_elementZelZinteract_elementZalt_elementZ
id_elementr   r   Zpos_elsrC   rD   Zlink_element)r"   r   r   r   _parse_comment   s    ,


 

$z$Parser.parse.<locals>._parse_commentc                s   j j| jd d | jd   d| jkr^| jd dkr^d}d}x| D ]
}|jtd krN|jd }|dkr~|jd }|d	kr|jd }|d
krN|jd d}x|D ]}| d}|d dkrt }| jd |_	| jd |j
d< ||j
d< ||j
d	< |d d|j
d
< t|d dd d }	t|d dd }
t|	|
|_qW qNW x| D ]}|jtd krdqdW d S )Nr1   r#   r9   ZPDBr,   propertyrD   r!   
resolutionZchains,=r   -r%   r   /)r   r&   r   r2   r   r   splitstripr   r1   
qualifiersrH   FeatureLocationrA   )r(   rD   rM   ref_elementZdat_typeZpairsr   Zpairfeaturer   r   )r   r   r   _parse_dbReference8  sB    






z(Parser.parse.<locals>._parse_dbReferencec                sl  t  }g }g }g }d}d}d}x| D ]}|jtd krl|jd }|dkrb|d|jd  7 }d|jkrv|jd }|jdd}|jd	d}	|jd
d}
|jdd}x|D ]}|jtd kr|j|_q|jtd krx|D ]}||jd  qW q|jtd krj	j
|jd d |jd   |jd dkrL|jd |_q|jd dkr|jd |_qW q(|jtd kr||j q(|jtd kr(x*|D ]"}|jtd kr||j qW q(W |rdd| }nd}|rdd| }nd}g |_d||_|rJ|rD|	rD|
rD|rDt||	|
||d |_n||_d||||f|_ d| d S )Nr,   Zcitationr1   Z
submissionz to the Zdbr%   ZdatevolumefirstlasttitleZ
authorListr8   r#   r9   ZPubMedZMEDLINEZscoper   ZtissuezScope: z, zTissue: )r%   rY   rZ   r[   pub_datez | Z
references)r   Z	Referencer   r   r2   getr$   r\   r   r   r&   Z	pubmed_idZ
medline_idjoinrA   authorsREFERENCE_JOURNALZjournalcomment)r(   	referencer`   ZscopesZtissuesZjournal_nameZpub_typer]   rV   Zj_volumeZj_firstZj_lastZcit_elementZperson_elementZsource_elementZ
scopes_strZtissues_str)r"   r   r   r   _parse_referenceh  sr    






z&Parser.parse.<locals>._parse_referencer   c             S   s   yt | jd | }W n tk
r.   d }Y nX | jdd}|dkrZ|d ksRtt S |sht|S |dkrzt|S |dkrt	|S |dkrt
|S td| d S )	Nr   statusr,   unknownzgreater thanz	less thanZ	uncertainzPosition status %r)rH   r2   rJ   r^   r   r   ZUnknownPositionZExactPositionZAfterPositionZBeforePositionZUncertainPositionNotImplementedError)r(   offsetr   re   r   r   r   _parse_position  s"    




z%Parser.parse.<locals>._parse_positionc          	      s$  t   }x | j D ]\}}||j|< qW | jdd|_d| jkrP| jd |_x| D ]}|jtd kr|	td }|r|d }  | d} | }n6|	td d }  | d}|	td	 d }  | }t 
|||_qVy|j|j|jtd< W qV tk
r   Y qVX qVW jj| d S )
Nr1   r,   r9   rA   r   r   rB   r   )r   r2   itemsrT   r^   r1   r9   r   r   findallrU   rA   r$   r.   	Exceptionr   Zfeaturesr   )r(   rW   kvZfeature_elementZposition_elementsZstart_positionZend_position)ri   r   r   r   _parse_feature  s0    





z$Parser.parse.<locals>._parse_featurec                s    d| j d  d S )NproteinExistencer1   )r2   )r(   )r"   r   r   _parse_proteinExistence  s    z-Parser.parse.<locals>._parse_proteinExistencec                s*   x$| j  D ]\}}|} || qW d S )N)r2   rk   )r(   rn   ro   r/   )r"   r   r   _parse_evidence  s    z%Parser.parse.<locals>._parse_evidencec                sn   xD| j  D ]6\}}|dkr2t| jjd| < q| jjd| < qW td| j  j_	d jjd< d S )N)lengthrC   versionzsequence_%sr,   proteinZmolecule_type)
r2   rk   rH   r   r   r   r_   r$   rR   seq)r(   rn   ro   )r   r   r   _parse_sequence  s    z%Parser.parse.<locals>._parse_sequencer,   )r9   ZdatasetZUnknownDatasetru   r%   Z	accessionrv   Zgener4   r:   ZorganismHostkeywordrb   r8   rc   rW   rq   ZevidenceZsequencer*   )r   )r	   r   r   r   r   r   r2   r^   r'   rk   rH   r   sortedsetr&   r9   )r   r)   r+   r0   r3   r5   r<   r=   r?   rK   rX   rd   rp   rr   rs   rx   rn   ro   r(   r   )ri   r"   r   r   r   K   sz      	0G












zParser.parse)NF)__name__
__module____qualname____doc__r   r   r   r   r   r   r   <   s   
r   )NF)r   Z	xml.etreer   Zxml.parsers.expatr   ZBior   ZBio.Seqr   ZBio.SeqRecordr   r   ra   r   r   r   r   r   r   <module>   s   
