B
    bk                 @   s   d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 ddl	mZ ddlmZ d	d
lmZ d	dlmZ G dd dejZG dd deZG dd deZdS )a  Bio.SeqIO support for the "seqxml" file format, SeqXML.

This module is for reading and writing SeqXML format files as
SeqRecord objects, and is expected to be used via the Bio.SeqIO API.

SeqXML is a lightweight XML format which is supposed be an alternative for
FASTA files. For more Information see http://www.seqXML.org and Schmitt et al
(2011), https://doi.org/10.1093/bib/bbr025
    )sax)handler)XMLGenerator)AttributesImpl)Seq)
UnknownSeq)	SeqRecord   )SequenceIterator)SequenceWriterc                   s   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Z  ZS )&ContentHandlerz5Handles XML events generated by the parser (PRIVATE).c                s>   t    d| _d| _d| _d| _d| _d| _d| _g | _	dS )z&Create a handler to handle XML events.N)
super__init__sourcesourceVersionseqXMLversion	ncbiTaxIDspeciesNamestartElementNSdatarecords)self)	__class__ 1lib/python3.7/site-packages/Bio/SeqIO/SeqXmlIO.pyr       s    
zContentHandler.__init__c             C   s   | j | _dS )z2Set XML handlers when an XML declaration is found.N)startSeqXMLElementr   )r   r   r   r   startDocument,   s    zContentHandler.startDocumentc             C   s$  |dkrt d|dk	r tdd}x| D ]\}}|\}}|dkr|dkrV|| _q|dkrf|| _q|dkrv|| _q|dkrt|}	|| _q|d	kr|| _qt d
q.|dkr|dkr|}qt dq.t d| q.W | jdkrt dd| j }
||
krt d|| jf | j	| _
| j| _dS )z!Handle start of a seqXML element.)NseqXMLz*Failed to find the start of seqXML elementNz#Unexpected qname for seqXML elementr   r   r   r   r   z#Unexpected attribute for XML Schemaz)http://www.w3.org/2001/XMLSchema-instanceZnoNamespaceSchemaLocationz0Unexpected attribute for XML Schema in namespacez.Unexpected namespace '%s' for seqXML attributezFailed to find seqXMLversionz#http://www.seqxml.org/%s/seqxml.xsdzDXML Schema '%s' found not consistent with reported seqXML version %s)
ValueErrorRuntimeErroritemsr   r   r   intr   r   endSeqXMLElementendElementNSstartEntryElementr   )r   nameqnameattrsZschemakeyvalue	namespace	localnamenumberZurlr   r   r   r   0   sF    




z!ContentHandler.startSeqXMLElementc             C   sP   |\}}|dk	rt d| |dk	r0t d| |dkr@t dd| _d| _dS )z!Handle end of the seqXML element.Nz(Unexpected namespace '%s' for seqXML endz$Unexpected qname '%s' for seqXML endr   z$Failed to find end of seqXML element)r   r   r#   )r   r%   r&   r*   r+   r   r   r   r"   \   s    zContentHandler.endSeqXMLElementc       	      C   s   |dkrt d|dk	r tdtddd}| jdk	rB| j|jd< | jdk	rX| j|jd< | j|jd	< xb| D ]V\}}|\}}|dkr|d
kr||_q|d	kr||jd	< qt d| qnt d| qnW |jdkrt d| j	
| | j| _| j| _dS )z>Set new entry with id and the optional entry source (PRIVATE).)Nentryz.Expected to find the start of an entry elementNz"Unexpected qname for entry element )idorganism
ncbi_taxidr   r/   z(Unexpected attribute %s in entry elementz-Unexpected namespace '%s' for entry attributezFailed to find entry ID)r   r   r   r   annotationsr   r   r    r/   r   appendstartEntryFieldElementr   endEntryElementr#   )	r   r%   r&   r'   recordr(   r)   r*   r+   r   r   r   r$   h   s4    


z ContentHandler.startEntryElementc             C   s4   |dkrt d|dk	r td| j| _| j| _dS )zHandle end of an entry element.)Nr-   z,Expected to find the end of an entry elementNz"Unexpected qname for entry element)r   r   r$   r   r"   r#   )r   r%   r&   r   r   r   r5      s    zContentHandler.endEntryElementc             C   s   |\}}|dk	r t d||f |dk	r8td||f |dkrJ| |S |dkr\| |S |dkrn| |S |dkr| |S |dkr| |S t d	| dS )
z3Receive a field of an entry element and forward it.Nz(Unexpected namespace '%s' for %s elementz$Unexpected qname '%s' for %s elementspeciesdescription)DNAseqRNAseqAAseqDBRefpropertyzUnexpected field %s in entry)r   r   startSpeciesElementstartDescriptionElementstartSequenceElementstartDBRefElementstartPropertyElement)r   r%   r&   r'   r*   r+   r   r   r   r4      s$    




z%ContentHandler.startEntryFieldElementc       
      C   s   d}d}xb|  D ]V\}}|\}}|dkr\|dkr8|}qh|dkrNt|}|}qhtd| qtd| qW |dkr|td|dkrtd| jd }	||	jd	< ||	jd
< | j| _dS )zParse the species information.Nr%   r   z.Unexpected attribute '%s' found in species tagz/Unexpected namespace '%s' for species attributezFailed to find species namezFailed to find ncbiTaxIdr0   r1   )r    r!   r   r   r2   endSpeciesElementr#   )
r   r'   r%   r   r(   r)   r*   r+   r,   r6   r   r   r   r>      s,    


z"ContentHandler.startSpeciesElementc             C   sL   |\}}|dk	rt d| |dk	r0t d| |dkr@t d| j| _dS )z Handle end of a species element.Nz)Unexpected namespace '%s' for species endz%Unexpected qname '%s' for species endr7   z%Failed to find end of species element)r   r5   r#   )r   r%   r&   r*   r+   r   r   r   rD      s    z ContentHandler.endSpeciesElementc             C   s6   |rt d| jdk	r$td| j d| _| j| _dS )zParse the description.z2Unexpected attributes found in description elementNzUnexpected data found: '%s'r.   )r   r   r   endDescriptionElementr#   )r   r'   r   r   r   r?      s    
z&ContentHandler.startDescriptionElementc             C   sl   |\}}|dk	rt d| |dk	r0t d| |dkr@t d| jd }| j}|rZ||_d| _| j| _dS )z(Handle the end of a description element.Nz-Unexpected namespace '%s' for description endz)Unexpected qname '%s' for description endr8   z)Failed to find end of description elementrC   )r   r   r   r8   r5   r#   )r   r%   r&   r*   r+   r6   r8   r   r   r   rE      s    

z$ContentHandler.endDescriptionElementc             C   s6   |rt d| jdk	r$td| j d| _| j| _dS )z$Parse DNA, RNA, or protein sequence.z/Unexpected attributes found in sequence elementNzUnexpected data found: '%s'r.   )r   r   r   endSequenceElementr#   )r   r'   r   r   r   r@      s    
z#ContentHandler.startSequenceElementc             C   s   |\}}|dk	rt d| |dk	r0t d| | jd }|dkrNd|jd< n4|dkrbd	|jd< n |d
krvd|jd< nt d| t| j|_d| _| j| _dS )z%Handle the end of a sequence element.Nz*Unexpected namespace '%s' for sequence endz&Unexpected qname '%s' for sequence endrC   r9   DNAmolecule_typer:   RNAr;   proteinz/Failed to find end of sequence (localname = %s))r   r   r2   r   r   seqr5   r#   )r   r%   r&   r*   r+   r6   r   r   r   rF      s"    

z!ContentHandler.endSequenceElementc       
      C   s   d}d}xZ|  D ]N\}}|\}}|dkrT|dkr8|}q`|dkrF|}q`td| qtd| qW |dkrttd|dkrtd| jdk	rtd| j d	| _| jd
 }d||f }	|	|jkr|j|	 | j| _dS )z!Parse a database cross reference.Nr   r/   z1Unexpected attribute '%s' found for DBRef elementz-Unexpected namespace '%s' for DBRef attributez'Failed to find source for DBRef elementz#Failed to find id for DBRef elementzUnexpected data found: '%s'r.   rC   z%s:%s)	r    r   r   r   r   dbxrefsr3   endDBRefElementr#   )
r   r'   r   ZIDr(   r)   r*   r+   r6   dbxrefr   r   r   rA     s2    


z ContentHandler.startDBRefElementc             C   sj   |\}}|dk	rt d| |dk	r0t d| |dkrDt d| | jrXt d| j d| _| j| _dS )z"Handle the end of a DBRef element.Nz+Unexpected namespace '%s' for DBRef elementz'Unexpected qname '%s' for DBRef elementr<   z+Unexpected localname '%s' for DBRef elementz0Unexpected data received for DBRef element: '%s')r   r   r5   r#   )r   r%   r&   r*   r+   r   r   r   rM   0  s    

zContentHandler.endDBRefElementc       	      C   s   d}d}xX|  D ]L\}}|\}}|dkrR|dkr8|}q^|dkrF|}q^td|qtd| qW |dkrrtd| jd }|dkr|j| |kst||j|< n$||jkrg |j|< |j| | | j| _dS )	z'Handle the start of a property element.Nr%   r)   z4Unexpected attribute '%s' found for property elementz0Unexpected namespace '%s' for property attributez(Failed to find name for property elementrC   rH   )r    r   r   r2   AssertionErrorr3   endPropertyElementr#   )	r   r'   Zproperty_nameZproperty_valuer(   r)   r*   r+   r6   r   r   r   rB   D  s.    



z#ContentHandler.startPropertyElementc             C   sP   |\}}|dk	rt d| |dk	r0t d| |dkrDt d| | j| _dS )z%Handle the end of a property element.Nz.Unexpected namespace '%s' for property elementz*Unexpected qname '%s' for property elementr=   z.Unexpected localname '%s' for property element)r   r5   r#   )r   r%   r&   r*   r+   r   r   r   rP   g  s    

z!ContentHandler.endPropertyElementc             C   s   | j dk	r|  j |7  _ dS )zHandle character data.N)r   )r   r   r   r   r   
charactersv  s    
zContentHandler.characters)__name__
__module____qualname____doc__r   r   r   r"   r$   r5   r4   r>   rD   r?   rE   r@   rF   rA   rM   rB   rP   rQ   __classcell__r   r   )r   r   r      s&   ,!	!		!#r   c                   s6   e Zd ZdZdZd
 fdd	Zdd Zdd	 Z  ZS )SeqXmlIteratoraN  Parser for seqXML files.

    Parses seqXML files and creates SeqRecords.
    Assumes valid seqXML please validate beforehand.
    It is assumed that all information for one record can be found within a
    record element or above. Two types of methods are called when the start
    tag of an element is reached. To receive only the attributes of an
    element before its end tag is reached implement _attr_TAGNAME.
    To get an element and its children as a DOM tree implement _elem_TAGNAME.
    Everything that is part of the DOM tree will not trigger any further
    method calls.
    i   Nc                sB   t  | _t }| j| | jtjd t j	|ddd dS )z0Create the object and initialize the XML parser.TbZSeqXML)modeZfmtN)
r   Zmake_parserparserr   ZsetContentHandlerZ
setFeaturer   Zfeature_namespacesr   r   )r   Zstream_or_pathr*   content_handler)r   r   r   r     s
    
zSeqXmlIterator.__init__c             C   s   | j }| }| j}xH||}|s@|jdkr8tdntd|| |j}|dk	rP qW || _|j| _|j	| _	|j
| _
|j| _| |}|S )z9Start parsing the file, and return a SeqRecord generator.NzEmpty file.zXML file contains no data.)rZ   getContentHandlerBLOCKreadr   r   feedr   r   r   r   r   iterate)r   handlerZ   r[   r]   textr   r   r   r   r   parse  s(    




zSeqXmlIterator.parsec             c   st   | j }| }|j}| j}x:t|dkr8|d}|V  ||}|sHP || qW |E dH  |  |	  dS )z)Iterate over the records in the XML file.r	   r   N)
rZ   r\   r   r]   lenpopr^   r_   clearclose)r   ra   rZ   r[   r   r]   r6   rb   r   r   r   r`     s    


zSeqXmlIterator.iterate)N)	rR   rS   rT   rU   r]   r   rc   r`   rV   r   r   )r   r   rW   |  s
   rW   c                   sb   e Zd ZdZd fdd	Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Z  ZS )SeqXmlWriterzWrites SeqRecords into seqXML file.

    SeqXML requires the SeqRecord annotations to specify the molecule_type;
    the molecule type is required to contain the term "DNA", "RNA", or
    "protein".
    Nc                sF   t  |d | j}t|d| _| j  || _|| _|| _|| _	dS )a1  Create Object and start the xml generator.

        Arguments:
         - target - Output stream opened in binary mode, or a path to a file.
         - source - The source program/database of the file, for example
           UniProt.
         - source_version - The version or release number of the source
           program or database from which the data originated.
         - species - The scientific name of the species of origin of all
           entries in the file.
         - ncbiTaxId - The NCBI taxonomy identifier of the species of origin.

        wbzutf-8N)
r   r   ra   r   xml_generatorr   r   source_versionr7   	ncbiTaxId)r   targetr   rk   r7   rl   ra   )r   r   r   r     s    
zSeqXmlWriter.__init__c             C   s   dddd}| j dk	r | j |d< | jdk	r4| j|d< | jdk	r\t| jtsRtd| j|d	< | jdk	rt| jttfs~td
| j|d< | j	dt
| dS )z'Write root node with document metadata.z)http://www.w3.org/2001/XMLSchema-instancez$http://www.seqxml.org/0.4/seqxml.xsdz0.4)z	xmlns:xsizxsi:noNamespaceSchemaLocationr   Nr   r   z species should be of type stringr   z)ncbiTaxID should be of type string or intr   r   )r   rk   r7   
isinstancestr	TypeErrorrl   r!   rj   startElementr   )r   r'   r   r   r   write_header  s     







zSeqXmlWriter.write_headerc             C   s   |j r|j dkrtdt|j ts,tdd|j i}d|jkrv| j|jd krvt|jd tshtd|jd |d< | jdt	| | 
| | | | | | | | | | jd dS )	zWrite one record.z<unknown id>zSeqXML requires identifierz#Identifier should be of type stringr/   r   zsource should be of type stringr-   N)r/   r   rn   ro   rp   r2   r   rj   rq   r   _write_species_write_description
_write_seq_write_dbxrefs_write_properties
endElement)r   r6   Zattrbr   r   r   write_record  s"    






zSeqXmlWriter.write_recordc             C   s   | j d | j   dS )z0Close the root node and finish the XML document.r   N)rj   rx   ZendDocument)r   r   r   r   write_footer  s    zSeqXmlWriter.write_footerc             C   s   d}d|j krV|j d }t|trVt|dkr8|d }nt|dkrJd}ntd| d|j kr|r|j d }t|tstdt|ttfstd|| jks|| j	kr|t|d	}| j
d
t| | j
d
 dS )z%Write the species if given (PRIVATE).Nr1   r	   r   z9Multiple entries for record.annotations['ncbi_taxid'], %rr0   z!organism should be of type stringz)ncbiTaxID should be of type string or int)r%   r   r7   )r2   rn   listrd   r   ro   rp   r!   r7   rl   rj   rq   r   rx   )r   r6   Zlocal_ncbi_taxidZ	local_orgattrr   r   r   rs      s*    





zSeqXmlWriter._write_speciesc             C   sh   |j rdt|j tstd|j }|dkr,d}t|j dkrd| jdti  | j| | j	d dS )z)Write the description if given (PRIVATE).z$Description should be of type stringz<unknown description>r.   r   r8   N)
r8   rn   ro   rp   rd   rj   rq   r   rQ   rx   )r   r6   r8   r   r   r   rt   @  s    zSeqXmlWriter._write_descriptionc             C   s   t |jtrtdt|j}t|dks2td|jd}|dkrPtdn6d|kr^d}n(d	|krld
}nd|krzd}ntd| | j	
|ti  | j	| | j	| dS )zWrite the sequence (PRIVATE).

        Note that SeqXML requires the molecule type to contain the term
        "DNA", "RNA", or "protein".
        z8Sequence type is UnknownSeq but SeqXML requires sequencer   z,The sequence length should be greater than 0rH   Nzmolecule_type is not definedrG   r9   rI   r:   rJ   r;   zunknown molecule_type '%s')rn   rK   r   rp   bytesrd   r   r2   getrj   rq   r   rQ   rx   )r   r6   rK   rH   ZseqElemr   r   r   ru   P  s$    

zSeqXmlWriter._write_seqc             C   s~   |j dk	rzxn|j D ]d}t|ts(td|ddk r>td|dd\}}||d}| jdt	| | j
d qW dS )z.Write all database cross references (PRIVATE).Nz(dbxrefs should be of type list of string:r	   z9dbxrefs should be in the form ['source:id', 'source:id' ])r   r/   r<   )rL   rn   ro   rp   findr   splitrj   rq   r   rx   )r   r6   rN   ZdbsourceZdbidr|   r   r   r   rv   n  s    


zSeqXmlWriter._write_dbxrefsc             C   s   x|j  D ]\}}|dkr|dkrLd|i}| jdt| | jd qt|trx|D ]B}|dkrrd|i}n|t|d}| jdt| | jd q\W qt|t	t
tfr|t|d}| jdt| | jd qW dS )ztWrite all annotations that are key value pairs with values of a primitive type or list of primitive types (PRIVATE).)r0   r1   r   Nr%   r=   )r%   r)   )r2   r    rj   rq   r   rx   rn   r{   ro   r!   float)r   r6   r(   r)   r|   vr   r   r   rw     s$    


zSeqXmlWriter._write_properties)NNNN)rR   rS   rT   rU   r   rr   ry   rz   rs   rt   ru   rv   rw   rV   r   r   )r   r   rh     s    rh   N)rU   Zxmlr   Zxml.saxr   Zxml.sax.saxutilsr   Zxml.sax.xmlreaderr   ZBio.Seqr   r   ZBio.SeqRecordr   Z
Interfacesr
   r   r   rW   rh   r   r   r   r   <module>   s     aM