B
    b                 @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddlm
Z
 G dd	 d	e	ZG d
d de
ZedkrddlmZ edd dS )a  Bio.SeqIO support for the UCSC nib file format.

Nib stands for nibble (4 bit) representation of nucleotide sequences.
The two nibbles in a byte each store one nucleotide, represented numerically
as follows:

    - ``0`` - T
    - ``1`` - C
    - ``2`` - A
    - ``3`` - G
    - ``4`` - N (unknown)

As the first bit in a nibble is set if the nucleotide is soft-masked, we
additionally have:

    - ``8`` - t
    - ``9`` - c
    - ``a`` - a
    - ``b`` - g
    - ``c`` - n (unknown)

A nib file contains only one sequence record.
You are expected to use this module via the Bio.SeqIO functions under
the format name "nib":

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
    >>> print("%i %s..." % (len(record), record.seq[:20]))
    50 nAGAAGagccgcNGgCActt...

For detailed information on the file format, please see the UCSC
description at https://genome.ucsc.edu/FAQ/FAQformat.html.
    N)Seq)	SeqRecord   )SequenceIterator)SequenceWriterc                   s0   e Zd ZdZ fddZdd Zdd Z  ZS )NibIteratorzParser for nib files.c                s   t  j|ddd dS )a  Iterate over a nib file and yield a SeqRecord.

            - source - a file-like object or a path to a file in the nib file
              format as defined by UCSC; the file must be opened in binary mode.

        Note that a nib file always contains only one sequence record.
        The sequence of the resulting SeqRecord object should match the sequence
        generated by Jim Kent's nibFrag utility run with the -masked option.

        This function is used internally via the Bio.SeqIO functions:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("Nib/test_even_bigendian.nib", "nib")
        >>> print("%s %i" % (record.seq, len(record)))
        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

        You can also call it directly:

        >>> with open("Nib/test_even_bigendian.nib", "rb") as handle:
        ...     for record in NibIterator(handle):
        ...         print("%s %i" % (record.seq, len(record)))
        ...
        nAGAAGagccgcNGgCActtGAnTAtCGTCgcCacCaGncGncTtGNtGG 50

        bZNib)modeZfmtN)super__init__)selfsource)	__class__ .lib/python3.7/site-packages/Bio/SeqIO/NibIO.pyr   7   s    zNibIterator.__init__c             C   sR   | d}|std| }|dkr,d}n|dkr:d}ntd| ||}|S )z9Start parsing the file, and return a SeqRecord generator.   zEmpty file.3a3de96blittle6be93d3abigz"unexpected signature in nib header)read
ValueErrorhexiterate)r   handleZword	signature	byteorderrecordsr   r   r   parseS   s    
zNibIterator.parsec             c   s   | d}t||}|  }t|}|d dkrJt||krztdn0|d dkrzt||d krntd|d| }t|dstdt	
dd	}||}t|}	t|	}
|
V  dS )
z)Iterate over the records in the nib file.r      r   zUnexpected file sizer   Ns
   0123489abcz&Unexpected sequence data found in files
   TCAGNtcagn)r   int
from_bytesbinasciiZhexlifylenr   setissubsetbytes	maketrans	translater   r   )r   r   r   Znumberlengthdataindicestablenucleotidessequencerecordr   r   r   r   b   s$    



zNibIterator.iterate)__name__
__module____qualname____doc__r   r   r   __classcell__r   r   )r   r   r   4   s   r   c                   s@   e Zd ZdZ fddZ fddZdd Z fdd	Z  ZS )
	NibWriterzNib file writer.c                s   t  j|dd dS )zInitialize a Nib writer object.

        Arguments:
         - target - output stream opened in binary mode, or a path to a file

        wb)r	   N)r
   r   )r   target)r   r   r   r   {   s    zNibWriter.__init__c                sR   t    | j}tj}|dkr$d}n|dkr2d}ntd| |t| dS )zWrite the file header.r   r   r   r   zunexpected system byte order %sN)	r
   write_headerr   sysr   RuntimeErrorwriter&   fromhex)r   r   r   r   )r   r   r   r8      s    
zNibWriter.write_headerc       
      C   s   | j }|j}t|}t|}|td| tdd}|d }|d }||7 }t|	dsht
d||}	|t|	 dS )	z)Write a single record to the output file.is
   TCAGNtcagns
   0123489abcr      Ts
   ACGTNacgtnz0Sequence should contain A,C,G,T,N,a,c,g,t,n onlyN)r   seqr&   r#   r;   structZpackr'   r$   r%   r   r(   r"   Z	unhexlify)
r   r/   r   r.   r-   r)   r,   Zpaddingsuffixr+   r   r   r   write_record   s    
zNibWriter.write_recordc                s   t  j|ddd}|S )zKWrite the complete file with the records, and return the number of records.r   )ZmincountZmaxcount)r
   
write_file)r   r   count)r   r   r   rC      s    zNibWriter.write_file)	r0   r1   r2   r3   r   r8   rB   rC   r4   r   r   )r   r   r5   x   s
   	r5   __main__)run_doctest)verbose)r3   r"   r@   r9   ZBio.Seqr   ZBio.SeqRecordr   Z
Interfacesr   r   r   r5   r0   Z
Bio._utilsrF   r   r   r   r   <module>(   s   D/