B
    ‰°bòe  ã            i   @   s´  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	d
ddœZi Zdddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdAdAdAdBdBdBdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_œXed`< dadbdcdddedfdgœedh< didjdkdldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~dd€dd‚dƒd„d…d†d‡dˆd‰dŠd‹dŒddŽddd‘d’d“d”d•d–d—d˜d™dšd›dœddždŸd d¡d¢d£d¤d¥d¦d§d¨d©dªd«d¬d­d®d¯d°d±dŽd²d³d´dµd¶d·d¸d¹dºd»d¼d½d¾d¿dÀdÁdÂdÃdÄdÅdÆdÇdÈdŽdÉdÊdËdÌdÍdÎdÏœhedÐ< dÑdÒiedÓ< dÔdÕdÖd×dØdÙdÚdÛdÜdÝdÞdßdàdádÙdâdÕdÕdÙdãœZdäZdåZg Zxe ¡ D ]Zee ¡ 7 ZqVW dódædç„ZG dèdé„ déeƒZdêdë„ Zdìdí„ Zdîdï„ Zdðdñ„ Zedòkr°dS )ôa  Bio.SeqIO parser for the ABI format.

ABI is the format used by Applied Biosystem's sequencing machines to store
sequencing results.

For more details on the format specification, visit:
http://www6.appliedbiosystems.com/support/software_community/ABIF_File_Format.pdf

é    N)Úbasename)ÚSeq)Ú	SeqRecordé   )ÚSequenceIteratorZsample_wellZdyeZpolymerZmachine_model)ÚTUBE1ÚDySN1ÚGTyp1ÚMODL1z(Sequencing Analysis parameters file namez$Analysis Protocol XML schema versionzAnalysis Protocol settings namez"Analysis Protocol settings versionzAnalysis Protocol XML stringzSample Commentz*Container Identifier, a.k.a. plate barcodezAContainer name, usually identical to CTID, but not necessarily sozComment Titlez`Capillary type electrophoresis. 1 for a capillary based machine. 0 for a slab gel based machine.zChannel 1 raw datazChannel 2 raw datazChannel 3 raw datazChannel 4 raw dataz=Short Array holding measured volts/10 (EP voltage) during runzDShort Array holding measured milliAmps trace (EP current) during runzIShort Array holding measured milliWatts trace (Laser EP Power) during runzTShort Array holding measured oven Temperature (polymer temperature) trace during runzChannel 9 processed datazChannel 10 processed datazChannel 11 processed datazChannel 12 processed datazDownsampling factorzDye set namezNumber of dyesz
Dye 1 namez
Dye 2 namez
Dye 3 namez
Dye 4 namezDye 1 wavelengthzDye 2 wavelengthzDye 3 wavelengthzDye 4 wavelengthz'Electrophoresis voltage setting (volts)zStart Run eventzStop Run eventzStart Collection eventzStop Collection eventzLBase Order. Sequencing Analysis Filter wheel order. Fixed for 3500 at "GATC"zGel or polymer TypezInjection time (seconds)zInjection voltage (volts)zLane/CapillaryzSample tracking IDzLength to detectorz!Laser Power setting (micro Watts)z!Instrument name and serial numberzData collection module filezModel numberzPixels averaged per lanezNumber of capillarieszAList of scans that are marked off scale in Collection. (optional)zýList of scan number indexes that have values greater than 32767 but did not saturate the camera. In Genemapper samples, this can have indexes with values greater than 32000. In sequencing samples, this cannot have indexes with values greater than 32000.z’List of color data values found at the locations listed in the OvrI tag. There must be exactly as many numbers in this array as in the OvrI array.z;Sequencing Analysis Mobility file name chosen in collectionzRun Module XML schema versionzRun Module name (same as MODF)zRun Module XML stringzRun Protocol namezRun Protocol versionzRun Started DatezRun Stopped DatezData Collection Started DatezData Collection Stopped datezRun Started TimezRun Stopped TimezData Collection Started TimezData Collection Stopped Timez&Scanning Rate. Milliseconds per frame.zRun NamezNumber of scanszPolymer lot expiration datezPolymer lot numberzSample namez Data collection software versionz Data collection firmware versionzyArray of longs representing the scan numbers of data points, which are flagged as saturated by data collection (optional)z Rescaling divisor for color dataz#Number of scans (legacy - use SCAN)zWell IDzRun temperature settingz-Name of user who created the plate (optional))XZAPFN2ZAPXV1ZAPrN1ZAPrV1ZAPrX1ZCMNT1ÚCTID1ZCTNM1ZCTTL1ZCpEP1ZDATA1ZDATA2ZDATA3ZDATA4ZDATA5ZDATA6ZDATA7ZDATA8ZDATA9ZDATA10ZDATA11ZDATA12ZDSam1r   zDye#1ZDyeN1ZDyeN2ZDyeN3ZDyeN4ZDyeW1ZDyeW2ZDyeW3ZDyeW4ZEPVt1ZEVNT1ZEVNT2ZEVNT3ZEVNT4ZFWO_1r	   ZInSc1ZInVt1ZLANE1ÚLIMS1ZLNTD1ZLsrP1ZMCHN1ZMODF1r
   ZNAVG1ZNLNE1ZOfSc1ZOvrI1ZOvrI2ZOvrI3ZOvrI4ZOvrV1ZOvrV2ZOvrV3ZOvrV4ZPDMF1ZRMXV1ZRMdN1ZRMdX1ZRPrN1ZRPrV1ÚRUND1ÚRUND2ZRUND3ZRUND4ÚRUNT1ÚRUNT2ZRUNT3ZRUNT4ZRate1ÚRunN1ZSCAN1ZSMED1ZSMLt1ÚSMPL1ZSVER1ZSVER3ZSatd1ZScal1ZScan1r   ZTmpr1ZUser1ZgeneralzContainer ownerzInstrument ClasszInstrument FamilyzOfficial Instrument NamezInstrument ParameterszRun Module version)ÚCTOw1ÚHCFG1ÚHCFG2ÚHCFG3ÚHCFG4ZRMdVa1zabi_3130/3130xlz˜Primary Analysis Audit Active indication. True if system auditing was enabled during the last write of this file, false if system auditing was disabled.zŠAnode buffer expiration date using ISO 8601 format using the patterns YYYY-MM-DDTHH:MM:SS.ss+/-HH:MM. Hundredths of a second are optional.z&Anode buffer tray first installed datezAnode buffer lot numberzcNumber of runs (injections) processed with the current Anode Buffer (runs allowed - runs remaining)zAnode buffer typez?Analysis Ending scan number for basecalling on initial analysisz<Analysis Ending scan number for basecalling on last analysiszAmplicon namez?Analysis Return code. Produced only by 5 Prime basecaller 1.0b3z:Flag to indicate whether adaptive processing worked or notz0Analysis Starting scan number for first analysisz/Analysis Starting scan number for last analysisz.Audit log used across 3500 software (optional)z%Assay validation flag (true or false)z&Record of ambient temperature readingszThe assay contents (xml format)zThe assay namezThe assay versionzHReference scan number for mobility and spacing curves for first analysiszGReference scan number for mobility and spacing curves for last analysisz@Basecaller timestamp. Time of completion of most recent analysiszBasecalling qc codez;Basecalling warnings, a concatenated comma separated stringz9Basecalling errors, a concatenated comma separated stringzCapillary array expirationzCapillary array lot numberznNumber of injections processed (including the one of which this sample was a part) through the capillary arrayzCapillary array serial numberzCathode buffer expiration datez(Cathode buffer tray first installed datezCathode buffer lot numberzeNumber of runs (injections) processed with the current Cathode Buffer (runs allowed - runs remaining)zCathode buffer typez%Start of the clear range (inclusive).zClear range lengthzContiguous read lengthz!One of "Pass", "Fail", or "Check"z=The name entered as the Owner of a plate, in the plate editorzFile checksumzrA list of door-close events, separated by semicolon. Door open events are generally paired with door close events.zzReserved for backward compatibility. The detection cell heater temperature setting from the Run Module. Not used for 3500.zqA list of door-open events, separated by semicolon. Door close events are generally paired with door open events.z5Electronic signature record used across 3500 softwarez9Feature table. Can be created by Nibbler for Clear Range.zDFeature table vocabulary. Can be created by Nibbler for Clear Range.z4Features. Can be created by Nibbler for Clear Range.zHThe Instrument Class. All upper case, no spaces. Initial valid value: CEzeThe Instrument Family. All upper case, no spaces. Valid values: 31XX or 37XX for UDC, 35XX (for 3500)z‡The official instrument name. Mixed case, minus any special formatting. Initial valid values: 3130, 3130xl, 3730, 3730xl, 3500, 3500xl.a  Instrument parameters. Contains key-value pairs of instrument configuration information, separated by semicolons. Four parameters are included initially: UnitID=<UNITD number>, CPUBoard=<board type>, ArraySize=<# of capillaries>, SerialNumber=<Instrument Serial#>.zInjection namezParameter settings informationzªThe estimate of rms baseline noise (S/N ratio) for each dye for a successfully analyzed sample. Corresponds in order to the raw data in tags DATA 1-4. KB basecaller only.zkAmplitude of primary peak, which is not necessarily equal to corresponding signal strength at that positionzODeviation of primary peak position from (PLoc,2), times 100, rounded to integerz¡Full-width Half-max of primary peak, times 100, rounded to integer. Corresponding signal intensity is not necessarily equal to one half of primary peak amplitudezmAmplitude of secondary peak, which is not necessarily equal to corresponding signal strength at that positionzBase of secondary peakzQDeviation of secondary peak position from (PLoc,2), times 100, rounded to integerz+Array of sequence characters edited by userz4Array of sequence characters as called by Basecallerz1Array of quality Values (0-255) as edited by userz7Array of quality values (0-255) as called by BasecallerzFMobility file name chosen in most recent analysis (identical to PDMF1)z&Array of peak locations edited by userz/Array of peak locations as called by Basecallerz"SeqScape 2.0 project template namezSeqScape 2.0 project namez]Plate size. The number of sample positions in the container. Current allowed values: 96, 384.z6Plate type. Current allowed values: 96-Well, 384-Well.zMedian pupscorezQV20+ valuezQC parameterszTrimming and QC codez2QC warnings, a concatenated comma separated stringz0QC errors, a concatenated comma separated stringzƒThe name entered as the Owner of a Results Group, in the Results Group Editor. Implemented as the user name from the results group.zpReinjection number. The reinjection number that this sample belongs to. Not present if there was no reinjection.zRaman normalization factorz.for whether the sequence has been complementedz<Run name (which, for 3500, is different from injection name)zSignal strength for each dyezPolymer first installed datez^Number of runs (injections) processed with the current polymer (runs allowed - runs remaining)z*Average peak spacing used in last analysisz2Basecaller name - corresponds to name of bcp file.z7Average peak spacing last calculated by the Basecaller.z!Sequencing Analysis Specimen NamezBasecaller version numberz!Sample File Format Version Stringz#The parameter string of size callerz8Raw data start point. Set to 0 for 3500 data collection.z Active spectral calibration namezTimming parameterszTrace score.zTrace peak aria ratiozFChemistry type ("term", "prim", "unknown"), based on DYE_1 informationz?Dye ("big", "d-rhod", "unknown"), based on mob file informationzMaximum Quality ValuezSet Trim regionzTrim probability)hZAAct1ZABED1ZABID1ZABLt1ZABRn1ZABTp1ZAEPt1ZAEPt2ZAPCN1ZARTN1ZASPF1ZASPt1ZASPt2ZAUDT2ZAVld1ZAmbT1ZAsyC1ZAsyN1ZAsyV1ZB1Pt1ZB1Pt2ZBCTS1ZBcRn1ZBcRs1ZBcRs2ZCAED1ZCALt1ZCARn1ZCASN1ZCBED1ZCBID1ZCBLt1ZCBRn1ZCBTp1ZCLRG1ZCLRG2ZCRLn1ZCRLn2r   ZCkSm1ZDCEv1ZDCHT1ZDOEv1ZESig2ZFTab1ZFVoc1ZFeat1r   r   r   r   ZInjN1ZLAST1ZNOIS1ZP1AM1ZP1RL1ZP1WD1ZP2AM1ZP2BA1ZP2RL1ÚPBAS1ÚPBAS2ZPCON1ÚPCON2ZPDMF2ZPLOC1ZPLOC2ZPRJT1ZPROJ4ZPSZE1ZPTYP1ZPuSc1ZQV201ZQV202ZQcPa1ZQcRn1ZQcRs1ZQcRs2ZRGOw1ZRInj1ZRNmF1ZRevC1r   zS/N%1ZSMID1ZSMRn1ZSPAC1ZSPAC2ZSPAC3ZSPEC1ZSVER2ZSVER4ZScPa1ZScSt1ZSpeN1ZTrPa1ZTrSc1ZTrSc2ZphAR1ZphCH1ZphDY1ZphQL1ZphTR1ZphTR2zabi_3530/3530xlZBufT1z*Buffer tray heater temperature (degrees C)zabi_3730/3730xlÚbÚsÚHÚhÚiZ2iÚfÚdZh2BZ4BZ2i2bÚBZ2hZ4hZ4i)r   é   é   é   é   é   é   é   é
   é   é   é   é   é   é   é   é   é   é   z	>H4sI2H3Iz>4sI2H4Ic             C   s:   | dkr|S y|   ¡ S  tk
r4   | j t ¡ dS X dS )zŒReturn the string value of the given an optional raw bytes tag value.

    If the bytes value is None, return the given default value.

    N)Úencoding)ÚdecodeÚUnicodeDecodeErrorÚsysÚgetdefaultencoding)Zopt_bytes_valueÚdefault© r;   ú.lib/python3.7/site-packages/Bio/SeqIO/AbiIO.pyÚ_get_string_tagM  s    r=   c                   s2   e Zd ZdZd	‡ fdd„	Zdd„ Zdd„ Z‡  ZS )
ÚAbiIteratorzParser for Abi files.Fc                s   || _ tƒ j|ddd dS )z+Return an iterator for the Abi file format.r   ZABI)ÚmodeÚfmtN)ÚtrimÚsuperÚ__init__)ÚselfÚsourcerA   )Ú	__class__r;   r<   rC   ^  s    zAbiIterator.__init__c             C   s8   |  d¡}|stdƒ‚|dkr*td| ƒ‚|  |¡}|S )z9Start parsing the file, and return a SeqRecord generator.r%   zEmpty file.s   ABIFzFile should start ABIF, not %r)ÚreadÚ
ValueErrorÚOSErrorÚiterate)rD   ÚhandleZmarkerZrecordsr;   r;   r<   Úparsec  s    

zAbiIterator.parsec             #   sD  dddddœ}t tt ¡ dgttƒ ƒƒ}t t| t 	t¡¡¡}d}i ‰ d }}x’t
||ƒD ]„\}}	}
|t|	ƒ }|
ˆ |< |dkrŽ|
 ¡ }q^|dkrªdd„ |
 ¡ D ƒ}q^|d	kr¼t|
ƒ}q^||krÎ|
||< q^|tkr^|
|t| < q^W d
|d |d f |d< d
|d |d f |d< ˆ |d< t‡ fdd„dD ƒƒ}|r¤yt|jƒ dd¡}W n tk
rj   d}Y nX tˆ  d¡|ƒ}tˆ  d¡dƒ}ttdƒ||||d}nHyt|jƒ dd¡}W n tk
rÔ   d}Y nX tt|ƒ||d|d}|rþ||jd< n|s|s| jrtdƒ‚| jr0|s0t|ƒ}d|jd< |V  dS )z.Parse the file and generate SeqRecord objects.Ú )r   r   r   r   Nz<unknown id>r   r   c             S   s   g | ]}t |ƒ‘qS r;   )Úord)Ú.0Úvalr;   r;   r<   ú
<listcomp>‹  s    z'AbiIterator.iterate.<locals>.<listcomp>r   z%s %sr   r   Z	run_startr   r   Z
run_finishZabif_rawc             3   s   | ]}|ˆ kV  qd S )Nr;   )rO   Ztn)Úrawr;   r<   ú	<genexpr>ž  s    z&AbiIterator.iterate.<locals>.<genexpr>)r   r   z.fsar   r   z<unknown description>)ÚidÚnameÚdescriptionÚannotationsz.ab1Úphred_qualityzGThe 'abi-trim' format can not be used for files without quality values.ZDNAZmolecule_type)ÚdictÚzipÚ_EXTRACTÚvaluesÚlenÚstructÚunpackÚ_HEADFMTrG   ÚcalcsizeÚ_abi_parse_headerÚstrr6   r=   Úallr   rU   ÚreplaceÚAttributeErrorÚgetr   r   Úletter_annotationsrA   rH   Ú	_abi_trimrW   )rD   rK   ÚtimesZannotÚheaderZ	sample_idÚseqÚqualÚtag_nameÚ
tag_numberZtag_dataÚkeyZis_fsa_fileÚ	file_namerV   Úrecordr;   )rR   r<   rJ   p  sn    






zAbiIterator.iterate)F)Ú__name__Ú
__module__Ú__qualname__Ú__doc__rC   rL   rJ   Ú__classcell__r;   r;   )rF   r<   r>   [  s   r>   c             C   s   t | ddS )z[Return an iterator for the Abi file format that yields trimmed SeqRecord objects (PRIVATE).T)rA   )r>   )rK   r;   r;   r<   Ú_AbiTrimIteratorÍ  s    rx   c             c   sú   | d }| d }| d }d}xØ||k rô|||  }|  |¡ t t| t t¡¡¡|f }|d7 }|d  ¡ }|t|d ƒ7 }|d  ¡ }	|d }
|d }|d }|d }|d }|d }|dkrÊ|d	 }|  |¡ | |¡}|	|
t|||ƒfV  qW d
S )z$Return directory contents (PRIVATE).r%   r&   r(   r   r   r#   r'   r)   r4   N)	Úseekr^   r_   Ú_DIRFMTrG   ra   r6   rc   Ú_parse_tag_data)rk   rK   Zhead_elem_sizeZhead_elem_numZhead_offsetÚindexÚstartZ	dir_entryrp   rn   ro   Ú	elem_codeÚelem_numZ	data_sizeZdata_offsetZ
tag_offsetÚdatar;   r;   r<   rb   Ò  s0    



rb   c       	         s®   d}d}d}d‰ t | ƒ|kr | S ‡ fdd„| jd D ƒ}dg}xPtdt |ƒƒD ]>}|d	 ||  }|dk rv| d¡ qN| |¡ |sN|}d
}qNW | t|ƒ¡}| ||… S dS )a   Trims the sequence using Richard Mott's modified trimming algorithm (PRIVATE).

    Arguments:
        - seq_record - SeqRecord object to be trimmed.

    Trimmed bases are determined from their segment score, which is a
    cumulative sum of each base's score. Base scores are calculated from
    their quality values.

    More about the trimming algorithm:
    http://www.phrap.org/phredphrap/phred.html
    http://resources.qiagenbioinformatics.com/manuals/clcgenomicsworkbench/650/Quality_trimming.html
    Fr4   r   gš™™™™™©?c                s   g | ]}ˆ d |d   ‘qS )r*   g      $Àr;   )rO   rm   )Úcutoffr;   r<   rQ     s   z_abi_trim.<locals>.<listcomp>rX   r   éÿÿÿÿTN)r]   rh   ÚrangeÚappendr|   Úmax)	Z
seq_recordr}   ZsegmentZ
trim_startZ
score_listZcummul_scorer   ZscoreZtrim_finishr;   )r   r<   ri   ú  s&    

ri   c             C   sò   | t krê|dkrd}nt|ƒ}d| t |   }t|ƒt |¡ksDt‚t ||¡}| dkrlt|ƒdkrl|d }| dkrx|S | dkrŽttj|Ž ƒS | dkr¬ttj	|d	d
… Ž ƒS | dkr¼t
|ƒS | dkrÐ|dd	… S | dkrä|d	d… S |S nd	S d	S )zÍReturn single data value (PRIVATE).

    Arguments:
     - elem_code - What kind of data
     - elem_num - How many data points
     - raw_data - abi file object from which the tags would be unpacked

    r   rM   ú>)r*   r+   r   r#   r*   r+   Nr$   r-   r2   r3   r‚   )Ú_BYTEFMTrc   r]   r^   ra   ÚAssertionErrorr_   ÚdatetimeZdateZtimeÚbool)r~   r   Zraw_dataZnumr@   r€   r;   r;   r<   r{   -  s.    	r{   Ú__main__)N)rv   r‰   r^   r8   Úos.pathr   ZBio.Seqr   ZBio.SeqRecordr   Z
Interfacesr   r[   Z_INSTRUMENT_SPECIFIC_TAGSr‡   r`   rz   Z__global_tag_listingr\   ÚtagÚkeysr=   r>   rx   rb   ri   r{   rs   r;   r;   r;   r<   Ú<module>   sæ  

r(3,
