B
    d                 @   s   d dl mZmZmZ dddddddd	d
ddddddddddddddddZddddddddZdddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2Zdd*dddd)d!d"d$d#dd(d&d+d,d/d0d.d%d3Zd4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGZdMdKdLZ	dHS )N    )absolute_importdivisionprint_functionARNDCQEGHILKMFPOUSTWYVX)ALAARGASNASPCYSGLNGLUGLYHISILELEULYSMETMSEPHEPROPYLSECSERTHRTRPTYRVALUNK)ZCSOZLLPZMLYZPTRSEPZTPOZTYSr   r    r   r"   r*   r#   r$   r%   r'   r&   r(   r   r,   r+   r!   r   r.   r/   r-   r2   r0   r1   r3   )r   r	   r   r   r   r   r   r   r   r   r   r   r   r   r
   r   r   r   r   r   r   r   r   )DALDARDASDCYDGLDGNDHIDILDLEDLYDPNDPRDSGDSNDTHDTRDTYDVAMEDr5   r6   rA   r7   r8   r:   r9   r;   r<   r=   r>   rG   r?   r@   rB   rC   rD   rE   rF   )r   r   r   r   r    r!   r"   r$   r%   r&   r'   r(   r*   r+   r.   r/   r0   r1   r2   NTFc             C   s   t  }|r:|t t }|s:|t ddddddg}|r|t ddd	d
dg}|s|t ddddddddddddg}t |  |}|S )a6  

  =============================================================================
  Function for checking if a sequence conforms to the FASTA format.
  (http://www.ncbi.nlm.nih.gov/BLAST/blastcgihelp.shtml)

  Parameters:
  -----------
  sequence - str (None) - the sequence to be checked
  protein - bool (True) - check for protein letters if True
  strict_protein - bool (True) - only check for the 20 amino acids if True
  nucleic_acid - bool (False) - check for nucleic acid letters if True
  strict_nucleic_acid - bool (True) - only check for the 5 base pairs if True

  Return:
  -------
  The function returns a set of unknown letters. If no unknown letters are
  found, the set is of size 0.

  Notes:
  ------
  There is overlap between the letters used to represent amino aicds and the
  letters used to represent nucleic acids. So if both protein and nucleic_acid
  are set to True, the set of valid letters is the union of both. This will
  make the overall validation less strict.

  =============================================================================

  Br   Zr   *-r   r   r	   r   r   r   r   r   r   r   r   r   r   r   )setunionthree_letter_given_one_letterkeysupper
difference)sequenceproteinZstrict_proteinZnucleic_acidZstrict_nucleic_acidZfasta_formatZunknown_letters rT   y/mnt/filia/a/genomebrowser/www/genomebrowser/fleming/tools/molprobity/modules/cctbx_project/iotbx/pdb/amino_acid_codes.pyvalidate_sequenceo   s    "rV   )NTTFT)

__future__r   r   r   one_letter_given_three_letterZ)one_letter_given_three_letter_modified_aarN   #three_letter_l_given_three_letter_dZ#three_letter_d_given_three_letter_lrV   rT   rT   rT   rU   <module>   s     