B
    3Rc@                 @   s  d Z ddlZddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dZeeZed	Ze
jjd
e
jjde
jjdiZdZG dd dejZejG dd dejZejfeedddZ d7e
j!j"ee ee e#edddZ$dd Z%d8ddZ&dd Z'dd  Z(d!d" Z)d9d$d%Z*d:d'd(Z+d)d* Z,d+d, Z-d-d. Z.d;d/d0Z/G d1d2 d2ejZ0d3d4 Z1d5d6 Z2dS )<a8  
Generate a unique hash code for a molecule based on chemistry. If two
molecules are chemically "the same", they should have the same hash.

Using molhash adds value beyond using SMILES because it:

* Ignores SMILES features that are not chemically meaningful
(e.g. atom map numbers)
* Canonicalizes enhanced stereochemistry groups. For example
`C[C@H](O)CC |&1:1|` and `C[C@@H](O)CC |&1:1|` have the same
molhash
* Canonicalizes S group data (for example, polymer data)

There are two hash schemes, the default, and one in which
tautomers are considered equivalent.

    N)Iterable)Optional)Chem)EnumerateStereoisomers)	rdMolHashZmolAtomMapNumberz((?:a|[&o]\d+):\d+(?:,\d+)*)i  i  i  Z_0_0c               @   sH   e Zd ZdZe Ze Ze Ze Z	e Z
e Ze ZdS )	HashLayera  
    :cvar CANONICAL_SMILES: RDKit canonical SMILES (excluding enhanced stereo)
    :cvar ESCAPE: arbitrary other information to be incorporated
    :cvar FORMULA: a simple molecular formula for the molecule
    :cvar NO_STEREO_SMILES: RDKit canonical SMILES with all stereo removed
    :cvar NO_STEREO_TAUTOMER_HASH: the above tautomer hash lacking all stereo
    :cvar SGROUP_DATA: canonicalization of all SGroups data present
    :cvar TAUTOMER_HASH: SMILES-like representation for a generic tautomer form

    N)__name__
__module____qualname____doc__enumautoCANONICAL_SMILESESCAPEFORMULANO_STEREO_SMILESNO_STEREO_TAUTOMER_HASHSGROUP_DATATAUTOMER_HASH r   r   :lib/python3.7/site-packages/rdkit/Chem/RegistrationHash.pyr   3   s   
r   c               @   sH   e Zd ZdZeeZejejej	ej
ejfZejejej
ejejfZdS )
HashSchemea  
    Which hash layers to use to when deduplicating molecules

    Typically the "ALL_LAYERS" scheme is used, but some users may want
    the "TAUTOMER_INSENSITIVE_LAYERS" scheme.

    :cvar ALL_LAYERS: most strict hash scheme utilizing all layers
    :cvar STEREO_INSENSITIVE_LAYERS: excludes stereo sensitive layers
    :cvar TAUTOMER_INSENSITIVE_LAYERS: excludes tautomer sensitive layers
    N)r   r	   r
   r   tupler   
ALL_LAYERSr   r   r   r   r   ZSTEREO_INSENSITIVE_LAYERSr   ZTAUTOMER_INSENSITIVE_LAYERSr   r   r   r   r   G   s   r   )hash_schemereturnc             C   s2   t  }x |jD ]}|| |   qW | S )a
  
    Generate a molecular hash using a specified set of layers.

    :param mol: the molecule to generate the hash for
    :param hash_scheme: enum encoding information layers for the hash
    :return: hash for the given scheme constructed from the input layers
    )hashlibZsha1valueupdateencodeZ	hexdigest)Z
all_layersr   hZlayerr   r   r   
GetMolHashd   s    	r!   )original_moleculedata_field_namesescaper   c             C   s   t | dd}t| t|tjj}t|\}}t|}t|}t	||d}	t
|\}
}tj|tj|phdtj|tj|tj|
tj|	tj|iS )a  
    Generate layers of data about that could be used to identify a molecule

    :param original_molecule: molecule to obtain canonicalization layers from
    :param data_field_names: optional sequence of names of SGroup DAT fields which
       will be included in the hash.
    :param escape: optional field which can contain arbitrary information
    :return: dictionary of HashLayer enum to calculated hash
    T)preserve_stereogenic_hs)dataFieldNames )_RemoveUnnecessaryHs_StripAtomMapLabelsr   MolHashHashFunctionZ
MolFormula_CanonicalizeStereoGroupsGetStereoTautomerHashGetCanonicalSmiles_CanonicalizeSGroupsGetNoStereoLayersr   r   r   r   r   r   r   r   )r"   r#   r$   molZformulacxsmilesZcanonical_molZtautomer_hashcanonical_smilesZsgroup_datano_stereo_tautomer_hashno_stereo_smilesr   r   r   GetMolLayerss   s    
r6   c             C   s    x|   D ]}|t q
W d S )N)GetAtomsZ	ClearPropATOM_PROP_MAP_NUMBER)r1   atr   r   r   r)      s    r)   Fc             C   s6   t  }d|_| |_t jj| |dd}|d |S )zq
    removes hydrogens that are not necessary for the registration hash, and
    preserves hydrogen isotopes
    TF)Zsanitize)r   ZRemoveHsParametersZupdateExplicitCountZremoveDefiningBondStereordmolopsZRemoveHsUpdatePropertyCache)Zrdk_molr%   Zremove_hs_paramsZ
edited_molr   r   r   r(      s    
r(   c             C   sD   |   dkrtS t| }|d d}t|tjj|}t|}|S )Nr   FT)	GetNumAtomsEMPTY_MOL_TAUTOMER_HASHr(   r;   r   r*   r+   HetAtomTautomerr.   )ZmoleculeZno_h_molZuseCxSmilesZhash_with_cxExtensionsr3   r   r   r   r-      s    
r-   c             C   s   dd |  dD }dd |D }|s*dS t|dkr>td|d	 }d
}t|dkr~t|d }|r~ddt| d}|r|d | S |S )Nc             s   s   | ]}|  V  qd S )N)strip).0pr   r   r   	<genexpr>   s    z%GetCanonicalSmiles.<locals>.<genexpr>|c             S   s   g | ]}|r|qS r   r   )r@   rA   r   r   r   
<listcomp>   s    z&GetCanonicalSmiles.<locals>.<listcomp>)r'   r'      z4Unexpected number of fragments in canonical CXSMILESr   r'      , )splitlen
ValueErrorENHANCED_STEREO_GROUP_REGEXfindalljoinsorted)r2   Zsmiles_partsr3   Zstereo_groupsgroupsr   r   r   r.      s    r.   c             C   sF   t | }|d tj| t|tjj}t|tjj	}||fS )NF)
r(   r;   r   r:   ZRemoveStereochemistryr   r*   r+   r>   ZCanonicalSmiles)r1   Zno_stereo_molr4   r5   r   r   r   r0      s    
r0   Tc             C   s   |   dkrg g fS |s(tt| }n\t| }| d}dd |dd dD }dgt| }xt|D ]\}}|||< qpW g }xH| 	 D ]<}	||	
  }
||	  }|
|kr||
 }
}||
|f qW ||fS )a  
    returns a 2-tuple with:

    1. the canonical ranks of a molecule's atoms
    2. the bonds expressed as (canonical_atom_rank_1,canonical_atom_rank_2) where
       canonical_atom_rank_1 < canonical_atom_rank_2

    If useSmilesOrdering is True then the atom indices here correspond to the order of
    the atoms in the canonical SMILES, otherwise just the canonical atom order is used.
    useSmilesOrdering=True is a bit slower, but it allows the output to be linked to the
    canonical SMILES, which can be useful.

    r   Z_smilesAtomOutputOrderc             S   s   g | ]}|rt |qS r   )int)r@   xr   r   r   rD      s    z2_GetCanonicalAtomRanksAndBonds.<locals>.<listcomp>rF   rG   )r<   listr   ZCanonicalRankAtomsZMolToSmilesGetProprI   rJ   	enumerateGetBondsGetBeginAtomIdxGetEndAtomIdxappend)r1   ZuseSmilesOrderingatRanksZsmiZordertxtZsmiOrderiidxbndOrderZbndZboZeor   r   r   _GetCanonicalAtomRanksAndBonds   s$    


r_   Atropc       
         s   |  ddks| dsdS |  d}||kr2dS | d}t|dkrPtd|d }t fd	d
|  D }|rtt|}tfdd
|  D }|rtt|}t	||||d}	|	S )z
    NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will
    be sorted. This assumes that the order of the atoms in that list is not
    important

    TYPEDATZ	FIELDNAMENZ
DATAFIELDSrF   z9cannot canonicalize data groups with multiple data fieldsr   c             3   s   | ]} | V  qd S )Nr   )r@   rR   )r[   r   r   rB     s    z*_CanonicalizeDataSGroup.<locals>.<genexpr>c             3   s   | ]} | V  qd S )Nr   )r@   rR   )r^   r   r   rB     s    )	fieldNameZatombondsr   )
rU   ZHasPropZGetStringVectProprJ   rK   r   r7   rO   rW   dict)
sgr[   r^   Z
fieldNamessortAtomAndBondOrderrd   dataatsbndsresr   )r[   r^   r   _CanonicalizeDataSGroup  s"    

rm   c             C   sJ   |   }|  }|| || ks8|| || krB||krB|| }}||fS )N)rX   rY   )ZbondZ	atomRanksZaid1Zaid2r   r   r   _GetCanononicalBondRep  s
    (
rn   c          
      s  | ddkrdS t fdd| D }tfdd| D }|rbtt|}tt|}| }td|||dd|d	d
|dd
d}d|kr|d |d< d|krt fdd|d D }	|rtt|	}	|	|d< d|krt fdd|d D }
t|
d r"t	d|rg }xRt
dt|
dD ]>}|
| |
|d   }}||krl|| }}|||f q>W tt|}
|
|d< |S )z
    NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted.
    This assumes that the ordering of those lists is not important

    rb   SRUNc             3   s   | ]} | V  qd S )Nr   )r@   rR   )r[   r   r   rB   /  s    z)_CanonicalizeSRUSGroup.<locals>.<genexpr>c             3   s   | ]} | V  qd S )Nr   )r@   rR   )r^   r   r   rB   0  s    indexr   ZCONNECTr'   ZLABEL)typeatomsre   rp   ZconnectZlabelPARENTZXBHEADc             3   s   | ]}t | V  qd S )N)rn   GetBondWithIdx)r@   rR   )r[   r1   r   r   rB   A  s    ZXBCORRc             3   s   | ]}t | V  qd S )N)rn   rt   )r@   rR   )r[   r1   r   r   rB   G  s    rE   zXBCORR should have 2N bondsrF   )rU   r   r7   rW   rO   GetPropsAsDictrf   getrJ   rK   rangerZ   )r1   rg   r[   r^   rh   rj   rk   propsrl   ZxbhbondsZxbcorrbondsZtmpr\   Zb1Zb2r   )r[   r^   r1   r   _CanonicalizeSRUSGroup'  sL    




ry   c                s^   |  ddkrdS t fdd|  D }|r<tt|}|  }td||ddd}|S )	z
    NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted.
    This assumes that the ordering of those lists is not important
    rb   COPNc             3   s   | ]} | V  qd S )Nr   )r@   rR   )r[   r   r   rB   ^  s    z(_CaonicalizeCOPSGroup.<locals>.<genexpr>rp   r   )rq   rr   rp   )rU   r   r7   rO   ru   rf   rv   )rg   r[   rh   rj   rx   rl   r   )r[   r   _CaonicalizeCOPSGroupW  s    r{   c             C   sJ  |pdg}t | \}}g }xt| D ]t}d}|ddkrNt|||||}n:|ddkrnt| ||||}n|ddkrt|||}|dk	r&|| q&W t|dkr@t	dd	 |D }t
d
d	 |D }i }	x8t|D ],\}
}d|kr|
d |	|d < |
d |d< qW x0t|D ]$\}
}d|kr|	|d  |d< qW t|S )z
    NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted.
    This assumes that the ordering of those lists is not important
    ra   Nrb   rc   ro   rz   rF   c             s   s   | ]}t | V  qd S )N)r   items)r@   rR   r   r   r   rB   |  s    z'_CanonicalizeSGroups.<locals>.<genexpr>c             s   s   | ]}t |V  qd S )N)rf   )r@   rR   r   r   r   rB   }  s    rp   rs   )r_   r   ZGetMolSubstanceGroupsrU   rm   ry   r{   rZ   rJ   rO   r   rV   jsondumps)r1   r&   rh   r[   r^   rl   rg   ZlresZtresZidxmapr\   Zitmr   r   r   r/   g  s2    

r/   c               @   s   e Zd Ze Ze ZdS )EnhancedStereoUpdateModeN)r   r	   r
   r   r   ADD_WEIGHTSREMOVE_WEIGHTSr   r   r   r   r     s   r   c       
      C   s   |t jkrd}n|t jkr d}ntdd}|  }xv|D ]n}| }|t|  }xT| D ]H}| }	|t jkr|	dkrtd|	  d|	 |
|	|  d}q\W q:W | |fS )	NrF   rS   z*Invalid Enhanced Stereo weight update modeFi  zQEnhanced stereo group canonicalization does not support isotopes above 999. Atom z is T)r   r   r   rK   GetStereoGroupsZGetGroupTypeENHANCED_STEREO_GROUP_WEIGHTSr7   Z
GetIsotopeZGetIdxZ
SetIsotope)
r1   modeZfactorisotopesModifiedZstgsZstgZstgtZweightr9   Zisotoper   r   r   !_UpdateEnhancedStereoGroupWeights  s$    


r   c       	      C   s   t |  st| | fS t| } t|  t| tj\} }t	
 }d|_d|_d}d}x6t		| |D ]&}t|}|dks||k rh|}|}qhW td}|d|}|rt|tj\}}||fS )a|  
    Returns canonical CXSmiles and the corresponding molecule with the
    stereo groups canonicalized.

    The RDKit canonicalization code does not currently take stereo groups into
    account. We work around that by using EnumerateStereoisomers() to generate
    all possible instances of the molecule's stereogroups and then lexically
    compare the CXSMILES of those.
    TFNr'   z#\[[1-3]0*([1-9]?[0-9]*[A-Z][a-z]?)@z[\1@)rJ   r   r   ZMolToCXSmilesMolZFastFindRingsr   r   r   r   ZStereoEnumerationOptionsZonlyStereoGroupsuniquerecompilesubr   )	r1   r   ZoptsZ	resultMolZresultCXSmilesZisomerZcxSmilesZextraIsotopeRemovalRegex_r   r   r   r,     s,    



r,   )NN)F)T)r`   T)NT)3r   r   r   r}   Zloggingr   typingr   r   Zrdkitr   Z
rdkit.Chemr   r   r8   Z	getLoggerr   Zloggerr   rL   ZStereoGroupTypeZ
STEREO_ANDZ	STEREO_ORZSTEREO_ABSOLUTEr   r=   Enumr   r   r   r   strr!   Zrdchemr   setr6   r)   r(   r-   r.   r0   r_   rm   rn   ry   r{   r/   r   r   r,   r   r   r   r   <module>   sL   


 &"


$ 
0
$