B
    ž3RcÁ7  ã               @   sF  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZ ejjjdejjjd	ejjjd
ejjjdiZdddddœZdZdZd7dd„Zdd„ Zd8dd„Zdd„ Zdd„ Zd d!„ Zd"d#„ Zd$d%„ Zd&d'„ Zd9d(d)„Z d*d+„ Z!d,d-„ Z"d.d/„ Z#d0d1„ Z$d2d3„ Z%G d4d5„ d5ej&ƒZ'e(d6krBe%ƒ  dS ):é    N)Úlinear_sum_assignment)ÚChem)ÚAllChem)Úrdmolops)ÚDataStructs)ÚFingerprintMolsé   é   é   é   ú-ú=ú#ú:)r   r	   r
   r   éß   é   éÿÿÿÿTc             C   s¶   g }xH|   ¡ D ]<}|dks&| ¡ |krg }t| ¡ gƒ}t||||||ƒ qW |r®g }	tƒ }
xL|D ]D}||
krbtdd„ |ddd… D ƒƒ}||
krb|	 |¡ |
 |¡ qbW |	S |S dS )zŒthis function returns the same set of bond paths as the Gobbi paper.  These differ a little from the rdkit FindAllPathsOfLengthMToN functionr   c             S   s   g | ]}|‘qS © r   )Ú.0Úir   r   ú@share/RDKit/Contrib/AtomAtomSimilarity/AtomAtomPathSimilarity.pyú
<listcomp>1   s    z2FindAllPathsOfLengthMToN_Gobbi.<locals>.<listcomp>N)ÚGetAtomsÚGetIdxÚsetÚ_FindAllPathsOfLengthMToN_GobbiÚtupleÚappendÚadd)ÚmolÚ	minlengthÚ	maxlengthÚrootedAtAtomÚuniquepathsÚpathsÚatomÚpathÚvisitedZuniquepathlistÚseenZreversepathr   r   r   ÚFindAllPathsOfLengthMToN_Gobbi"   s"    

r)   c             C   sÎ   xÈ|   ¡ D ]¼}| ¡ |kr
| ¡ }| |¡ t|ƒ|krRt|ƒ|krR| t|ƒ¡ t|ƒ|k r¾| ¡ }| ¡ }	| ¡ |  ¡ kr„|	}
n|}
|
 ¡ }||kr¾| |¡ t|
|||||ƒ | 	|¡ | 
¡  q
W d S )N)ÚGetBondsr   r   Úlenr   ÚGetBeginAtomÚ
GetEndAtomr   r   ÚremoveÚpop)r%   r&   r    r!   r'   r$   ZbondZbidxÚa1Úa2ZnextatomZnextatomidxr   r   r   r   :   s$    


r   é   c          	   C   sò  i }x2|   ¡ D ]&}t| ¡  | ¡ | ¡ f|| ¡ < qW i }x’|  ¡ D ]„}| ¡ }g ||< xltt| d||ddƒD ]P\}}g }	|}
g }xºt|ƒD ]®\}}|| \}}}|	 	t
| ¡ | ¡ |
krÎ|}n|}| ¡ }| ¡ rê|d7 }| ¡ |krúd}|dk	r0| ¡ }| ¡ r&|	 	| ¡ ¡ n
|	 	|¡ | 	||f¡ | ¡ }
q˜W t d¡}xft|ƒD ]Z\}\}}|t |¡ }|t t¡ }|dk	r®|t |¡ }|t t¡ }n|}|}q^W ||  	|¡ qxW qHW x| ¡ D ]}| ¡  qÜW |S )z±returns a list of integers describing the paths for molecule m1.  This uses numpy 16 bit unsigned integers to reproduce the data in the Gobbi paper.  The returned list is sortedr   F)r"   r#   él   Nr   )r*   Ú_BK_ZGetBondTyper,   r-   r   r   Ú	enumerater)   r   Ú_BONDSYMBOL_ÚGetAtomicNumÚGetIsAromaticZ	GetSymbolÚlowerÚnumpyZushortÚ_nAT_Ú_nBT_ÚvaluesÚsort)Úm1Z
uptolengthZbondtypelookupÚbZpathintegersÚaÚidxZipathr&   ZstrpathZ
currentidxÚresZipÚpZbkr0   r1   ZakZastrZpathuniqueintZiresZbiÚaiZval1Zval2Zval3Zval4r   r   r   ÚgetpathintegersP   sV    &





rF   c       	      C   sr   d}d}d}x`||k rl||k rl| | }|| }||k r@|d7 }q||krR|d7 }q|d7 }|d7 }|d7 }qW |S )zdreturns the number of items sorted lists l1 and l2 have in common.  ll1 and ll2 are the list lengthsr   r   r   )	Úl1Zll1Úl2Zll2ZncommonZix1Zix2r0   r1   r   r   r   Ú	getcommon‰   s    

rI   c             C   s4   t | |||ƒ}t|d ƒt||ƒd | d  }|S )z<returns the similarity of two sorted path lists.  Equation 2r   r	   )rI   ÚfloatÚmax)ÚaipathsÚbjpathsÚnaipathsÚnbjpathsZncÚsimr   r   r   Ú
getsimaibj   s    "rQ   c       
      C   sÂ   t  | j¡|  }t j|dgdgd}g }x(|D ] }| ||jd |jd f¡ q.W | ¡  tƒ }tƒ }g }xD|D ]<\}}}	||krp|	|krp| |¡ | |	¡ | ||	f¡ qpW |dt	| jƒ… S )zjreturn a mapping of the atoms in the similarity matix using the heuristic algorithm described in the paperÚmulti_indexZ	writeonly)ÚflagsZop_flagsr   r   N)
r:   ÚonesÚshapeZnditerr   rR   r>   r   r   Úmin)
ÚsimmatrixarrayÚ	costarrayÚitZdsurA   ZseenaZseenbÚmappingsrP   r@   r   r   r   Úgetmappings¤   s    
 

r[   c             C   s*   t  | j¡|  }t|ƒ\}}t||ƒ}|S )z’return a mapping of the atoms in the similarity matrix - the Hungarian algorithm is used because it is invariant to atom ordering.  Requires scipy)r:   rT   rU   r   Úzip)rW   rX   Zrow_indZcol_indrC   r   r   r   Úgethungarianmappings»   s    
r]   c             C   sJ   |j \}}d}x | D ]\}}||| | 7 }qW |t||ƒd |  }|S )z6return the similarity for a set of mapping.  See Eqn 3g        r	   )rU   rK   )rZ   ZsimmatrixdictZnaaZnabZscorerA   r@   Úsimabr   r   r   ÚgetsimabÃ   s    
r_   c             C   s°   dd„ |   ¡ D ƒ}dd„ |  ¡ D ƒ}t t|ƒt|ƒf¡}xpt|ƒD ]d\}\}}	||	 }
t|
ƒ}xFt|ƒD ]:\}\}}||krj|| }t|ƒ}t|
|||ƒ|| |< qjW qDW |S )z:generate a matrix of atom atom similarities.  See Figure 4c             S   s$   g | ]}|  ¡ | ¡ f| ¡ f‘qS r   )r7   r8   r   )r   rE   r   r   r   r   Ñ   s    z getsimmatrix.<locals>.<listcomp>c             S   s$   g | ]}|  ¡ | ¡ f| ¡ f‘qS r   )r7   r8   r   )r   Úbjr   r   r   r   Ò   s    )r   r:   Zzerosr+   r5   rQ   )r?   Úm1pathintegersÚm2Úm2pathintegersZaidataZbjdatarW   rE   ZaitypeZaiidxrL   rN   r`   ZbjtypeZbjidxrM   rO   r   r   r   ÚgetsimmatrixÎ   s    rd   c             C   sD   |dkrt | ƒ}|dkr t |ƒ}t| |||ƒ}t|ƒ}t||ƒ}|S )z÷compute the Atom Atom Path Similarity for a pair of RDKit molecules.  See Gobbi et al, J. ChemInf (2015) 7:11
      the most expensive part of the calculation is computing the path integers - we can precompute these and pass them in as an argumentN)rF   rd   r]   r_   )r?   rb   ra   rc   Z	simmatrixrZ   r^   r   r   r   ÚAtomAtomPathSimilarityá   s    
re   c              C   s   t  d¡} t  d¡}t| |ƒS )z2reproduce the worked similarity in the Gobbi paperZo1nccc1Cz
[nH]1nccc1)r   ÚMolFromSmilesre   )r?   rb   r   r   r   Útest0ó   s    

rg   c              C   s<   g } dddg}x(|D ] }t  |¡}t|ƒ}|  |¡ qW | S )zcgenerate a set of path integers for 3 molecules from the Gobbi source IAAPathGeneratorCharTest.javaÚCzC(=O)FZC1ON1)r   rf   rF   r   )rC   ÚsmilesÚsÚmZmpathintegersr   r   r   Útest1ú   s    


rl   c           
   C   sd   ddddddddd	d
g
} g }xB| D ]:}x4| D ],}t  |¡}t  |¡}| dt||ƒ ¡ q,W q"W |S )zLgenerate a matrix molecules from the Gobbi source AAPathComparator2Test.javaÚ*rh   ÚNZCCOzCC(=O)NZc1ccccc1Zc1ncncc1z
c1[nH]ccc1zc1ncncc1CC(=O)NZc1ccccc1c1ncncc1z%.4f)r   rf   r   re   )Z
smileslistZsimsÚs1Ús2r?   rb   r   r   r   Útest2  s    



rq   c              C   st   t  d¡} t  d¡}t  d¡}t  d¡}g }xB| |||fD ]2}x,| |||fD ]}t||ƒ}| d| ¡ qLW q:W |S )z|generate a set of similarities for the example compounds in Figure 1.  These are compared to the values in Additional File 1z/Clc1ccc(CN2CCC(CC2)c3cc([nH]n3)c4ccc(Cl)cc4)cc1z)Clc1ccc(CN2CCN(CC2)CC(=O)N(C)c3ccccc3)cc1z(Cc1cccn2cc(nc12)c3ccc(NC(=O)CN4CCCC4)cc3z&Cc1c(cc2ccccn12)c3ccc(OCCCN4CCCCC4)cc3z%.2f)r   rf   re   r   )Zm1aZm1bZm2aZm2brC   r?   rb   rP   r   r   r   Útest3  s    




rr   c              C   sš   d} dd„ |   ¡ D ƒ}t|ƒ}t|ƒ}dd„ |D ƒ}t ¡ }x>t||ƒD ]0\}}x&t||ƒD ]\}}	t||||	d}
q\W qHW td||t ¡ | f ƒ d S )Na  C[C@@H](O)[C@@H]1OCC[C@@H](C)[C@H](O)C(=O)OC[C@]23CCC(C)=C[C@H]2O[C@@H]4C[C@@H](OC(=O)C=CC=C1)[C@@]3(C)[C@@]45CO5
CC1=C[C@H]2O[C@@H]3C[C@H]4OC(=O)C=CC=CC(=O)OCCC(C)=CC(=O)OC[C@@]2(CC1)[C@]4(C)[C@]35CO5
CC1=CC(=O)OC[C@]23C[C@H](O)C(C)=C[C@H]2O[C@@H]4C[C@@H](OC(=O)C=CC=CC(=O)OCC1)[C@@]3(C)[C@]45CO5
CC1(C)N=C(N)N=C(N)N1C2=CC=C(Br)C=C2
CC1(C)N=C(N)N=C(N)N1C2=CC=CC=C2
CC1(C)N=C(N)N=C(N)N1C2=CC=C(I)C=C2
CC1=CC=C(C=C1)N2C(N)=NC(N)=NC2(C)C
CC1(C)N=C(N)N=C(N)N1C2=CC=C(Cl)C=C2
CC1(C)N=C(N)N=C(N)N1C2=CC=C(F)C=C2
CC1=CC=CC(=C1)N2C(N)=NC(N)=NC2(C)C
COC1=CC=C(C=C1)N2C(N)=NC(N)=NC2(C)C
CC1=CC=C(N2C(N)=NC(N)=NC2(C)C)C(C)=C1
CCOC1=CC=C(C=C1)N2C(N)=NC(N)=NC2(C)C
COC1=CC=CC(=C1)N2C(N)=NC(N)=NC2(C)C
CC1=CC=CC(NC2=NC(N)=NC(C)(C)N2)=C1
CNC1=C(N(CC2=CC=C(Cl)C(Cl)=C2)C(C)=O)C(=O)C3=CC=CC=C3C1=O
CC(=O)N(CC1=CC=C(F)C=C1)C2=C(NCC3=CC=CC=C3)C(=O)C4=CC=CC=C4C2=O
CC(=O)N(CC1=CC=C(F)C=C1)C2=C(NCCC3=CC=CC=C3)C(=O)C4=CC=CC=C4C2=O
CCC(=O)N(C(C)C)C1=C(NC)C(=O)C2=CC=CC=C2C1=O
CCN(CC)CCCC(C)NC1=CC=NC2=CC(Cl)=CC=C12
CC(CCCNCCO)NC1=CC=NC2=CC(Cl)=CC=C12
CC(C)C(=O)NCCCCNC1=CC=NC2=CC(Cl)=CC=C12
CC(C)(C)CC(=O)NCCCCNC1=CC=NC2=CC(Cl)=CC=C12
CC(C)CC(=O)NCCCCNC1=CC=NC2=CC(Cl)=CC=C12
CC(CC(=O)NCCCCNC1=CC=NC2=CC(Cl)=CC=C12)CC(C)(C)C
CCCCC(CC)C(=O)NCCCCNC1=CC=NC2=CC(Cl)=CC=C12
ClC1=CC=C2C(NCCCCNC(=O)C3CCCC3)=CC=NC2=C1
CCCCCCCCC(=O)NCCCCNC1=CC=NC2=CC(Cl)=CC=C12
CC(C)(CCl)C(=O)NCCCCNC1=CC=NC2=CC(Cl)=CC=C12
ClC1=CC=C2C(NCCCCNC(=O)C3CCCCC3)=CC=NC2=C1
NCCCCCCNC1=CC=NC2=CC(Cl)=CC=C12
ClC1=CC=C2C(NCCCCNC(=O)CCC3CCCC3)=CC=NC2=C1
ClC1=CC=C2C(NC3CCCCCCC3)=CC=NC2=C1
CC1CCC(CC1)NC2=CC=NC3=CC(Cl)=CC=C23
CN(C)C(=O)NCCCCNC1=CC=NC2=CC(Cl)=CC=C12
CCN(CC)CCCC(C)NC1=C2C=CC(Cl)=CC2=NC3=CC=C(OC)C=C13
CC(CCCO)NC1=CC=NC2=CC(Cl)=CC=C12
CCCCCC(=O)NCCCCCCNC1=CC=NC2=CC(Cl)=CC=C12
ClC1=CC=C2C(NC3CCC(CC3)NC(=O)CCC4CCCC4)=CC=NC2=C1
CN(C)CCCNC1=CC=NC2=CC(Cl)=CC=C12
c             S   s   g | ]}t  |¡‘qS r   )r   rf   )r   ri   r   r   r   r   M  s    ztimeit.<locals>.<listcomp>c             S   s   g | ]}t |ƒ‘qS r   )rF   )r   r   r   r   r   r   P  s    )ra   rc   z#time to compute %dx%d matrix: %.2fs)Ú
splitlinesr+   Útimer\   re   Úprint)ZmolstrZmolsZnaZnbZpathintsÚstartrA   Zapir@   ZbpirP   r   r   r   Útimeit!  s    *rw   c               @   s4   e Zd Zdd„ Zdd„ Zdd„ Zdd„ Zd	d
„ ZdS )ÚTestAtomAtomPathSimilarityc             C   s   |   dtƒ  d¡ d S )Nz%.3fz0.066)ÚassertEqualrg   )Úselfr   r   r   Ú
test_paperZ  s    z%TestAtomAtomPathSimilarity.test_paperc             C   s2   |   tddddddgdddddddgdƒd¡ d S )Nr	   r
   é   r   r   r   )ry   rI   )rz   r   r   r   Útest_getcommon]  s    z)TestAtomAtomPathSimilarity.test_getcommonc             C   s\   |   tƒ dg iddgddgddgdœd	d
ddddgdd
ddddgddddddgdœg¡ d S )Nr   iˆ  iÞ  iÔ  iäw  iy  iáƒ  )r   r   r	   ið  i~  iƒ  iò  iM•  i«  i<  i>  i  iC•  i™  i†  i
«  i•þ  )ry   rl   )rz   r   r   r   Útest_pathintegers`  s    z,TestAtomAtomPathSimilarity.test_pathintegersc          g   C   sÚ   |   tƒ ddddddddddddddddddddddddddddddddddddddddddddddddd	dddddddd
ddddddddd
ddddddddddddddddddd	dddddddddddddddgd¡ d S )Nz1.0000z0.0000z0.0345z0.0182z0.0017z0.0020z0.1126z0.0088z0.0336z0.0373z0.0645z0.0148z0.0869z0.1101z0.1767z0.0387z0.0219)ry   rq   )rz   r   r   r   Útest_AAPathComparator2Testi  s    z5TestAtomAtomPathSimilarity.test_AAPathComparator2Testc             C   s2   |   tƒ ddddddddddddddddg¡ d S )Nz1.00z0.19z0.06z0.09z0.12z0.05z0.15)ry   rr   )rz   r   r   r   Útest_tableS1y  s    z'TestAtomAtomPathSimilarity.test_tableS1N)Ú__name__Ú
__module__Ú__qualname__r{   r}   r~   r   r€   r   r   r   r   rx   X  s
   	rx   Ú__main__)r   T)r2   )NN))r:   rt   ZunittestZscipy.optimizer   Zrdkitr   Z
rdkit.Chemr   r   r   Zrdkit.Chem.Fingerprintsr   ZrdchemZBondTypeZSINGLEZDOUBLEZTRIPLEZAROMATICr4   r6   r;   r<   r)   r   rF   rI   rQ   r[   r]   r_   rd   re   rg   rl   rq   rr   rw   ZTestCaserx   r   r   r   r   r   Ú<module>   sB   




9
7&
