B
    ž3Rcãa  ã               @   s0  d Z dZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ eƒ ZddlZddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ dd„ Zejddfdd„Zdd„ Zdd„ Zdd„ Zdd„ Zedkr,eƒ Ze ¡ Z e j!dkre j"se j#se j$se %d¡ e j!Z&de _'ee e&ƒ dS )z0.14.0aœ  
     The sd filename argument can be either an SD file or an MDL mol 
     file.
     

  NOTES:

    - The property names may have been altered on loading the
      database.  Any non-alphanumeric character in a property name
      will be replaced with '_'. e.g."Gold.Goldscore.Constraint.Score" becomes
      "Gold_Goldscore_Constraint_Score".

    - Property names are not case sensitive in the database.

 é    N)ÚRDConfig)Ú	DbConnect)Úlogger)ÚChem)ÚsupportedSimilarityMethodsÚBuildSigFactoryÚ
DepickleFPÚLayeredOptions)ÚFingerprintUtils)ÚDataStructsc             C   s,   t | ttfƒrt | ¡}nt t| ƒ¡}|S )N)Ú
isinstanceÚbytesÚstrr   ÚMol)ÚpklÚmol© r   ú&share/RDKit/Projects/DbCLI/SearchDb.pyÚ_molFromPklE   s    r   g      ð¿Fc                s,  dd„ | D ƒ‰‡fdd„t tˆƒƒD ƒ}‡fdd„|D ƒ}ddlm‰  |dkrl‡ ‡fdd„t tˆƒƒD ƒ}	n‡ fdd„t tˆƒƒD ƒ}	d}
xš|D ]\}}|
d	7 }
|s¾|
d
 s¾t d|
 ¡ |tjkr
t ||¡}x0t	|ƒD ]$\}}||krà|	||   
||¡ qàW q’|tjkrZt ||¡}x4t	|ƒD ](\}}||kr,|	||   
||¡ q,W q’|tjkrÎt| dd¡ƒ}t| dd¡ƒ}t ||||¡}xŒt	|ƒD ](\}}||kr |	||   
||¡ q W q’xTt tˆƒƒD ]D}ˆ| }|d k	rÜ|ˆ| |ƒ}||krÜ|	||   
||¡ qÜW q’W |	S )Nc             S   s   g | ]}|d  ‘qS )é   r   )Ú.0Úxr   r   r   ú
<listcomp>O   s    z$GetNeighborLists.<locals>.<listcomp>c                s   g | ]}ˆ | d k	r|‘qS )Nr   )r   r   )ÚprobeFpsr   r   r   P   s    c                s   g | ]}ˆ | ‘qS r   r   )r   r   )r   r   r   r   Q   s    r   )ÚTopNContainerc                s   g | ]}ˆ ˆƒ‘qS r   r   )r   r   )r   ÚtopNr   r   r   T   s    c                s   g | ]}ˆ d ƒ‘qS )éÿÿÿÿr   )r   r   )r   r   r   r   V   s    r   iè  z  searched %d rowsÚtverskyAg      à?ÚtverskyB)ÚrangeÚlenZrdkit.DataStructs.TopNContainerr   r   Úinfor   ÚDiceSimilarityZBulkDiceSimilarityÚ	enumerateZInsertÚTanimotoSimilarityZBulkTanimotoSimilarityÚTverskySimilarityÚfloatÚgetZBulkTverskySimilarity)Úprobesr   ZpoolÚ	simMetricÚ	simThreshÚsilentÚkwargsZvalidProbesZvalidFpsÚnbrListsÚnDoneÚnmÚfpÚscoresÚiÚscoreÚavZbvZpfpr   )r   r   r   r   ÚGetNeighborListsM   sH    



r5   c       	   	   c   s‚   t | dƒ}xrt|ƒD ]f\}}y| ¡  d¡\}}W n tk
rH   wY nX t |¡}|sn|rt||||d q|||fV  qW d S )NÚrú )Úfile)Úopenr#   ÚstripÚsplitÚ
ValueErrorr   ÚMolFromSmilesÚprint)	ÚdataFilenameÚerrFileÚnamePropZdataFileÚidxÚlineÚsmir/   Úmr   r   r   ÚGetMolsFromSmilesFilex   s    

rF   c       	      c   s    t  | ¡}xt|ƒD ]„\}}|sP|rt|dƒrD| |¡}| |¡ qt d¡ qt  |d¡}| 	|¡r€| 
|¡}|sŒt d¡ nd|d  }|||fV  qW d S )NÚGetItemTextz$full error file support not completeTz'molecule found with empty name propertyzMol_%dr   )r   ZSDMolSupplierr#   ÚhasattrrG   Úwriter   ZwarningÚMolToSmilesZHasPropZGetProp)	r?   r@   rA   ZsupplrB   rE   ÚdrD   r/   r   r   r   ÚGetMolsFromSDFile‡   s     





rL   c       F         s†  ˆj dkr8tj}tj}tj ˆjˆj	¡}ˆj
}ˆj}nVˆj dkrptj}tj}tj ˆjˆj¡}ˆj}ˆj}nˆj dkr²tj}tj}tj ˆjˆj¡}ˆj}ˆjsªdˆ_ˆj}nÜˆj dkrþtj}tj}tj ˆjˆj¡}ˆj}ˆjsìdˆ_ˆj}tˆƒt_nˆj dkrXdd	lm} tj}tj}tj ˆjˆj¡}ˆj}ˆjsHd
ˆ_ˆj}|jt_n6ˆj dkrŽtj}tj}tj ˆjˆj¡}ˆj }ˆj!}i }ˆj"dkr¦tj}n:ˆj"dkrºtj}n&ˆj"dkràtj#}ˆj$|d< ˆj%|d< ˆj&rt' (ˆj&¡}	|	st) *dˆj& ¡ t+ ,d¡ |	ˆ_-n:ˆj.rVt' /ˆj.¡}	|	sPt) *dˆj. ¡ t+ ,d¡ |	ˆ_-ˆj0dkrjt+j1}
nˆj0dkr|d }
nt2ˆj0dƒ}
d}ˆj3rºd}ˆj3dkr¬t+j1}nt2ˆj3dƒ}nd }ˆj4rìd}ˆj4dkrÞt+j1}nt2ˆj4dƒ}nd }|r$yt2|dƒ}W n. t5k
r2   t) *d| ¡ t+ ,d¡ Y nX ˆj6dkrFt7}nˆj6dkrVt8}ˆj9szd}|rp|d7 }t) :|¡ g }d}g }x ||d ˆj;ƒD ]ˆ\}}}	|d7 }| <|¡ |	sÔt) *d | ¡ | <d!¡ q–|rî| <|	||	ƒf¡ n| <|	d f¡ ˆj9s–|d" s–t) :d#| ¡ q–W nd }d }ˆj=}d }d }tj ˆjˆj>¡}ˆj=}t?|ƒ}d$d%„ | @d&¡D ƒ}|d \}}ˆjAsŠˆj-rt?|ƒ}| B¡ } ˆj-r¶ˆj9s´t) :d'¡ ˆjArÈd(ˆjA }!nd}!ˆj9sð|  Cd)tDƒ  ¡ |  E¡ d }"d}#d}$tj ˆjˆj¡}%tj F|%¡ržˆjGsž|  Cd*|% ¡ y|  Cd+ˆjH ¡ W n tIk
rX   Y nFX d}$d,ˆjH|f }#tJ Kˆj-¡}&|&rž|!sŠd-}!n|!d.7 }!|!d/|& 7 }!d0tDƒ  }'|  C|'¡ |  E¡ }(d})g }x¸|(rz|(\}*}+ˆjLsätM|+ƒ},nt' NtO P|+¡¡},|, Qˆj-¡}-ˆjGr|- }-|-r| <|*¡ |)d7 })ˆj9sn|)d1 sn|$sXt) :d2|)|"tR|ƒf ¡ nt) :d3|)tR|ƒf ¡ |  E¡ }(qÄW ˆj9s|$r|"r|"|) }.t) :d4|.|"d5|. |" f ¡ nLˆjArˆj9sÐt) :d6¡ ˆjA Sd7¡d }/|  Cd8tDƒ  ¡ d9d%„ |  T¡ D ƒ}ˆj9st) :d:tR|ƒ ¡ tU U¡ }0|
rfˆj9s<t) :d;¡ t?|ƒ}| V|¡}| B¡ } |r’d<d%„ |D ƒ}|  Cd=tDƒ  ¡ |  Wd>|¡ d?tDƒ  }#nd}#|d  X¡ | X¡ krÎ|  Cd@tDƒ  ¡ |  CdAtDƒ  ¡ n|  CdBtDƒ  ¡ dCdD„ }1tY|ˆjZ|1| ˆj ƒf|ˆj[dEœ|—Ž}2t\ƒ }3i }4xŒt]|ƒD ]€\}}|2|  ^¡  |2|  _¡ }5|2|  `¡ }6g }7x@t]|6ƒD ]4\}8}9|9d krpP n|3 a|9¡ |7 <|9|5|8 f¡ qZW |7|4||f< q W tU U¡ }:ˆj9sÆt) :dF|:|0  ¡ ˆj9sØt) :dG¡ | B¡ } tb|3ƒ}dHd%„ |D ƒ}|  Cd=tDƒ  ¡ |  Wd>|¡ |  CdItDƒ  ¡ i ‰ x"|  T¡ D ]\};}*tc|*ƒˆ |;< 	q0W tb|4 d¡ ƒ}<|< e¡  ˆjf	s¼xü|<D ]J\}}|4||f }7ˆjg |g‡ ‡fdJd%„|7D ƒ ¡}=|
	rlth|=|
dK 	qlW n¨‡fdLd%„|<D ƒ}>|
	rèthˆjg |>¡|
dK xztiˆjZƒD ]l}g }?xF|<D ]>\}@}|4|@|f | }A|? <ˆ |Ad  ¡ |? <dM|Ad  ¡ 
qW |
	rôthˆjg |?¡|
dK 	qôW n¢ˆj9
sxt) :dG¡ | B¡ } dNd%„ t\|ƒD ƒ}|  Cd=tDƒ  ¡ |  Wd>|¡ ˆj=}|  CdItDƒ  ¡ i ‰ x"|  T¡ D ]\};}*tc|*ƒˆ |;< 
qÒW |
rthdO ˆ  j¡ ¡|
dK |rp|rptj ˆjˆj>¡}dPd%„ | Vd&¡D ƒ}|d dQkr\| kdQ¡ | <dQ¡ | B¡ } dR |¡}B|  CdStDƒ  ¡ |  E¡ }(i }Cxä|(rntb|(ƒ}(tM|(d ƒ},|(d };ˆ |; }|r,|, ldT|¡ tht' m|,¡|dK xDtidtR|ƒd ƒD ].}|| }Dtc|(| ƒ}Eth|? dU|D|Ef f qîW thdV|dK |r@t' n|,ˆjo¡}|rbthdW|tc|(d ƒf |dK |  E¡ }(qŒW ˆj9s‚t) :dX¡ d S )YNZ	AtomPairsZTopologicalTorsionsÚRDKZrdkfpZPharm2DZ	pharm2dfpZGobbi2Dr   )ÚGobbi_Pharm2DZ	gobbi2dfpZMorganÚtanimotoÚdiceÚtverskyr   r   z/could not build query molecule from smiles "%s"r   z/could not build query molecule from smarts "%s"ú-Ú zw+FTr6   zcould not open query file %sr   ÚsmilesÚsdfzReading query moleculesz and generating fingerprintsz$query molecule %d could not be built)NNiè  z	  done %dc             S   s   g | ]\}}|  ¡ |f‘qS r   )Úlower)r   r   Úyr   r   r   r   (  s    zRunSearch.<locals>.<listcomp>Ú	moleculeszDoing substructure queryzwhere %sz(select count(*) from molecules %(where)szattach database '%s' as fpdbzselect * from fpdb.%s limit 1zjoin fpdb.%s using (%s)Úwherez andr7   z9select %(idCol)s,molpkl from molecules %(join)s %(where)siô  z/  searched %d (of %d) molecules; %d hits so farz/  searched through %d molecules; %d hits so farz0   Fingerprint screenout rate: %d of %d (%%%.2f)g      Y@zDoing property queryú;z3select %(idCol)s from molecules where %(propQuery)sc             S   s   g | ]}|d  ‘qS )r   r   )r   r   r   r   r   r   n  s    z%Found %d molecules matching the queryzFinding Neighborsc             S   s   g | ]
}|f‘qS r   r   )r   r   r   r   r   r   {  s    z4create temporary table _tmpTbl (%(idCol)s %(idTyp)s)zinsert into _tmpTbl values (?)zjoin  _tmpTbl using (%(idCol)s)z'attach database '%(molDbName)s' as molszš
  select %(idCol)s,%(fpColName)s from %(fpTableName)s join
      (select %(idCol)s,%(molIdName)s from mols.molecules %(join)s)
    using (%(molIdName)s)
z<select %(idCol)s,%(fpColName)s from %(fpTableName)s %(join)sc             s   s:   |   ¡ }x,|r4|\}}t||ƒ}||fV  |   ¡ }q
W d S )N)Úfetchoner   )ÚcursZsimilarityMethodÚrowÚidr   r0   r   r   r   ÚpoolFromCursŽ  s    

zRunSearch.<locals>.poolFromCurs)r)   r*   zThe search took %.1f secondszCreating outputc             S   s   g | ]
}|f‘qS r   r   )r   r   r   r   r   r   °  s    zLselect %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)c                s$   g | ]\}}d ˆ | ˆj |f ‘qS )z%s%s%.3f)ÚoutputDelim)r   r^   r3   )ÚnmDictÚoptionsr   r   r   ¾  s   )r8   c                s   g | ]}d |d ˆ j f ‘qS )z%s%sSimilarityr   )r`   )r   r   )rb   r   r   r   Ã  s    z%.3fc             S   s   g | ]
}|f‘qS r   r   )r   r   r   r   r   r   Ò  s    Ú
c             S   s   g | ]}|  ¡ ‘qS r   )rV   )r   r   r   r   r   r   ß  s    Úmolpklú,z?select %(cnText)s from molecules join _tmpTbl using (%(idCol)s)Ú_Namez
> <%s>
%s
z$$$$z%s %szDone!)pZsimilarityTyper
   ZBuildAtomPairFPr   r"   ÚosÚpathÚjoinZdbDirZ
pairDbNameZpairTableNameZpairColNameZBuildTorsionsFPZtorsionsDbNameZtorsionsTableNameZtorsionsColNameZBuildRDKitFPZFingerprintSimilarityÚfpDbNameÚfpTableNameÚ	fpColNameZBuildPharm2DFPZpharm2DTableNamer   Z
sigFactoryZrdkit.Chem.Pharm2DrN   r$   Zgobbi2DTableNameÚfactoryZBuildMorganFPZmorganFpDbNameZmorganFpTableNameZmorganFpColNameZsimilarityMetricr%   r   r   ÚsmilesQueryr   r=   r   ÚerrorÚsysÚexitÚqueryMolÚsmartsQueryZMolFromSmartsÚoutFÚstdoutr9   ÚsdfOutÚ	smilesOutÚIOErrorZ	molFormatrF   rL   r+   r!   rA   ÚappendÚ	molIdNameÚ	molDbNamer   ZGetColumnNamesAndTypesÚ	propQueryZ	GetCursorZexecuteÚlocalsr[   ÚexistsZnegateQueryZlayeredTableNameÚ	Exceptionr	   ZGetQueryTextZzipMolsr   r   ÚzlibZ
decompressZHasSubstructMatchr    r;   ZfetchallÚtimeZGetColumnNamesZexecutemanyrV   r5   r   r*   Úsetr#   ÚreverseZGetPtsZ	GetExtrasÚaddÚlistr   ÚkeysÚsortZ	transposer`   r>   r   ÚvaluesÚremoveZSetPropZMolToMolBlockrJ   ÚchiralSmiles)Frb   ÚqueryFilenameZ	fpBuilderr)   ZdbNamerk   rl   rN   Z	extraArgsr   rt   ZmolsOutrv   rw   ZtmpFÚfuncÚmsgr(   r2   Znmsr/   rD   ZconnZidNameZidsÚnamesr{   rz   ZmConnZcnsZidColZidTypr\   rY   ZnToDori   ZdoSubstructFPsrj   ZqueryÚcmdr]   r.   r^   rd   rE   ZmatchedZ	nFilteredr|   Zt1r_   Z	topNListsZuniqIdsr-   r1   ZnbrNamesZnbrsÚjZnbrGuidZt2ZguidZksZnbrTxtÚlabelsZoutLrB   ZnbrZcnTextZmolDZpnZpvr   )ra   rb   r   Ú	RunSearch   sZ   






























r’   c              C   s@  t jdtt jd} | jdddd | jddd	t d
 | jdddd | jdddd | jdddd | jdddd | jdddd | jdddd | jddd d | jd!dd"d | jd#dd$d | jd%d&d d | jd'd(d)d | jd*d+d,d | jd-d.d/d | jd0dd1d | jd2d3d4d | jd5d6d7d | jd8tj t	j
d9d:d;¡d<d | jd=d>d?d | jd@dAtdBdC | jdDdEdFdGd | jdHdIdJdKdL | jdMdNdOdPdQ | jdRdSdTd | jdUdVdWddXd | jdYdZd[dd\d | jd]d^dIdJd_dL | jd`dadbddcd | jdddeddfd | jdgdhddid | jdjdkdldmdndo | jdpdIdJdqdL | jdrdsdIdJdtdL | jdudvdwd | jdxdytj t	jdz¡d{d | jd|d}d~d | jdd€dd‚tdƒdQ | jd„d(d…d | jd†d‡dˆd | jd‰dŠd‹d | jdŒddŽddddQ | jd‘d’td“dC | jd”d’td•dC | jd–d—td˜dC | S )™z$ Initialize the command line parser z+SearchDB [optional arguments] <sdffilename>)ZusageZdescriptionZformatter_classÚfilenameú?z%File containg molecules for searching)ÚnargsÚhelpz	--versionÚversionz	%(prog)s )Úactionr—   z--dbDirrS   z_name of the directory containing the database information. The default is the current directory)Údefaultr–   z--molDbNamezCompounds.sqltzname of the molecule databasez--molIdNameZcompound_idzname of the database key columnz	--regNamerX   z$name of the molecular registry tablez--pairDbNamezAtomPairs.sqltzname of the atom pairs databasez--pairTableNameZ	atompairszname of the atom pairs tablez--pairColNameZ
atompairfpzname of the atom pair columnz--torsionsDbNamezWname of the topological torsions database (usually the same as the atom pairs database)z--torsionsTableNamezQname of the topological torsions table (usually the same as the atom pairs table)z--torsionsColNameZ	torsionfpz
--fpDbNamezFingerprints.sqltz$name of the 2D fingerprints databasez--fpTableNameZrdkitfpsz!name of the 2D fingerprints tablez--layeredTableNameZ
layeredfpsz&name of the layered fingerprints tablez--fpColNamez=name of the 2D fingerprint column, a sensible default is usedz--descrDbNamezDescriptors.sqltzname of the descriptor databasez--descrTableNameZdescriptors_v1zname of the descriptor tablez--descriptorCalcFilenameZProjectsZDbCLIzmoe_like.dscz5name of the file containing the descriptor calculatorz--outputDelimre   z=the delimiter for the output file. The default is %(default)sz--topNé   zSthe number of neighbors to keep for each query compound. The default is %(default)s)r™   Útyper–   z--outFz	--outFilerR   zAThe name of the output file. The default is the console (stdout).z--transposeFÚ
store_truezaprint the results out in a transposed form: e.g. neighbors in rows and probe compounds in columns)r™   r˜   r–   z--molFormatrU   )rT   rU   z$specify the format of the input file)r™   Úchoicesr–   z
--nameProprf   z_specify the SD property to be used for the molecule names. Default is to use the mol block namez--smartsQueryz--smartsz--smaz3provide a SMARTS to be used as a substructure queryz--smilesQueryz--smilesz--smiz3provide a SMILES to be used as a substructure queryz--negateQueryz--negatez'negate the results of the smarts query.z--propQueryz--queryz-qz<provide a property query (see the NOTE about property names)z--sdfOutz--sdOutz-export an SD file with the matching moleculesz--smilesOutz--smiOutz0export a smiles file with the matching moleculesz--nonchiralSmilesrŠ   TZstore_falsez&do not use chiral SMILES in the output)Údestr™   r˜   r–   z--silentz Do not generate status messages.z	--zipMolsz--zipz&read compressed mols from the databasez--pharm2DTableNameZ
pharm2dfpsz&name of the Pharm2D fingerprints tablez
--fdefFilez--fdefzNovartis1.fdefz>provide the name of the fdef file to use for 2d pharmacophoresz--gobbi2DTableNameZ
gobbi2dfpsz&name of the Gobbi2D fingerprints tablez--similarityTypez	--simTypez--simrM   z˜Choose the type of similarity to use, possible values: RDK, AtomPairs, TopologicalTorsions, Pharm2D, Gobbi2D, Avalon, Morgan. The default is %(default)sz--morganFpDbNamez(name of the morgan fingerprints databasez--morganFpTableNameZ	morganfpsz%name of the morgan fingerprints tablez--morganFpColNameZmorganfpz%name of the morgan fingerprint columnz--similarityMetricz--simMetricz--metric)rO   rP   rQ   rS   zChoose the type of similarity to use, possible values: tanimoto, dice, tversky. The default is determined by the fingerprint typez
--tverskyAg      à?zTversky A valuez
--tverskyBzTversky B valuez--simThreshr   zYthreshold to use for similarity searching. If provided, this supersedes the topN argument)ÚargparseÚArgumentParserÚ_descriptionZRawDescriptionHelpFormatterÚadd_argumentÚ_versionrg   rh   ri   r   Z	RDBaseDirÚintZ	RDDataDirr   r&   )Úparserr   r   r   Ú
initParser  s¸    






r¦   Ú__main__zLplease either provide a query filename argument or do a data or smarts query)(r£   r¡   rg   rŸ   rp   r   Zrdkitr   Zrdkit.Dbase.DbConnectionr   Zrdkit.RDLoggerr   r€   r   Z!rdkit.Chem.MolDb.FingerprintUtilsr   r   r   r	   Zrdkit.Chem.MolDbr
   r   r   r"   r5   rF   rL   r’   r¦   Ú__name__r¥   Ú
parse_argsrb   r“   rn   rs   r|   ro   r‹   rr   r   r   r   r   Ú<module>#   s<   *  hj
$
