B
    ž3RcöP  ã               @   s¦  d Z dZddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ eƒ ZddlZddlZddlZddlZdd
lmZmZ ddlmZ dd„ Zddd„Zedkr¢eƒ Ze ¡ Zejr(ejdkràe d¡ ejZye edƒZ!W n. e"k
r"   e de ¡ e #d¡ Y nX dZ!ej$sFej% &e¡d Z'e'e_$ej% (ej$¡s˜ye )ej$¡ W n0 e*k
r–   e dej$ ¡ e #d¡ Y nX eeeƒ dS )z0.13.0aŸ  
  NOTES:

    - the property names for the database are the union of those for
      all molecules.

    - missing property values will be set to 'N/A', though this can be
      changed with the --missingPropertyVal argument.
    
    - The property names may be altered on loading the database.  Any
      non-alphanumeric character in a property name will be replaced
      with '_'. e.g. "Gold.Goldscore.Constraint.Score" becomes
      "Gold_Goldscore_Constraint_Score".  This is important to know
      when querying.

    - Property names are not case sensitive in the database; this may
      cause some problems if they are case sensitive in the sd file.

      
é    N)ÚRDConfig)ÚChem)Ú	DbConnect)ÚDbModule)Úlogger)ÚLoader)ÚBuildSigFactoryÚLayeredOptions)ÚFingerprintUtilsc              C   s
  t jdtt jd} | jdddd | jddd	t d
 | jddddd | jdddd | jdddd | jdddd | jdddd | jdddd | jdd d!d | jd"d#d$d | jd%d&d'd | jd(d)d*d | jd+d,d-d | jd.tj t	j
d/d0d1¡d2d | jd3d4d5d | jd6d7d8d9d:d; | jd<d7d=d9d>d; | jd?d7d@d9dAd; | jdBd7dCd9dDd; | jdEdFdGdHdId; | jdJdFdKdHdLd; | jdMdNdOd | jdPdFdHdQdR | jdSddTdUdV | jdWdXdYd | jdZd[d\d | jd]dFdHd^dR | jd_dFdHd`dR | jdadbd9dcd7ddde | jdfdgdFdHdhdR | jdidFdHdjdR | jdkdldmd | jdndotj t	jdp¡dqd | jdrdFdHdsdR | jdtdudvd | jdwdxd7dyd9dzd; | jd{d|d}d | jd~dd€dd | jd‚dFdHdƒdR | jd„d…d†td‡dˆ | jd‰dŠd‹tdŒdˆ | S )z$ Initialize the command line parser z(CreateDB [optional arguments] <filename>)ZusageZdescriptionZformatter_classÚfilenameú?z-File containg molecules to load into database)ÚnargsÚhelpz	--versionÚversionz	%(prog)s )Úactionr   z--outDirz--dbDirÚ zname of the output directory)Údefaultr   z--molDbNamezCompounds.sqltzname of the molecule databasez--molIdNameZcompound_idzname of the database key columnz	--regNameZ	moleculesz$name of the molecular registry tablez--pairDbNamezAtomPairs.sqltzname of the atom pairs databasez--pairTableNameZ	atompairszname of the atom pairs tablez
--fpDbNamezFingerprints.sqltz$name of the 2D fingerprints databasez--fpTableNameZrdkitfpsz!name of the 2D fingerprints tablez--layeredTableNameZ
layeredfpsz&name of the layered fingerprints tablez--descrDbNamezDescriptors.sqltzname of the descriptor databasez--descrTableNameZdescriptors_v1zname of the descriptor tablez--descriptorCalcFilenameZProjectsZDbCLIzmoe_like.dscz5name of the file containing the descriptor calculatorz--errFilenamezloadErrors.txtzIname of the file to contain information about molecules that fail to loadz	--noPairsTÚdoPairsZstore_falsezskip calculating atom pairs)r   Údestr   r   z--noFingerprintsÚdoFingerprintsz skip calculating 2D fingerprintsz--noLayeredFpsÚ	doLayeredz%skip calculating layered fingerprintsz--noDescriptorsÚdoDescriptorszskip calculating descriptorsz	--noPropsFÚ	skipPropsÚ
store_truez2don't include molecular properties in the databasez
--noSmilesÚ
skipSmileszGdon't include SMILES in the database (can make loading somewhat faster)z--maxRowsCachedéÿÿÿÿz>maximum number of rows to cache before doing a database commitz--silentzdo not provide status messages)r   r   r   z--molFormat)ÚsmilesÚsdfr   z$specify the format of the input file)r   Úchoicesr   z
--namePropZ_Namez_specify the SD property to be used for the molecule names. Default is to use the mol block namez--missingPropertyValzN/AzWvalue to insert in the database if a property value is missing. Default is %(default)s.z
--addPropsz%add computed properties to the outputz
--noExtraszskip all non-molecule databasesz
--skipLoadz
--skipMolsÚloadMolsz9skip the molecule loading (assumes mol db already exists))r   r   r   r   z
--updateDbz--updatezadd to an existing databasez--doPharm2Dz%skip calculating Pharm2D fingerprintsz--pharm2DTableNameZ
pharm2dfpsz&name of the Pharm2D fingerprints tablez
--fdefFilez--fdefzNovartis1.fdefz>provide the name of the fdef file to use for 2d pharmacophoresz--doGobbi2Dz&skip calculating Gobbi 2D fingerprintsz--gobbi2DTableNameZ
gobbi2dfpsz'name of the Gobbi 2D fingerprints tablez--noMorganFpsz--noCircularFpsÚdoMorganFpsz/skip calculating Morgan (circular) fingerprintsz--morganFpTableNameZ	morganfpsz%name of the Morgan fingerprints tablez--delimiterz--delimú zthe delimiter in the input filez--titleLinez$the input file contains a title linez--smilesColumnz--smilesColr   zthe column index with smiles)r   Útyper   z--nameColumnz	--nameColé   zthe column index with mol names)ÚargparseÚArgumentParserÚ_descriptionZRawDescriptionHelpFormatterÚadd_argumentÚ_versionÚosÚpathÚjoinr   Z	RDBaseDirZ	RDDataDirÚint)Úparser© r.   ú&share/RDKit/Projects/DbCLI/CreateDb.pyÚ
initParserI   sž    








r0   r   c       1      C   sX
  |s|d krt dƒ‚| jr4ttj | j| j¡dƒ}nd }| jrhd| _d| _	d| _
d| _d| _d| _d| _| jrÖ|d krj| js$tj |¡d  ¡ }|dkr¦d| _nb|dkrd| _| jsd	d l}| ¡ }| t|d
ƒ d¡¡}|j| _| jst dt| jƒ ¡ | js$t dt| jƒ ¡ | jdkr`| jdkrBd| _tj|| j| j| j| j d}n
t !|¡}| js|t d¡ t"j#|tj | j| j$¡|| j%| j&| j'| j(| j)d| j*t+| j,ƒ| j| j-t+| j,ƒd	k| j. d | jr>t/tj | j| j0¡ƒ}| 1¡ }	y|	 2d| j3 ¡ W n t4k
r&   Y nX |	 2d| j3| j&f ¡ | j
s^| js^| js^| jrèt/tj | j| j5¡ƒ}
|
 1¡ }y| 2d| j6 ¡ W n t4k
r¦   Y nX y| 2d| j7 ¡ W n t4k
rÒ   Y nX y| 2d| j8 ¡ W n t4k
rþ   Y nX y| 2d| j9 ¡ W n t4k
r*   Y nX | j
rJ| 2d| j6| j&f ¡ | jr”d dt:j; ¡}d dd„ t<t:j;ƒD ƒ¡}| 2d| j9| j&|f ¡ | jrº| 2d| j7| j&f ¡ t=| ƒ}| jrè| 2d| j8| j&f ¡ d	dl>m?}m@} | jrPt/tj | j| j5¡ƒ}
|
 1¡ }y| 2d| jA ¡ W n t4k
r8   Y nX | 2d| jA| j&f ¡ | j	rLt/tj | j| jB¡ƒ}t| jCd
ƒ$}| ¡  Dd d!¡ Ed"¡}| F¡  W d Q R X tG HtI J|¡¡}d#d„ | K¡ D ƒ}| 1¡ }d$d%| j& g}| Ld&d„ |D ƒ¡ y| 2d| jM ¡ W n t4k
r   Y nX | 2d'| jMd |¡f ¡ d tNjOgtP|ƒ ¡}g }g }g }g }g }g }g }| jszt d(¡ t/tj | j| j$¡ƒ} |  1¡ }!| j*s¸|! 2d)| j&| j%f ¡ n|! 2d*| j&| j%f ¡ d	}"xLy,|! Q¡ }#|#d	 }$|#d+ }%|#d }&|"d+7 }"W n t4k
r   P Y nX tR|&tStTfƒr8t U|&¡}'nt UtT|&ƒ¡}'|'sPqÖ| jržtV W|'¡}(tV X|'¡})tN Y|( Z¡ ¡}*tN Y|) Z¡ ¡}+|$|%|*|+f},| [|,¡ | j
rÒtV \|'¡}-tN Y|- Z¡ ¡}&|$|%|&f},| [|,¡ | jrút: ]|'¡}.|$|%g|. },| [|,¡ | j	r(| ^|'¡}|$|%g},|, L|¡ | [|,¡ | jrb|tV__tV `|'¡}/tN Y|/ Z¡ ¡}&|$|%|&f},| [|,¡ | jrž|jatV__tV `|'¡}/tN Y|/ Z¡ ¡}&|$|%|&f},| [|,¡ | jrÒtV b|'¡}0tN Y|0 Z¡ ¡}&|$|%|&f},| [|,¡ |"d, sütP|ƒr|	 cd-| j3 |¡ g }| d¡  tP|ƒr,| cd.| j6 |¡ g }|
 d¡  tP|ƒrX| cd/| j9|f |¡ g }|
 d¡  tP|ƒr„| cd0| jM|f |¡ g }| d¡  tP|ƒr¬| cd.| j7 |¡ g }|
 d¡  tP|ƒrÔ| cd.| j8 |¡ g }|
 d¡  tP|ƒrü| cd.| jA |¡ g }|
 d¡  | jsÖ|"d, sÖt d1|" ¡ qÖW tP|ƒ	rJ|	 cd-| j3 |¡ g }| d¡  tP|ƒ	rr| cd.| j6 |¡ g }|
 d¡  tP|ƒ	rž| cd/| j9|f |¡ g }|
 d¡  tP|ƒ	rÊ| cd0| jM|f |¡ g }| d¡  tP|ƒ	rò| cd.| j7 |¡ g }|
 d¡  tP|ƒ
r| cd.| j8 |¡ g }|
 d¡  tP|ƒ
rB| cd.| jA |¡ g }|
 d¡  | j
sTt d2¡ d S )3Nz3Please provide either a data filename or a supplierzw+Fr   z.sdfr   )z.smiz.smilesz.txtz.csvr   r   ÚriÐ  zIGuessing that delimiter is %s. Use --delimiter argument if this is wrong.zJGuessing that mol format is %s. Use --molFormat argument if this is wrong.z\tú	)Ú	titleLineÚ	delimiterÚsmilesColumnÚ
nameColumnz6Reading molecules and constructing molecular database.T)ZerrorsToÚregNameZnameColr   Z
defaultValZaddComputedPropsZ	uniqNamesr   ÚmaxRowsCachedÚsilentÚnamePropZlazySupplierZ	startAnewzdrop table %szmcreate table %s (guid integer not null primary key,%s varchar not null unique,atompairfp blob,torsionfp blob)zYcreate table %s (guid integer not null primary key,%s varchar not null unique,rdkfp blob)ú,r   c             S   s   g | ]}d |d  ‘qS )zCol_%d integerr#   r.   )Ú.0Úxr.   r.   r/   ú
<listcomp>  s    zCreateDb.<locals>.<listcomp>zQcreate table %s (guid integer not null primary key,%s varchar not null unique,%s)z]create table %s (guid integer not null primary key,%s varchar not null unique,pharm2dfp blob)z]create table %s (guid integer not null primary key,%s varchar not null unique,gobbi2dfp blob))ÚGenerateÚGobbi_Pharm2Dz\create table %s (guid integer not null primary key,%s varchar not null unique,morganfp blob)z
Ú
zutf-8c             S   s   g | ]}|‘qS r.   r.   )r<   r=   r.   r.   r/   r>   "  s    z!guid integer not null primary keyz%s varchar not null uniquec             S   s   g | ]}d | ‘qS )z%s floatr.   )r<   r=   r.   r.   r/   r>   %  s    zcreate table %s (%s)z(Generating fingerprints and descriptors:z$select guid,%s,smiles,molpkl from %szselect guid,%s,molpkl from %sr#   iô  zinsert into %s values (?,?,?,?)zinsert into %s values (?,?,?)zinsert into %s values (?,?,%s)zinsert into %s values (%s)z
  Done: %dz	Finished.)eÚ
ValueErrorZerrFilenameÚopenr)   r*   r+   ÚoutDirZnoExtrasr   r   r   Z	doPharm2DZ	doGobbi2Dr   r    r   Z	molFormatÚsplitextÚlowerr4   ÚcsvZSnifferZsniffÚreadr9   r   ÚinfoÚreprr   ZSmilesMolSupplierr3   r5   r6   ZSDMolSupplierr   ZLoadDbZ	molDbNamer7   Z	molIdNamer   ZmissingPropertyValZaddPropsr   r,   r8   r:   ZupdateDbr   Z
pairDbNameZ	GetCursorZexecuteZpairTableNameÚ	ExceptionZfpDbNameZfpTableNameZpharm2DTableNameZgobbi2DTableNameZlayeredTableNamer	   ZnWordsÚranger   Zrdkit.Chem.Pharm2Dr?   r@   ZmorganFpTableNameZdescrDbNameZdescriptorCalcFilenameÚreplaceÚencodeÚcloseÚpickleÚloadÚioÚBytesIOZGetDescriptorNamesÚextendZdescrTableNamer   ZplaceHolderÚlenZfetchoneÚ
isinstanceÚbytesÚstrZMolr
   ZBuildAtomPairFPZBuildTorsionsFPZbinaryHolderZToBinaryÚappendZBuildRDKitFPZGetWordsZCalcDescriptorsÚ
sigFactoryZBuildPharm2DFPÚfactoryZBuildMorganFPZexecutemanyZCommit)1ÚoptionsÚdataFilenameZsupplierZerrFileZextrG   ZsnifferZdlctZpairConnZpairCursZfpConnZfpCursZ	layeredQsZcolDefsrZ   r?   r@   Z	descrConnZinTFZbufZcalcZnmsZ	descrCursZdescrsZ
descrQueryZpairRowsZfpRowsZlayeredRowsZ	descrRowsZpharm2DRowsZgobbi2DRowsZ
morganRowsZmolConnZmolCursÚiZtplZmolGuidZmolIdZpklZmolZpairsZtorsionsZpkl1Zpkl2ÚrowÚfp2ZwordsÚfpZmorganr.   r.   r/   ÚCreateDb¢   s    




 



































rb   Ú__main__z"please provide a filename argumentr1   zinput file %s does not existz$could not create output directory %sr#   )r   N)+r(   r&   r$   Zrdkitr   r   Zrdkit.Dbase.DbConnectionr   Zrdkit.Dbaser   Zrdkit.RDLoggerr   Zrdkit.Chem.MolDbr   Úsysr)   rR   rP   Z!rdkit.Chem.MolDb.FingerprintUtilsr   r	   r
   r0   rb   Ú__name__r-   Ú
parse_argsr\   r   r   Úerrorr]   rC   ZdataFileÚIOErrorÚexitrD   r*   rE   ÚprefixÚexistsÚmkdirrK   r.   r.   r.   r/   Ú<module>!   sR   Y
  


