B
    3Rc_                 @   s6  d Z ddlmZ ddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddl
mZ ddl
mZ dd	l
mZ dd
lmZ dZdZeedryddlmZmZ W n ek
r   Y nX ddlmZ ddlmZ eeZi ZdZdZdZ dZ!dd Z"dd Z#dd Z$y"ddl%m&Z& e$e&_$e!e&_!ee&_W n  e'k
rP   e(d Y nX dSddZ)yddl*Z+W n ek
r~   Y nX dTd!d"Z,dUd#d$Z-d%d& Z.ydd'l/m0Z0 d(d) Z1W n ek
r   d*d) Z1Y nX d+d, Z2dVd.d/Z3dWd1d2Z4dXd3d4Z5da6dYd5d6Z7dZd8d9Z8d[d;d<Z9d\d=d>Z:d]d@dAZ;dBdC Z<d^dDdEZ=dFdG Z>dHdI Z?d_dJdKZ@e>  edLkr2ddlAZAyddlBZBW n ek
r   dZBY nX yddl*Z+dMdN ZCW n ek
r   dZ+Y nX G dOdP dPeAjDZDdZEye+ eC dQk  ZEW n eFk
r   d ZEY nX eEr(e@  eAG  n
e(dR dS )`a  
Importing pandasTools enables several features that allow for using RDKit molecules as columns of a
Pandas dataframe.
If the dataframe is containing a molecule format in a column (e.g. smiles), like in this example:

>>> from rdkit.Chem import PandasTools
>>> import pandas as pd
>>> import os
>>> from rdkit import RDConfig
>>> antibiotics = pd.DataFrame(columns=['Name','Smiles'])
>>> antibiotics = antibiotics.append({'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C',
...   'Name':'Penicilline G'}, ignore_index=True)#Penicilline G
>>> antibiotics = antibiotics.append({
...   'Smiles':'CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O',
...   'Name':'Tetracycline'}, ignore_index=True)#Tetracycline
>>> antibiotics = antibiotics.append({
...   'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O)O)C',
...   'Name':'Ampicilline'}, ignore_index=True)#Ampicilline
>>> print([str(x) for x in  antibiotics.columns])
['Name', 'Smiles']
>>> print(antibiotics)
            Name                                             Smiles
0  Penicilline G    CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C
1   Tetracycline  CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4...
2  Ampicilline  CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...

a new column can be created holding the respective RDKit molecule objects. The fingerprint can be
included to accelerate substructure searches on the dataframe.

>>> PandasTools.AddMoleculeColumnToFrame(antibiotics,'Smiles','Molecule',includeFingerprints=True)
>>> print([str(x) for x in  antibiotics.columns])
['Name', 'Smiles', 'Molecule']

A substructure filter can be applied on the dataframe using the RDKit molecule column,
because the ">=" operator has been modified to work as a substructure check.
Such the antibiotics containing the beta-lactam ring "C1C(=O)NC1" can be obtained by

>>> beta_lactam = Chem.MolFromSmiles('C1C(=O)NC1')
>>> beta_lactam_antibiotics = antibiotics[antibiotics['Molecule'] >= beta_lactam]
>>> print(beta_lactam_antibiotics[['Name','Smiles']])
            Name                                             Smiles
0  Penicilline G    CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C
2  Ampicilline  CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...


It is also possible to load an SDF file can be load into a dataframe.

>>> sdfFile = os.path.join(RDConfig.RDDataDir,'NCI/first_200.props.sdf')
>>> frame = PandasTools.LoadSDF(sdfFile,smilesName='SMILES',molColName='Molecule',
...            includeFingerprints=True)
>>> frame.info # doctest: +SKIP
<bound method DataFrame.info of <class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 0 to 199
Data columns:
AMW                       200  non-null values
CLOGP                     200  non-null values
CP                        200  non-null values
CR                        200  non-null values
DAYLIGHT.FPG              200  non-null values
DAYLIGHT_CLOGP            200  non-null values
FP                        200  non-null values
ID                        200  non-null values
ISM                       200  non-null values
LIPINSKI_VIOLATIONS       200  non-null values
NUM_HACCEPTORS            200  non-null values
NUM_HDONORS               200  non-null values
NUM_HETEROATOMS           200  non-null values
NUM_LIPINSKIHACCEPTORS    200  non-null values
NUM_LIPINSKIHDONORS       200  non-null values
NUM_RINGS                 200  non-null values
NUM_ROTATABLEBONDS        200  non-null values
P1                        30  non-null values
SMILES                    200  non-null values
Molecule                  200  non-null values
dtypes: object(20)>

The standard ForwardSDMolSupplier keywords are also available:

>>> sdfFile = os.path.join(RDConfig.RDDataDir,'NCI/first_200.props.sdf')
>>> frame = PandasTools.LoadSDF(sdfFile, smilesName='SMILES', molColName='Molecule',
...            includeFingerprints=True, removeHs=False, strictParsing=True)

Conversion to html is quite easy:

>>> htm = frame.to_html() # doctest:
...
>>> str(htm[:36])
'<table border="1" class="dataframe">'

In order to support rendering the molecules as images in the HTML export of the
dataframe, we use a custom formatter for columns containing RDKit molecules,
and also disable escaping of HTML where needed.
    )	b64encodeN)Chem)DataStructs)AllChem)Draw)SDWriter)rdchem)MurckoScaffoldZIPythonConsole)InteractiveRendererdrawOptions)BytesIO)minidomTZpng)   r   centerc             C   sv   | dks|dkrdS t | drHt |ds4t|d|_t|j| jsHdS | |}g | _|rntrjt|| _dS dS dS )a*  Allows for substructure check using the >= operator (X has substructure Y -> X >= Y) by
    monkey-patching the __ge__ function
    This has the effect that the pandas/numpy rowfilter can be used for substructure filtering
    (filtered = dframe[dframe['RDKitColumn'] >= SubstructureMolecule])
    NF_substructfpT)	hasattr_fingerprinterr   r   ZAllProbeBitsMatchZGetSubstructMatch
__sssAtomshighlightSubstructureslist)xymatch r   5lib/python3.7/site-packages/rdkit/Chem/PandasTools.py_molge   s    



r   c             C   s(   t tds ttdd td t| S )z*Deprecated, use PrintAsImageString insteadZwarning_shownTzEPrintAsBase64PNGString is deprecated - use PrintAsImageString instead)r   PrintAsBase64PNGStringsetattrlogwarningPrintAsImageString)r   r   r   r   r      s    

r   c          	   C   s  t rt| dr| j}ng }t dk}trRt| rRdd tD }tj| ||dS |rt	j
| t|ddtd}t|}|dd	 }d
td	  dtd  |jd< dtd	  dtd  d|jd< d|jd< | S t	j| t|dddtd}dtd	  dtd  dt| S dS )z;Returns the molecules as base64 encoded PNG image or as SVGr   svgc             S   s   g | ]}t d |qS )   )max).0sr   r   r   
<listcomp>   s    z&PrintAsImageString.<locals>.<listcomp>)useSVG T)kekulizer   r   z0 0     Zviewboxzmax-width: zpx; height: zpx;Zstylezrdkit/moleculezdata-content)Z	returnPNGr)   r   z<div style="width: zWpx" data-content="rdkit/molecule"><img src="data:image/png;base64,%s" alt="Mol"/></div>N)r   r   r   molRepresentationlowerr
   Z	isEnabledmolSizeZgenerateHTMLBodyr   Z	_moltoSVGr   r   ZparseStringZgetElementsByTagNameZ
attributesZtoxmlZ	_moltoimg
_get_image)r   ZhighlightAtomsr'   sizer!   datar   r   r   r       s$    
 
r    r+   )PandasPatcherzDFailed to patch pandas - PandasTools will have limited functionalityc             C   s2   yt |  W n tk
r,   td Y nX dS )aN  Changes the default dataframe rendering to not escape HTML characters, thus allowing
    rendered images in all dataframes.
    IMPORTANT: THIS IS A GLOBAL CHANGE THAT WILL AFFECT TO COMPLETE PYTHON SESSION. If you want
    to change the rendering only for a single dataframe use the "ChangeMoleculeRendering" method
    instead.
    z<Failed to patch pandas - unable to change molecule renderingN)r2   ZrenderImagesInAllDataFrames	NameErrorr   r   )Zimagesr   r   r   RenderImagesInAllDataFrames   s    r4   IDROMolFc	          	      s  t | trF|  dd dkr4ddl}	|	| d}
n
t| d}
|
j}n| }
d}g }g }t|dk	pf|dk	}xttj	|
|||dD ]\}  dkrqt
 fdd  D }|dk	r|sx  D ]} | qW  d	r d	||< |dk	r<ytj |d
||< W n* tk
r:   td| d||< Y nX |dk	rV|sV ||< n|dk	rlt ||< || || qW |dk	r|  tj||d}t| |S )a  Read file in SDF format and return as Pandas data frame.
      If embedProps=True all properties also get embedded in Mol objects in the molecule column.
      If molColName=None molecules would not be present in resulting DataFrame (only properties
      would be read).
      Nz.gzr   rb)sanitizeremoveHsstrictParsingc             3   s   | ]}|  |fV  qd S )N)GetProp)r$   k)molr   r   	<genexpr>  s    zLoadSDF.<locals>.<genexpr>_Name)isomericSmilesz2No valid smiles could be generated for molecule %s)index)
isinstancestrr-   gzipopenclosebool	enumerater   ZForwardSDMolSupplierdictZGetPropNamesZ	ClearPropZHasPropr<   MolToSmiles	Exceptionr   r   _MolPlusFingerprintappendpd	DataFrameChangeMoleculeRendering)filenameidName
molColNameincludeFingerprintsrA   Z
smilesNameZ
embedPropsr:   r;   rE   frG   Zrecordsindicesr9   irowZpropdfr   )r>   r   LoadSDF   sN    








r[   c             C   s   dgt |   }|rtddlm} xP|  D ]D\}}|dkr>q,x0t|D ]$\}}	t|	||< |||  qHW q,W |s|	d | d= || d< t
j| |d}
|
S )a   returns a dataframe with the results of R-Group Decomposition

    >>> from rdkit import Chem
    >>> from rdkit.Chem import rdRGroupDecomposition
    >>> from rdkit.Chem import PandasTools
    >>> import pandas as pd
    >>> scaffold = Chem.MolFromSmiles('c1ccccn1')
    >>> mols = [Chem.MolFromSmiles(smi) for smi in 'c1c(F)cccn1 c1c(Cl)c(C)ccn1 c1c(O)cccn1 c1c(F)c(C)ccn1 c1cc(Cl)c(F)cn1'.split()]
    >>> groups,_ = rdRGroupDecomposition.RGroupDecompose([scaffold],mols,asSmiles=False,asRows=False)
    >>> df = PandasTools.RGroupDecompositionToFrame(groups,mols,include_core=True)
    >>> list(df.columns)
    ['Mol', 'Core', 'R1', 'R2']
    >>> len(df)
    5
    >>> df.columns() # doctest: +SKIP
    <class 'pandas*...*DataFrame'>
    RangeIndex: 5 entries, 0 to 4
    Data columns (total 4 columns):
    Mol     5 non-null object
    Core    5 non-null object
    R1      5 non-null object
    R2      5 non-null object
    dtypes: object(4)
    memory usage: *...*

    Molr   )
rdDepictorZCore)columns)r   keys
rdkit.Chemr]   itemsrI   r   ZRemoveHsCompute2DCoordsremoverO   rP   )groupsZmolsZinclude_coreZredraw_sidechainscolsr]   r=   ZvlrX   vframer   r   r   RGroupDecompositionToFrame&  s    
rh   c             C   s   t | dS )z!displayhook function for PNG dataascii)r   decode)r   r   r   r   r/   S  s    r/   )pyAvalonToolsc             C   s   t j| |t jdS )N)ZisQueryZbitFlags)rk   ZGetAvalonFPZavalonSSSBits)r   r   r   r   r   r   ^  s    r   c             C   s   t j| ddS )Ni   )ZfpSize)r   ZPatternFingerprint)r   r   r   r   r   r   b  s    c             C   s   | dk	rt | d| _| S )zoPrecomputes fingerprints and stores results in molecule objects to accelerate
       substructure matching
    NF)r   r   )mr   r   r   rM   f  s    rM   Smilesc             C   s<   |s| |  tj| |< n| |  dd | |< t|  dS )a  Converts the molecules contains in "smilesCol" to RDKit molecules and appends them to the
    dataframe "frame" using the specified column name.
    If desired, a fingerprint can be computed and stored with the molecule objects to accelerate
    substructure matching
    c             S   s   t t| S )N)rM   r   MolFromSmiles)smilesr   r   r   <lambda>y      z*AddMoleculeColumnToFrame.<locals>.<lambda>N)mapr   rn   rQ   )rg   Z	smilesColmolColrU   r   r   r   AddMoleculeColumnToFrameo  s
    rt   imagec             C   s@   | dkrdS yt | | W n tk
r:   td Y nX dS )ag  Allows to change the rendering of the molecules between image and string
    representations.
    This serves two purposes: First it allows to avoid the generation of images if this is
    not desired and, secondly, it allows to enable image rendering for newly created dataframe
    that already contains molecules, without having to rerun the time-consuming
    AddMoleculeColumnToFrame. Note: this behaviour is, because some pandas methods, e.g. head()
    returns a new dataframe instance that uses the default pandas rendering (thus not drawing
    images for molecules) instead of the monkey-patched one.
    Nz<Failed to patch pandas - unable to change molecule rendering)r2   ZchangeMoleculeRenderingr3   r   r   )rg   Zrendererr   r   r   rQ   }  s    rQ   c                s  d}t |tr<| dd dkr<ddl}||d}|j}t|}|dkrRg }nt|}|r|| fdd j	
 D  ||kr|| ||kr|| || x  D ]}	t|	d | }
|dk	r
|d	kr|
d
t|	d  n|
d
t|	d |  xr|D ]j}|	d | }tt|tjrhd|d}|d dkrZ|d7 }|
|| n|
|t| qW ||
 qW |  |dk	r|  dS )a  Write an SD file for the molecules in the dataframe. Dataframe columns can be exported as
    SDF tags if specified in the "properties" list. "properties=list(df.columns)" would export
    all columns.
    The "allNumeric" flag allows to automatically include all numeric columns in the output.
    User has to make sure that correct data type is assigned to column.
    "idName" can be used to select a column to serve as molecule title. It can be set to
    "RowID" to use the dataframe row key as title.
    Nr7   z.gzr   Zwtc                s8   g | ]0}t  j| t js0t  j| t jr|qS r   )np
issubdtypedtypesfloatingZinteger)r$   Zdt)rZ   r   r   r&     s    zWriteSDF.<locals>.<listcomp>r+   ZRowIDr@   z{:f}0.)rC   rD   r-   rE   rF   rG   r   r   extendrx   r_   rc   ZSetPropsiterrowsr   r\   SetProprv   rw   typery   formatrstripwrite)rZ   outrT   rS   Z
propertiesZ
allNumericrG   rE   writerrY   r>   pZ
cell_valuer%   r   )rZ   r   WriteSDF  sH    	






r   c                s:   t dkrddlm} | a | j fdddd|  < dS )zD
    Removes salts from mols in pandas DataFrame's ROMol column
    Nr   )SaltRemoverc                s   t |   S )N)_saltRemoverZStripMol)r   )rs   r   r   rp     rq   z&RemoveSaltsFromFrame.<locals>.<lambda>r+   )axis)r   r`   r   apply)rg   rs   r   r   )rs   r   RemoveSaltsFromFrame  s    r   r(   c             C   s   t j||d}|dkr`x>t| | dd | | D D ]\}}|d| || q4W |  n$x| | D ]}|| qjW |  dS )zs
    Saves smi file. SMILES are generated from column with RDKit molecules. Column
    with names is optional.
    )rA   r(   c             s   s   | ]}t |V  qd S )N)rD   )r$   cr   r   r   r?     s    z&SaveSMILESFromFrame.<locals>.<genexpr>r@   N)r   ZSmilesWriterzipr   r   rG   )rg   outFilers   ZNamesColrA   wrl   nr   r   r   SaveSMILESFromFrame  s    &
r   i,  i,  c          	   C   s  ddl }t| j}|| t| j}||}| }|d|d d  d}	x"|D ]}
|	d|	|
 |	d7 }	qVW d}x2| 
 D ]$\}}t }tj|| |d}|j|dd |j||d d	 ||dd
d|i d}	x|D ]}
t||
 dkr|	||	t||
 dd  n|dt||
 ks>dt||
 krr||
 tjks^||
 tjkr|||	||
  n$dt||
 kr|||	||
  |	d7 }	qW |d7 }qW |  |  dS )a  
      Saves pandas DataFrame as a xlsx file with embedded images.
      It maps numpy data types to excel cell types:
      int, float -> number
      datetime -> datetime
      object -> string (limited to 32k character - xlsx limitations)

      Cells with compound images are a bit larger than images due to excel.
      Column width weirdness explained (from xlsxwriter docs):
      The width corresponds to the column width value that is specified in Excel.
      It is approximately equal to the length of a string in the default font of Calibri 11.
      Unfortunately, there is no way to specify "AutoFit" for a column in the Excel file format.
      This feature is only available at runtime from within Excel.
      r   NzA:Ag      @r+   )r0   ZPNG)r   )ZheightrV   
image_dataobjecti }  floatintZdatetime)
xlsxwriterr   r^   rc   rJ   rx   ZWorkbookZadd_worksheetZ
set_columnZwrite_stringr~   r   r   Z
MolToImageZsaveZset_rowZinsert_imagerD   rv   naninfZwrite_numberZwrite_datetimerG   )rg   r   rs   r0   r   re   Z	dataTypesZworkbookZ	worksheetZc2r   r   _rY   r   Zimgr   r   r   SaveXlsxFromFrame  s>    





 $ r   c             K   sR   |r<|| j jkr&dd | j D |d< ndd | | D |d< tjt| | f|S )z6
    Draw grid image of mols in pandas DataFrame.
    c             S   s   g | ]}t |qS r   )rD   )r$   r   r   r   r   r&   (  s    z$FrameToGridImage.<locals>.<listcomp>Zlegendsc             S   s   g | ]}t |qS r   )rD   )r$   r   r   r   r   r&   *  s    )rB   namer   ZMolsToGridImager   )rg   columnZ
legendsColkwargsr   r   r   FrameToGridImage"  s
    r   Murcko_SMILESc                s4   |r fdd}n fdd}| j |dd| |< dS )z
    Adds column with SMILES of Murcko scaffolds to pandas DataFrame.

    Generic set to true results in SMILES of generic framework.
    c                s   t tt|   S )N)r   rK   r	   ZMakeScaffoldGenericGetScaffoldForMol)r   )rs   r   r   func6  s    zAddMurckoToFrame.<locals>.funcc                s   t t|   S )N)r   rK   r	   r   )r   )rs   r   r   r   ;  s    r+   )r   N)r   )rg   rs   Z	MurckoColZGenericr   r   )rs   r   AddMurckoToFrame.  s    r   c             C   s$   t |}t| t| | | S )zC
    Aligns mol (RDKit mol object) to scaffold (SMILES string)
    )r   rn   r   rb   Z$GenerateDepictionMatching2DStructure)r>   Zscaffoldr   r   r   AlignMolA  s    

r   c                s    | j  fdddd|  < dS )z@
    Aligns molecules in molCol to scaffolds in scaffoldCol
    c                s   t |   |  S )N)r   )r   )rs   scaffoldColr   r   rp   O  rq   z!AlignToScaffold.<locals>.<lambda>r+   )r   N)r   )rg   rs   r   r   )rs   r   r   AlignToScaffoldK  s    r   c               C   sB   yt   W n tk
r    Y nX dtkr>tjjtd< ttj_dS )z5 Monkey patch an RDKit method of Chem.Mol and pandas zChem.Mol.__ge__N)r2   ZpatchPandasr3   _originalSettingsr   r\   __ge__r   r   r   r   r   InstallPandasToolsT  s    r   c               C   sB   yt   W n tk
r    Y nX dtkr6td tj_t  dS )z0 Unpatch an RDKit method of Chem.Mol and pandas zChem.Mol.__ge__N)r2   ZunpatchPandasr3   r   r   r\   r   clearr   r   r   r   UninstallPandasTools_  s    r   c             C   s4   dd l }|j|j|j | d\}}|r0t| d S )Nr   )Zoptionflagsverbose)doctestZtestmodELLIPSISZNORMALIZE_WHITESPACEsysexit)r   r   Zfailedr   r   r   r   _runDoctestsn  s
    r   __main__c              C   sX   ddl } y
tj}W n tk
r.   tjj}Y nX | d|d d}tdd |D S )z# Get the pandas version as a tuple r   Nz[^0-9,.]r|   c             s   s   | ]}t |V  qd S )N)r   )r$   Zvir   r   r   r?     s    z$_getPandasVersion.<locals>.<genexpr>)rerO   __version__AttributeErrorversionsplittuple)r   rf   r   r   r   _getPandasVersion  s    
r   c               @   sD   e Zd Zeedkpedkddd Zeedkddd ZdS )TestCaseNzpandas/xlsxwriter not installedc             C   s:   dd l }ddlm} |j|jd}t|}t|d d S )Nr   )RDConfigzNCI/first_200.props.sdfzfoo.xlsx)osrdkitr   pathjoinZ	RDDataDirr[   r   )selfr   r   ZsdfFilerg   r   r   r   testGithub1507  s
    zTestCase.testGithub1507zpandas not installedc             C   s:   t ddgddgd}t|dd | t|jd d	S )
z& problem with update to pandas v1.2.0 ZethanolZfuranZCCOZc1ccoc1)r   ro   ro   molecule   N)rO   rP   rt   ZassertEquallenr   )r   rZ   r   r   r   testGithub3701  s    zTestCase.testGithub3701)	__name__
__module____qualname__unittestZskipIfr   rO   r   r   r   r   r   r   r     s    r   )r      z4pandas installation >=0.19 not found, skipping tests)T)r5   r6   FTNFTT)FF)rm   r6   F)Nru   )r6   NNF)r6   )r6   r(   F)r6   r   )r6   N)r6   r   F)r6   r   )N)H__doc__base64r   r   ZloggingZnumpyrv   r   r   r   r`   r   r   r   r   Zrdkit.Chem.Scaffoldsr	   r
   r   r   Zrdkit.Chem.Draw.IPythonConsoleImportErrorior   Zxml.domr   Z	getLoggerr   r   r   r   r,   r.   Z
molJustifyr   r   r    r(   r2   rL   r   r4   ZpandasrO   r[   rh   r/   Zrdkit.Avalonrk   r   rM   rt   rQ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ZrunTestsr3   mainr   r   r   r   <module>|   s   



  
2
-	


9


:



	


	


