B
    ž3Rcp?  ã            
   @   sæ  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ ddlZdZd	d
„ Zdejfdd„Zejfdd„Zdgg dfdd„ZdTdd„ZdUdd„Zdd„ ZedkrâddlZy e ejdd… dd¡\ZZW n, ek
r   ddlZe ¡  eƒ  Y nX e	 	¡ Z e	 !e ¡ dge _"de _#de _$de _%de _&de _'de _(de _)de _*de _+de _,de _-x¾eD ]´\Z.Z/e.dkr–e/e _0e.dkrªe/e _%q|e.dks¾e.d krèe1e/ƒe _"e2e j"e3e4fƒs0e j"fe _"q|e.d!kr e5e/ƒe _6q|e.d"kre/e _7q|e.d#kr(e/e _8q|e.d$kr:de _9nöe.d%krRde _#de _$nÞe.d&krjde _#de _$nÆe.d'kr€e5e/ƒe _:n°e.d(kr–e5e/ƒe _;nše.d)kr¨e/e _&nˆe.d*krºde _'nve.d+krÔe<e/ƒd e _(n\e.d,kræde _)nJe.d-krøde _*n8e.d.kr
de _,n&e.d/kr e<e/ƒe _-ne.d0kr|e/e _+q|W e j0rFe j%sTeƒ  e=d1ƒ ed2ƒ ej>e j0e j%ej?ej@e j(ejAd3ZBeB C¡ ZDeB E¡ ZFed4eF ƒ e GeFd5fej5¡ZHe GeFejI¡ZJdgeF ZKg ZLe j8rØee j0e j8ƒZMed6ƒ eM N¡ ZOeO Pd7e j8e j7f ¡ ed8ƒ yeO Q¡ ZRW n ek
r2   dZRY nX xœeRrÐed9eSeLƒ ƒ eRd ZRyeL Te UeVeRƒ¡¡ W n. ek
rš   ddlZe ¡  e=d:ƒ Y n
X ed;ƒ yeO Q¡ ZRW n ek
rÊ   dZRY nX q6W dZOnbx`eD ]XZWye XeYeWd<ƒ¡ZZW n2 ek
r(   ddlZe=d=eW ƒ e ¡  Y nX eL TeZ¡ qÞW eSeLƒZ[i Z\d>geSeLƒ Z]x@e^eLƒD ]2\Z_ZZeBZ`y
eZjaZbW n eck
rŽ   Y nX e deb¡ e j9r°ejee`dd? efeZd@ƒr0e j$sÌe j#r0ejge` E¡ eZjhddA\ZiZje j:dBkrejke`e j;e j:deiddC\ZlZmejem7 ZjelZie j#r@eiej ZjZine4ene` E¡ ƒƒZjedDeSejƒ ƒ eeZeDe`e j"eje j)dE\ZoZpedFƒ eepe j*e j6dGZqe j+r¨eYdHe j+e_d f dIƒZrndZrxÔe^eqƒD ]È\ZsZteqes Ztetd eKes< eHes d  etd 7  < eHes d  etd5 7  < e\ uesg ¡Zvev Tetd ¡ eve\es< eJes  d7  < errLer wdJetd  ¡ etd eod5 kr¶etd5 e]e_ k r¶etd5 e]e_< q¶W edKe]e_  ƒ q`W e j&r¸ee eHeJe\e[eodL ne[dkrÌe=dMƒ ne=dNƒ dZsxêeseFk rÂeJes dkrÂe[dkrpe xe\es ¡\ZyZzej{ezeSe\es ƒdOdPZ|e=dQesd eHes d eJes  e|eHes d eJes  eJes eVeKes ƒf ƒ nFe=dResd eHes d eJes  eHes d eJes  eJes eVeKes ƒf ƒ esd7 ZsqÚW e xe]¡\ZyZze=dSeyezf ƒ dS )Va÷  Command line tool to construct an enrichment plot from saved composite models

Usage:  EnrichPlot [optional args] -d dbname -t tablename <models>

Required Arguments:
  -d "dbName": the name of the database for screening

  -t "tablename": provide the name of the table with the data to be screened

  <models>: file name(s) of pickled composite model(s).
     If the -p argument is also provided (see below), this argument is ignored.

Optional Arguments:
  - -a "list": the list of result codes to be considered active.  This will be
        eval'ed, so be sure that it evaluates as a list or sequence of
        integers. For example, -a "[1,2]" will consider activity values 1 and 2
        to be active

  - --enrich "list": identical to the -a argument above.

  - --thresh: sets a threshold for the plot.  If the confidence falls below
          this value, picking will be terminated

  - -H: screen only the hold out set (works only if a version of
        BuildComposite more recent than 1.2.2 was used).

  - -T: screen only the training set (works only if a version of
        BuildComposite more recent than 1.2.2 was used).

  - -S: shuffle activity values before screening

  - -R: randomize activity values before screening

  - -F *filter frac*: filters the data before training to change the
     distribution of activity values in the training set.  *filter frac*
     is the fraction of the training set that should have the target value.
     **See note in BuildComposite help about data filtering**

  - -v *filter value*: filters the data before training to change the
     distribution of activity values in the training set. *filter value*
     is the target value to use in filtering.
     **See note in BuildComposite help about data filtering**

  - -p "tableName": provides the name of a db table containing the
      models to be screened.  If you use this argument, you should also
      use the -N argument (below) to specify a note value.

  - -N "note": provides a note to be used to pull models from a db table.

  - --plotFile "filename": writes the data to an output text file (filename.dat)
    and creates a gnuplot input file (filename.gnu) to plot it

  - --showPlot: causes the gnuplot plot constructed using --plotFile to be
    displayed in gnuplot.

é    N)ÚDataStructs)ÚRDConfig)Ú	DbConnect)ÚCompositeRun)Ú	DataUtilsÚ	SplitDataÚStatsz2.4.0c             C   s   | |k d p| |kd S )Néÿÿÿÿé   © )Zt1Zt2r   r   ú2lib/python3.7/site-packages/rdkit/ML/EnrichPlot.pyÚcmpU   s    r   c             C   s&   |r|  d|  ¡ n|  d|  ¡ dS )z­ emits messages to _sys.stderr_
      override this in modules which import this one to redirect output

      **Arguments**

        - msg: the string to be displayed

    z%s z%s
N)Úwrite)ÚmsgZnoRetÚdestr   r   r   ÚmessageY   s    	r   c             C   s   t j d|  ¡ dS )z­ emits messages to _sys.stderr_
      override this in modules which import this one to redirect output

      **Arguments**

        - msg: the string to be displayed

    z
ERROR: %s
N)ÚsysÚstderrr   )r   r   r   r   r   Úerrorh   s    	r   r
   c             C   s`  |   |¡ x`tt| ƒƒD ]P}|  |¡}t|dƒrt|jtƒsi }t|dƒrbx|jD ]}	d||	< qRW ||_qW g }
|  ¡ r~d}nd}|s–t	tt|ƒƒƒ}d}x¼|D ]´}|ræg }x<tt| ƒƒD ]&}|  |¡}|j 
|d¡sº| |¡ qºW nd}|| }| j||d\}}|r|  |dd… ¡}|d }||kr6|d7 }||kr |
 |d |||f¡ q W ||
fS )aÖ   collects the results of screening an individual composite model that match
      a particular value

     **Arguments**

       - mdl: the composite model

       - descs: a list of descriptor names corresponding to the data set

       - data: the data set, a list of points to be screened.

       - picking: (Optional) a list of values that are to be collected.
         For examples, if you want an enrichment plot for picking the values
         1 and 2, you'd having picking=[1,2].

      **Returns**

        a list of 4-tuples containing:

           - the id of the point

           - the true result (from the data set)

           - the predicted result

           - the confidence value for the prediction

    Ú_trainIndicesr
   r   N)Z
onlyModelsr	   )ZSetInputOrderÚrangeÚlenZGetModelÚhasattrÚ
isinstancer   ÚdictZGetQuantBoundsÚlistÚgetÚappendZClassifyExampleZQuantizeActivity)ZmdlÚdescsÚdataÚpickingÚindicesÚerrorEstimateÚjZtmpZtisÚvÚresZ
needsQuantÚnTrueActivesÚiZuseZptÚpredÚconfZtrueResr   r   r   ÚScreenModelt   sD    






r*   c             C   sx   |r|   dd„ ¡ g }d}d}xTtt| ƒƒD ]D}| | \}}}	}
|
|kr,|	|krX|d7 }|d7 }| |||f¡ q,W |S )a    Accumulates the data for the enrichment plot for a single model

      **Arguments**

        - predictions: a list of 3-tuples (as returned by _ScreenModels_)

        - thresh: a threshold for the confidence level.  Anything below
          this threshold will not be considered

        - sortIt: toggles sorting on confidence levels


      **Returns**

        - a list of 3-tuples:

          - the id of the active picked here

          - num actives found so far

          - number of picks made so far

    c             S   s   t |d | d ƒS )Né   )r   )ÚxÚyr   r   r   Ú<lambda>Ó   ó    z"AccumulateCounts.<locals>.<lambda>r   r
   )Úsortr   r   r   )ZpredictionsÚthreshÚsortItr%   ZnCorrectÚnPtsr'   ZIDÚrealr(   r)   r   r   r   ÚAccumulateCountsº   s    r5   r	   c             C   s  t | dƒr| jsd S d| j }t|dƒ}d}xÚ|t|ƒk r|| dkr|dkr¾t || ¡\}	}
tj|
t|| ƒdd}| d|d || d ||  || d ||  || |f ¡ n>| d	|d || d ||  || d ||  || f ¡ |d7 }q.W | ¡  d
| j }t|dƒ}dt	 }t
||d |dkrRt
d| |d t
d|d |dkr˜|d }t
d| d|d t
d||f |d nt
d| |d | ¡  t | dƒr| jry*ddlm} |ƒ }|d| ƒ tdƒ W n& tk
r   dd l}| ¡  Y nX d S )NÚplotFilez%s.datzw+r   r
   éZ   )Úlevelz%d %f %f %d %f
z%d %f %f %d
z%s.gnuzõ# Generated by EnrichPlot.py version: %s
  set size square 0.7
  set xr [0:]
  set data styl points
  set ylab 'Num Correct Picks'
  set xlab 'Num Picks'
  set grid
  set nokey
  set term postscript enh color solid "Helvetica" 16
  set term X
  )Úfilezset yr [0:%d]zplot x with linesé   z!replot "%s" using 1:2 with lines,Ú )Úendr9   z)"%s" every %d using 1:2:5 with yerrorbarszreplot "%s" with pointsÚshowPlot)ÚGnuplotz	load "%s"zpress return to continue...
)r   r6   Úopenr   r   Ú
MeanAndDevÚGetConfidenceIntervalr   ÚcloseÚ__VERSION_STRINGÚprintr=   r>   ÚinputÚ	ExceptionÚ	tracebackÚ	print_exc)ÚdetailsÚfinalÚcountsÚ	pickVectsÚnModelsÚ	nTrueActsZdataFileNameZoutFr'   Ú_ÚsdÚconfIntervalZplotFileNameZgnuFZgnuHdrZeveryGapr>   ÚprG   r   r   r   ÚMakePlotâ   sJ    

00



rS   c               C   s   t j t¡ t  d¡ dS )z$ displays a usage message and exits r	   N)r   r   r   Ú__doc__Úexitr   r   r   r   ÚUsage  s    rV   Ú__main__zd:t:a:N:p:cSTHF:v:)
zthresh=z	plotFile=r=   z
pickleCol=ZOOBZnoSortz	pickBase=ÚdoROCz
rocThresh=zenrich=r;   z-dz-tz-az--enrichz--threshz-Nz-pz-Sz-Hz-Tz-Fz-vz
--plotFilez
--showPlotz--pickleColz--OOBz--noSortz--doROCz--rocThreshz
--pickBasez2*******Please provide both the -d and -t argumentszBuilding Data set
)ÚuserZpasswordÚ	pickleColZpickleClassz	npts: %d
é   z"-> Retrieving models from databasez$select model from %s where note='%s'z-> Reconstructing modelsz Building model %dzModel failedz  <-DoneÚrbzproblems with model %s:g    „×—A)ZshuffleÚ
_splitFrac)Zsilentg        )ZindicesToUseZindicesOnlyzscreening %d examples)r    r!   r"   Zaccumulating)r2   r1   z%s.%d.pickszw+z%s
zHalfway point: %d
)rN   zH#Index	Avg_num_correct	Conf90Pct	Avg_num_picked	Num_picks	last_selectionz>#Index	Avg_num_correct	Avg_num_picked	Num_picks	last_selectionr7   )r8   z%d	%f	%f	%f	%d	%sz%d	%f	%f	%d	%szHalfway point: %.2f(%.2f))r   r
   )r	   )}rT   r   ZnumpyZrdkitr   r   Zrdkit.Dbase.DbConnectionr   Zrdkit.MLr   Zrdkit.ML.Datar   r   r   ÚpicklerC   r   r   r   r   r*   r5   rS   rV   Ú__name__ZgetoptÚargvÚargsZextrasrF   rG   rH   rI   ZSetDefaultsZ	activeTgtZ
doTrainingZ	doHoldoutZdbTableNamer6   r=   rZ   r"   r2   ZpickBaserX   Z	rocThreshÚargÚvalZdbNameÚevalr   Útupler   ÚfloatZ	thresholdZnoteZpersistTblNameZshuffleActivitiesZ
filterFracZ	filterValÚintrD   ZDBToDataZdefaultDBUserZdefaultDBPasswordZExplicitBitVectZdataSetZGetVarNamesr   ZGetNPtsr3   ZzerosrJ   ZintegerrK   ZselPtsZmodelsZconnZ	GetCursorZcursZexecuteZfetchoneZblobr   r   ÚloadsÚstrZ	modelNameÚloadr?   ZmodelrM   rL   Z
halfwayPtsÚ	enumerateZ
whichModelZtmpDZ_randomSeedZseedÚAttributeErrorZInitRandomNumbersZRandomizeActivitiesr   ZSplitIndicesr]   ZtrainIdxZtestIdxZ
FilterDataZ	trainFiltZtempr   r&   Z	screenResZrunningCountsZpickFiler'   Úentryr   r$   r   r@   ZmeanrP   rA   rQ   r   r   r   r   Ú<module>B   s~  F
(
6





























$


,.