B
    ž3Rc5×  ã               @   sl  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlZyddlmZmZ W n ek
r   dZY nX d	Ze ¡ ad
Zdxdd„Zdd„ Zdydd„Zdzdd„Zd{dd„Zd|dd„Zd}dd„Zdd„ Zd~dd„Zddd„Z d€d d!„Z!dd#d$„Z"d‚d%d&„Z#d'd(„ Z$dƒd)d*„Z%d+d,„ Z&d„d-d.„Z'd/d0„ Z(e)d1krhe%ƒ Z*e(e*ƒZ+e'd	d2 g Z,e*j-re*j.re+d Z/ed3ƒ e	e*j.e/ƒZ0e0j1d4d5e*j- d6Z2x†e2D ]VZ3e3d Z3ye, 4e 5e6e3ƒ¡¡ W n. e7k
r
   ddl8Z8e8 9¡  ed7ƒ Y nX q¸W n&ed8ƒ e:e+d d9ƒZ;e, 4e <e;¡¡ e=e,ƒsXed:ƒ e >d;¡ ned<e=e,ƒ ƒ e+d	d… Z+xðe+D ]æZ?e*j.d=kr¨e?e*_@e*jAe*jBejCd>ZDn
e Ee?¡ZDeD F¡ ZGe=e,ƒZHdgeH ZIdgeH ZJed?ƒ eKeLeD M¡ ƒƒZNeNZOx$eLeHƒD ]ZPeDZQe,eP ZRed"d	d@ y
eRjSZTW n eUk
r<   Y nX e VeT¡ e*jWsXe*jXrve*jWZYd	ZZej[eQe*jWe*dA ndBZZdBZYe\eRdCƒr°eRj]r°eYs°edDƒ edEƒ edDƒ e\eRdFƒrâeRj^râeZsâedDƒ edGƒ edDƒ eeRe*eQd	dH\ZOZNeeNeQeRe*j_dIeIeP< eNeJeP< qþW xDe*j`D ]8Zae=e*j`ƒd	krNedJƒ edKea ƒ e beHejc¡Zde beHejc¡Zee beHejc¡Zfe beHejc¡Zge beHejc¡Zhe beHejc¡Zie*jjdkr¼e beHejc¡Zki Zli Zmi ZndZox~eLeHƒD ]pZPe,eP ZReR peG¡ eJeP ZNeIeP Zqe*jrs4eeReNeQe*jseae*jteqdL\ZuZvZwZxZyZzZ{n0eR |¡ rPe=eR |¡ ƒd	 Z}neR ~¡ d	 d; Z}g Zg Z€e\e*dMƒr|e*jse\e*dNƒr–e*j‚r–g ZƒndZƒeeNeQeRe}eae*jteqee€eƒe*j_dO\ZuZvZwZxZyZzZ{eodkräe be{j„ejc¡Zoe*j…rÈxleD ]d\Z†Z‡ZˆZ‰eNe‰ ZŠe\e*dPƒr@e*jjdkr@e†e*jjkrTem ‹eŠd¡d	 emeŠ< nem ‹eŠd¡d	 emeŠ< qòW xle€D ]d\Z†Z‡ZˆZ‰eNe‰ ZŠe\e*dPƒr®e*jjdkr®e†e*jjkrÂen ‹eŠd¡d	 eneŠ< nen ‹eŠd¡d	 eneŠ< q`W e\e*dMƒr>e*jr>x`eƒD ]X\Z†Z‡ZˆZ‰eNe‰ ZŠe*jjdkr$e†e*jjkr8el ‹eŠd¡d	 eleŠ< nel ‹eŠd¡d	 eleŠ< qâW e*jjd;kr\ee{e*jjdQekeP< eoe{7 Zoe*jrr–e\e*dNƒr–e*j‚r–e#e*eNeQeƒee}d	dH e\e*dMƒ	re*j	reŒdRƒ eŒdSƒ xpeƒD ]h\ZZŽZZ‰eQeNe‰  ZeR |¡ sed; ekst‘dTe6ed; ƒe6eƒf ƒ‚eŒdUe6ed ƒeŽef ƒ qÀW xpeD ]h\ZZŽZZ‰eQeNe‰  ZeR |¡ s~ed; eks~t‘dTe6ed; ƒe6eƒf ƒ‚eŒdVe6ed ƒeŽef ƒ q2W xpe€D ]h\ZZŽZZ‰eQeNe‰  ZeR |¡ sðed; eksðt‘dTe6ed; ƒe6eƒf ƒ‚eŒdWe6ed ƒeŽef ƒ q¤W eŒdXƒ euedeP< eveeeP< ewefeP< exegeP< eyeheP< ezeieP< qØW eŒƒ  eHd	kreŒdYƒ eŒdZƒ e’eeƒeH Z“e ”e’eee“ d[ ƒeHd	  ¡Z•e –ee¡d Z—e’edƒeH Z˜e ”e’ede˜ d[ ƒeHd	  ¡Z™e’efƒeH Zše ”e’efeš d[ ƒeHd	  ¡Z›e’ehƒeH Zœe ”e’eheœ d[ ƒeHd	  ¡Ze’egƒeH Zže ”e’egež d[ ƒeHd	  ¡ZŸe’eiƒeH Z e ”e’eie  d[ ƒeHd	  ¡Z¡e˜e“ Z¢e¢eš Z£eŒd\d]e“ e£ d]e• e£ e“e•e£f ƒ ešdkreŒd^d]e“ e¢ d]e• e¢ e“e•e¢f ƒ eŒƒ  eŒd_d]eš e£ d]e› e£ eše›f ƒ eŒƒ  eŒd`ƒ eŒdad]ež d]eŸ f ƒ eŒdbd]eœ d]e f ƒ ešdkrheŒdcd]e  d]e¡ f ƒ e*jrr eddƒ e ¤eo¡eH Zoe=eoƒZ¥e ’eod¡Z¦e ’eod	¡Z§eŒƒ  xˆeLe¥ƒD ]|Z¨e§e¨ dkrÐd	e§e¨< eoe¨ Z©eded	d@ x*eLe¥ƒD ]Zªe©eª Z«edfe« d	d@ qîW edgdheoe¨e¨f  e§e¨   ƒ q¶W eded	d@ xeLe¥ƒD ]Z¨edid	d@ qLW ed=ƒ eded	d@ xLeLe¥ƒD ]@Z¨e¦e¨ dkršd	e¦e¨< edfdheoe¨e¨f  e¦e¨   d	d@ q€W ed=ƒ e*jjd;kr e’ekƒeH Z¬eke¬8 Zke ”e’ekek ƒ¡eHd	  Z­edje*jje¬e­f ƒ ndZ—eŒdkƒ eŒdle—d	 ƒ eee— Z®ede— Z¯efe— Z°e¯e® Z¢e¢e° Z£eŒdmd]e® e£ e®e£f ƒ e°dkrºeŒdnd]e® e¢ e®e¢f ƒ eŒƒ  eŒdod]e° e£ e°f ƒ eŒƒ  eŒd`ƒ eŒdpd]ege—   ƒ eŒdqd]ehe—   ƒ e°dkreŒdrd]eie—   ƒ eHd	kr‚e*jrr‚ed=ƒ eddƒ e ¤e{¡Zoe=e{ƒZ¥e ’eod¡Z¦e ’eod	¡Z§ed=ƒ xˆeLe¥ƒD ]|Z¨e§e¨ dkr†d	e§e¨< eoe¨ Z©eded	d@ x*eLe¥ƒD ]Zªe©eª Z«edfe« d	d@ q¤W edgdheoe¨e¨f  e§e¨   ƒ qlW eded	d@ xeLe¥ƒD ]Z¨edid	d@ qW ed=ƒ eded	d@ xLeLe¥ƒD ]@Z¨e¦e¨ dkrPd	e¦e¨< edfdheoe¨e¨f  e¦e¨   d	d@ q6W ed=ƒ e*j…rúedsƒ em ±¡ Z²e=e²ƒr¬edtƒ en ±¡ Z²e=e²ƒrúeduƒ x2e²D ]*Z³eDe³ Zedve6ed ƒene³ f ƒ qÌW e\e*dMƒr$e*jr$el ±¡ Z²e=e²ƒr$edwƒ x2e²D ]*Z³eDe³ Zedve6ed ƒele³ f ƒ q.W q$W q|W dS )…a(   command line utility for screening composite models

**Usage**

  _ScreenComposite [optional args] modelfile(s) datafile_

Unless indicated otherwise (via command line arguments), _modelfile_ is
a file containing a pickled composite model and _filename_ is a QDAT file.

**Command Line Arguments**

  - -t *threshold value(s)*: use high-confidence predictions for the final
     analysis of the hold-out data.  The threshold value can be either a single
     float or a list/tuple of floats.  All thresholds should be between
     0.0 and 1.0

  - -D: do a detailed screen.

  - -d *database name*: instead of reading the data from a QDAT file,
     pull it from a database.  In this case, the _datafile_ argument
     provides the name of the database table containing the data set.

  - -N *note*: use all models from the database which have this note.
               The modelfile argument should contain the name of the table
               with the models.

  - -H: screen only the hold out set (works only if a version of
        BuildComposite more recent than 1.2.2 was used).

  - -T: screen only the training set (works only if a version of
        BuildComposite more recent than 1.2.2 was used).

  - -E: do a detailed Error analysis.  This shows each misclassified
     point and the number of times it was missed across all screened
     composites.  If the --enrich argument is also provided, only compounds
     that have true activity value equal to the enrichment value will be
     used.

  - --enrich *enrichVal*: target "active" value to be used in calculating
     enrichments.

  - -A: show All predictions.

  - -S: shuffle activity values before screening

  - -R: randomize activity values before screening

  - -F *filter frac*: filters the data before training to change the
     distribution of activity values in the training set.  *filter frac*
     is the fraction of the training set that should have the target value.
     **See note in BuildComposite help about data filtering**

  - -v *filter value*: filters the data before training to change the
     distribution of activity values in the training set. *filter value*
     is the target value to use in filtering.
     **See note in BuildComposite help about data filtering**

  - -V: be verbose when screening multiple models

  - -h: show this message and exit

  - --OOB: Do out an "out-of-bag" generalization error estimate.  This only
      makes sense when applied to the original data set.

  - --pickleCol *colId*: index of the column containing a pickled value
      (used primarily for cases where fingerprints are used as descriptors)

  *** Options for making Prediction (Hanneke) Plots ***

  - --predPlot=<fileName>: triggers the generation of a Hanneke plot and
      sets the name of the .txt file which will hold the output data.
      A Gnuplot control file, <fileName>.gnu, will also be generated.

  - --predActTable=<name> (optional):  name of the database table
      containing activity values.  If this is not provided, activities
      will be read from the same table containing the screening data

  - --predActCol=<name> (optional):  name of the activity column. If not
      provided, the name of the last column in the activity table will
      be used.

  - --predLogScale (optional):  If provided, the x axis of the
      prediction plot (the activity axis) will be plotted using a log
      scale

  - --predShow: launch a gnuplot instance and display the prediction
      plot (the plot will still be written to disk).

  *** The following options are likely obsolete ***

  - -P: read pickled data.  The datafile argument should contain
     a pickled data set. *relevant only to qdat files*

  - -q: data are not quantized (the composite should take care of
     quantization itself if it requires quantized data). *relevant only to
     qdat files*



é    N)ÚDataStructs)ÚDbModule)Ú	DbConnect)ÚCompositeRun)Ú	DataUtilsÚ	SplitData)ÚImageÚ	ImageDrawé   z3.3.0c             C   s*   |rt j d|  ¡ nt j d|  ¡ dS )z¥ emits messages to _sys.stdout_
    override this in modules which import this one to redirect output

    **Arguments**

      - msg: the string to be displayed

  z%s z%s
N)ÚsysÚstdoutÚwrite)ÚmsgÚnoRet© r   ú7lib/python3.7/site-packages/rdkit/ML/ScreenComposite.pyÚmessageŠ   s    	r   c             C   s   t j d|  ¡ dS )z¥ emits messages to _sys.stderr_
    override this in modules which import this one to redirect output

    **Arguments**

      - msg: the string to be displayed

  z
ERROR: %s
N)r   Ústderrr   )r   r   r   r   Úerror™   s    	r   c             C   s„   |dk s|| j d krdS ttt| ƒƒƒ}tt| d d …|f ƒƒ}|rx| ||f | }tt| |d d …f ƒƒ}|| }ndS || S )Nr   g        )ÚshapeÚfloatÚsum)ZmatÚtgtÚnPtsZnTgtPredZ
pctCorrectZnTgtRealZ
pctOverallr   r   r   ÚCalcEnrichment¥   s    
r   c             C   s2  xbt t|ƒƒD ]R}| |¡}t|dƒrt|jƒtkri }t|dƒrZx|jD ]}	d||	< qJW ||_qW t| ƒ}
dg|
 }x¶t |
ƒD ]ª}| | }|| }|rÖg }x<t t|ƒƒD ]&}| |¡}|j |d¡sª| |¡ qªW nd}|j	|||d\}}| 
¡ r| |¡d }n|d }|||f||< |r€||ƒ q€W |S )ad   screens a set of examples through a composite and returns the
  results
#DOC

  **Arguments**

    - examples: the examples to be screened (a sequence of sequences)
       it's assumed that the last element in each example is it's "value"

    - composite:  the composite model to be used

    - callback: (optional)  if provided, this should be a function
      taking a single argument that is called after each example is
      screened with the number of examples screened so far as the
      argument.

    - appendExamples: (optional)  this value is passed on to the
      composite's _ClassifyExample()_ method.

    - errorEstimate: (optional) calculate the "out of bag" error
      estimate for the composite using Breiman's definition.  This
      only makes sense when screening the original data set!
      [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
      Statistics Technical Report (1996)]

  **Returns**

    a list of 3-tuples _nExamples_ long:

      1)  answer: the value from the example

      2)  pred: the composite model's prediction

      3)  conf: the confidence of the composite

  Ú_trainIndicesr
   Nr   )ZappendExampleZ
onlyModelséÿÿÿÿ)ÚrangeÚlenZGetModelÚhasattrÚtyper   ÚdictÚgetÚappendZClassifyExampleÚGetActivityQuantBoundsZQuantizeActivity)ÚindicesZdataSetÚ	compositeÚcallbackÚappendExamplesÚerrorEstimateÚjZtmpZtisÚvr   ÚresÚiÚidxZexampleZuseZmdlÚpredÚconfÚanswerr   r   r   ÚCollectResults³   s8    (





r2   c             C   s²   |dkrt | ||||	|
d}|dkr(g }|dkr4g }|dkr@g }xltt|ƒƒD ]\}|| \}}}||kr˜||kr„| ||||f¡ qª| ||||f¡ qN| ||||f¡ qNW dS )a°   screens a set of examples cross a composite and breaks the
      predictions into *correct*,*incorrect* and *unclassified* sets.
#DOC
  **Arguments**

    - examples: the examples to be screened (a sequence of sequences)
       it's assumed that the last element in each example is its "value"

    - composite:  the composite model to be used

    - threshold: (optional) the threshold to be used to decide whether
      or not a given prediction should be kept

    - screenResults: (optional) the results of screening the results
      (a sequence of 3-tuples in the format returned by
      _CollectResults()_).  If this is provided, the examples will not
      be screened again.

    - goodVotes,badVotes,noVotes: (optional)  if provided these should
      be lists (or anything supporting an _append()_ method) which
      will be used to pass the screening results back.

    - callback: (optional)  if provided, this should be a function
      taking a single argument that is called after each example is
      screened with the number of examples screened so far as the
      argument.

    - appendExamples: (optional)  this value is passed on to the
      composite's _ClassifyExample()_ method.

    - errorEstimate: (optional) calculate the "out of bag" error
      estimate for the composite using Breiman's definition.  This
      only makes sense when screening the original data set!
      [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
      Statistics Technical Report (1996)]

  **Notes**

    - since this function doesn't return anything, if one or more of
      the arguments _goodVotes_, _badVotes_, and _noVotes_ is not
      provided, there's not much reason to call it

  N)r'   r(   r)   )r2   r   r   r#   )r%   Údatar&   Ú	thresholdÚscreenResultsÚ	goodVotesÚbadVotesÚnoVotesr'   r(   r)   r-   r1   r/   r0   r   r   r   ÚDetailedScreený   s     -

r9   c       %      C   sl  t | ƒ}|	dkrg }	|
dkr g }
|dkr,g }t| |||||	|
||||d t |
ƒ}t |	ƒ}|| }|ržtdƒ td||dt|ƒ | ||dt|ƒ | f ƒ t |ƒ}|dkrö|rÐtd||dt|ƒ | f ƒ t dd	„ |D ƒ¡}t|ƒt|ƒ }nd
}|dkr*t dd	„ |
D ƒ¡}t|ƒt|ƒ }nd
}|dkrldd	„ |	D ƒ}t dd	„ |	D ƒ¡}t|ƒt|ƒ }ng }g }d
}|rœtƒ  td| ƒ td| ƒ t ||ftj¡}x"|D ]}|||f  d7  < q´W x*|
D ]"\}}}}|||f  d7  < qØW |rZtƒ  tdƒ | 	¡ }t |d¡}t |d¡} t
dƒ xˆt|ƒD ]|}!| |! dkr^d| |!< ||! }"t
ddd x*t|ƒD ]}#|"|# }$t
d|$ dd q|W t
dd||!|!f  | |!   ƒ qDW t
ddd xt|ƒD ]}!t
ddd qÚW t
dƒ t
ddd xLt|ƒD ]@}!||! dkr(d||!< t
dd||!|!f  ||!   dd qW t
dƒ |||||||fS )aÓ   screens the results and shows a detailed workup

  The work of doing the screening and processing the results is
  handled by _DetailedScreen()_
#DOC

  **Arguments**

    - examples: the examples to be screened (a sequence of sequences)
       it's assumed that the last element in each example is its "value"

    - composite:  the composite model to be used

    - nResultCodes: the number of possible results the composite can
      return

    - threshold: the threshold to be used to decide whether or not a
      given prediction should be kept

    - screenResults: (optional) the results of screening the results
      (a sequence of 3-tuples in the format returned by
      _CollectResults()_).  If this is provided, the examples will not
      be screened again.

    - callback: (optional)  if provided, this should be a function
      taking a single argument that is called after each example is
      screened with the number of examples screened so far as the
      argument.

    - appendExamples: (optional)  this value is passed on to the
      composite's _ClassifyExample()_ method.

    - goodVotes,badVotes,noVotes: (optional)  if provided these should
      be lists (or anything supporting an _append()_ method) which
      will be used to pass the screening results back.

    - errorEstimate: (optional) calculate the "out of bag" error
      estimate for the composite using Breiman's definition.  This
      only makes sense when screening the original data set!
      [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
      Statistics Technical Report (1996)]

  **Returns**

    a 7-tuple:

      1) the number of good (correct) predictions

      2) the number of bad (incorrect) predictions

      3) the number of predictions skipped due to the _threshold_

      4) the average confidence in the good predictions

      5) the average confidence in the bad predictions

      6) the average confidence in the skipped predictions

      7) the results table

  N)r5   r6   r7   r8   r'   r(   r)   z
	*** Vote Results ***z.misclassified: %d/%d (%%%4.2f)	%d/%d (%%%4.2f)g      Y@r   zskipped: %d/%d (%%% 4.2f)c             S   s   g | ]}|d  ‘qS )é   r   )Ú.0Úxr   r   r   ú
<listcomp>”  s    z#ShowVoteResults.<locals>.<listcomp>g        c             S   s   g | ]}|d  ‘qS )r:   r   )r;   r<   r   r   r   r=   š  s    c             S   s   g | ]}|d  ‘qS )r
   r   )r;   r<   r   r   r   r=      s    c             S   s   g | ]}|d  ‘qS )r:   r   )r;   r<   r   r   r   r=   ¡  s    z$average correct confidence:   % 6.4fz$average incorrect confidence: % 6.4fr
   z	Results Table:Ú z    )r   z % 6dz     | % 4.2fz-------z % 6.2f)r   r9   Úprintr   ÚnumpyZarrayr   ÚzerosÚintÚ	transposer   r   )%r%   r3   r&   ÚnResultCodesr4   Úverboser5   r'   r(   r6   r7   r8   r)   Ú	nExamplesÚnBadÚnGoodÚnClassifiedÚnSkipZnoConfÚavgSkipZbadConfÚavgBadZgoodResZgoodConfÚavgGoodÚvoteTabr,   Úansr0   r.   ZvTabÚ	colCountsÚ	rowCountsr-   Úrowr*   Úentryr   r   r   ÚShowVoteResults>  s    @



&*rT   ç        c
          
   C   s  |dkrg }|dkrg }|	dkr$g }	|s,d}t ||| |||||	d t|ƒ}
d}x|D ]\}}}}||7 }qVW t|ƒ}d}x|D ]\}}}}||7 }q€W t|	ƒ}d}d}d}x6|	D ].\}}}}||7 }||krØ|d7 }q²|d7 }q²W |
| | }|rþtd|ƒ |r„|| }|rÊtd|dt|ƒ | f ƒ td	|dt|ƒ | f ƒ td
|dt|ƒ | f ƒ td|dt|ƒ | f ƒ nF|rÊtd|dt|ƒ | f ƒ td|||   ƒ td||  ƒ d}d}d}|
rä||
 }|rò|| }|r || }|
|||||dfS )aÝ   screens a set of data using a composite model and prints out
             statistics about the screen.
#DOC
    The work of doing the screening and processing the results is
    handled by _DetailedScreen()_

  **Arguments**

    - composite:  the composite model to be used

    - data: the examples to be screened (a sequence of sequences)
       it's assumed that the last element in each example is its "value"

    - partialVote: (optional) toggles use of the threshold value in
      the screnning.

    - voteTol: (optional) the threshold to be used to decide whether or not a
      given prediction should be kept

    - verbose: (optional) sets degree of verbosity of the screening

    - screenResults: (optional) the results of screening the results
      (a sequence of 3-tuples in the format returned by
      _CollectResults()_).  If this is provided, the examples will not
      be screened again.

    - goodVotes,badVotes,noVotes: (optional)  if provided these should
      be lists (or anything supporting an _append()_ method) which
      will be used to pass the screening results back.


  **Returns**

    a 7-tuple:

      1) the number of good (correct) predictions

      2) the number of bad (incorrect) predictions

      3) the number of predictions skipped due to the _threshold_

      4) the average confidence in the good predictions

      5) the average confidence in the bad predictions

      6) the average confidence in the skipped predictions

      7) None

  Ng        )r5   r6   r7   r8   r   r
   zTotal N Points:z Misclassifications: %d (%%%4.2f)g      Y@zN Skipped: %d (%%%4.2f)z!	Good Votes Skipped: %d (%%%4.2f)z 	Bad Votes Skipped: %d (%%%4.2f)z)Average Correct Vote Confidence:   % 6.4fz)Average InCorrect Vote Confidence: % 6.4f)r9   r   r?   r   )r&   r%   r3   ÚpartialVoteÚvoteTolrE   r5   r6   r7   r8   rH   Z	goodAccumr,   r/   r0   r.   ZmisCountZbadAccumZnSkippedZgoodSkippedZ
badSkippedZ	skipAccumrO   ZnDataZnCountedrM   rL   rK   r   r   r   ÚScreenItÑ  sh    4

rX   c             C   s>   x8t t| ƒƒD ](}| | \}}}}||||| f| |< qW dS )aq   *Internal Use Only*

  converts a list of 4 tuples: (answer,prediction,confidence,idx) into
  an alternate list: (answer,prediction,confidence,data point)

   **Arguments**

     - votes: a list of 4 tuples: (answer, prediction, confidence,
       index)

     - data: a _DataUtils.MLData.MLDataSet_


   **Note**: alterations are done in place in the _votes_ list

  N)r   r   )Zvotesr3   r-   rO   r/   r0   r.   r   r   r   Ú_processVoteListD  s    rY   c       	      C   s°  t |dƒr|js t |dƒrð|jrðy
| j}W n tk
r>   Y nbX |rPtddd t |dƒrˆ|jrˆt |dƒrˆ|jrˆtdƒ tdƒ tdƒ tj| 	¡ |dd	\}}t |d
ƒrî|j
dkrî|rÄtddd tj||j|j
d|dd\}}||7 }|}nšt |dƒrv|jrvt |d
ƒr`|j
dkr`|r.tddd tj||j|j
dt| 	¡ ƒdd\}}| |¡ ntt| 	¡ ƒƒ}g }ntt| 	¡ ƒƒ}g }t |dƒr¨|jr¨|| }}||fS )NÚ	doHoldoutÚ
doTrainingÚsr
   )r   r)   z'*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*zL******  WARNING: OOB screening should not be combined with doHoldout option.)ZsilentÚ
filterFracg        Úfr   )ZindicesToUseZindicesOnly)r   rZ   r[   Z
_splitFracÚAttributeErrorr   r)   r   ZSplitIndicesÚGetNPtsr]   r   Z
FilterDataÚ	filterValr   ÚextendÚlist)	ÚmodelÚdetailsr3   rE   ZsplitFÚtrainIdxÚtestIdxZ	trainFiltZtempr   r   r   ÚPrepareDataFromDetailsZ  sF    

rh   c
       6      C   sˆ  |dkr.t |dƒr&|j|jtjd}n| ¡ }|jdkr@d|_nd|_t| ƒtt	gkr\| f} t
| ƒ}
|dk	r|||
| ¡  ƒ t |
tj¡}t |
tj¡}t |
tj¡}t |
tj¡}t |
tj¡}t |
tj¡}d}|dkràg }|dkrìg }|dkrøg }|	dkrdg|
 }	i }i }x"t|
ƒD ]}|
dkr<g }g }g }| | }y
|j}W n tk
rd   Y nX t |¡ t |dƒr„|js˜t |dƒrÌ|jrÌt |dƒr²|jr²d	}nd
}d	}tj|||d nd
}d
}t |dƒr|jr|stdƒ tdƒ tdƒ t |dƒr8|jr8|s8tdƒ tdƒ tdƒ t|||ƒ\}}| ¡ d }|rr||| ¡  fdd„}nd}t |dƒrŠ|jsd}nd}t||||d |jd||||||d\}}} }!}"}#}$|dkrât |$jtj¡}t |dƒrÒ|jrÒxl|D ]d\}%}&}'}(||( })t |dƒrJ|j dkrJ|%|j kr^| !|)d¡d ||)< n| !|)d¡d ||)< qüW xl|D ]d\}%}&}'}(||( })t |dƒr¸|j dkr¸|%|j krÌ| !|)d¡d ||)< n| !|)d¡d ||)< qjW ||$7 }|||< |||< | ||< |!||< |"||< |#||< t |dƒr |j dkr t"|$|j d|	|< q W |
dkrV||| |!|"|#|$fS ||
 }t#|ƒ|
 }*t $t#||* d ƒ|
d  ¡}+t#|ƒ|
 },t $t#||, d ƒ|
d  ¡}-t#|ƒ|
 }.t $t#||. d ƒ|
d  ¡}/t#|ƒ|
 }0t $t#||0 d ƒ|
d  ¡}1t#|ƒ|
 }2t $t#||2 d ƒ|
d  ¡}3t#|ƒ|
 }4t $t#||4 d ƒ|
d  ¡}5|,|-f|*|+f|.|/f|2|3f|0|1f|4|5f|fS dS )aK    Screens a set of data using a a _CompositeRun.CompositeRun_
       instance to provide parameters

# DOC

    The actual data to be used are extracted from the database and
    table specified in _details_

    Aside from dataset construction,  _ShowVoteResults()_ does most of
    the heavy lifting here.

  **Arguments**

      - model: a composite model

      - details:  a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - callback: (optional)  if provided, this should be a function
        taking a single argument that is called after each example is
        screened with the number of examples screened so far as the
        argument.

      - setup: (optional) a function taking a single argument which is
        called at the start of screening with the number of points to
        be screened as the argument.

      - appendExamples: (optional)  this value is passed on to the
        composite's _ClassifyExample()_ method.

      - goodVotes,badVotes,noVotes: (optional)  if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.


  **Returns**

    a 7-tuple:

      1) the number of good (correct) predictions

      2) the number of bad (incorrect) predictions

      3) the number of predictions skipped due to the _threshold_

      4) the average confidence in the good predictions

      5) the average confidence in the bad predictions

      6) the average confidence in the skipped predictions

      7) the results table

  NÚ	pickleCol)ri   ÚpickleClassg        r
   r   ÚshuffleActivitiesÚrandomActivitiesTF)ÚshuffleÚ
runDetailsÚ_shuffleActivitiesz'*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*zD******  WARNING: Shuffled model being screened with unshuffled data.Ú_randomizeActivitieszB******  WARNING: Random model being screened with non-random data.c             S   s   || | ƒS )Nr   )r<   ÚyÚzr   r   r   Ú<lambda>  ó    z#ScreenFromDetails.<locals>.<lambda>r)   r   )rE   r'   r(   r6   r7   r8   r)   ÚerrorAnalysisÚ	enrichTgt)r   r:   )%r   Ú
GetDataSetri   r   ÚExplicitBitVectr4   rV   r    rc   Útupler   r`   r@   rA   r   r   Ú_randomSeedr_   r   ÚInitRandomNumbersrk   rl   ÚRandomizeActivitiesro   r   rp   rh   ÚGetQuantBoundsr)   rT   r   ru   rv   r"   r   r   Úsqrt)6Úmodelsre   r'   Zsetupr(   r6   r7   r8   r3   ÚenrichmentsÚnModelsrH   rG   rJ   ÚconfGoodÚconfBadÚconfSkiprN   ÚbadVoteDictÚ
noVoteDictr-   rd   Úseedrm   Ú	randomizerf   rg   Z	nPossibleÚcbr)   ÚgÚbr\   ÚaGÚaBÚaSÚvTÚaÚpÚcr.   ÚlabelÚavgNBadÚdevNBadÚavgNGoodÚdevNGoodÚavgNSkipÚdevNSkipÚ
avgConfBadÚ
devConfBadÚavgConfGoodÚdevConfGoodÚavgConfSkipÚdevConfSkipr   r   r   ÚScreenFromDetails‡  sè    8








r    c             C   sX  t sd S yt| ƒt|ƒ t|ƒ }W nD tk
rh   | d } |d }|d }t| ƒt|ƒ t|ƒ }Y nX |srd S d}d}d}t| ƒ| }t|ƒ| }	t|ƒ| }
|d kr®d}t d|d¡}t |¡}dd|d d |d d f}d	}t||d
  ƒ}|j||||d |}t||	d
  ƒ}|j||||d |}t||
d
  ƒ}|j||||d |S )Nr   )éd   r¡   éÿ   )r¢   r¡   r¡   )r¢   r¢   r¡   )r¡   r¡   ZRGB)r¢   r¢   r¢   r
   i¦ÿÿÿih  )Zfill)	ÚhasPilr   Ú	TypeErrorr   Únewr	   ZDrawrB   Zpieslice)rH   rG   ÚnRejÚsizeZnTotZ	goodColorZbadColorZrejColorZpctGoodZpctBadÚpctRejÚimgZdrawZboxZstartPZendPr   r   r   ÚGetScreenImageT  s@    
rª   Ú.c          	   C   s8  t | ƒtkrd}nd}|r,dg}| d¡ ng }| d¡ |	s~t| ||ƒ}|r~|rbd |df¡}nd}| |¡ | d| ¡ t|ƒ}t |d¡}t |d¡}t 	|tj
¡}t 	|tj
¡}xXt|ƒD ]L}|| rìt
|||f ƒ||  ||< || rÄt
|||f ƒ||  ||< qÄW | d	¡ | d
¡ x t|ƒD ]}| d| ¡ q2W | d¡ | d¡ xât|ƒD ]Ö}| d| ¡ x†t|ƒD ]z}||krÈ|s°| d|||f  ¡ n| d|||f  ¡ n4|sæ| d|||f  ¡ n| d|||f  ¡ q„W | dd||   ¡ |dkr2| d| ¡ n
| d¡ qhW | d¡ x(t|ƒD ]}| dd||   ¡ qVW | d¡ | d| ¡ | d¡ |sŒ||  | }||  }|rÊdt
|ƒ | }nd}| d|| | |f ¡ |dkr@dt
|ƒ | | |  }| d|||f ¡ dt
|ƒ | }| d|||f ¡ | dkrX| d| ¡ |dkrp| d| ¡ |dkr | d | ¡ nt|d | d  |d  }|d | d  }|d | d  }|rôdt
|d ƒ | }dt
|d ƒ | }nd}d}| d!|d |d ||||f ¡ |dkr¸dt
|d ƒ | }dt
|d ƒ | }| d"|d |d |||f ¡ dt
|d ƒ | }dt
|d ƒ | }| d#|d |d |||f ¡ | dkrÐ| d$| ¡ |dkrè| d%| ¡ |dkr | d&| ¡ | d'¡ |
rd(}| |¡ |r.| d)¡ d* |¡S )+a   returns the text of a web page showing the screening details
#DOC
    **Arguments**

     - nGood: number of correct predictions

     - nBad:  number of incorrect predictions

     - nRej:  number of rejected predictions

     - avgGood: average correct confidence

     - avgBad:  average incorrect confidence

     - avgSkip: average rejected confidence

     - voteTable: vote table

     - imgDir: (optional) the directory to be used to hold the vote
       image (if constructed)

   **Returns**

     a string containing HTML

  r
   r   z<html><body>z&<center><h2>VOTE DETAILS</h2></center>z<font>ú/z	votes.pngz<center><img src="%s"></center>z<center><table border=1>z<tr><td></td>z<th>%d</th>z<th>% Accurate</th>z</tr>z<tr><th>%d</th>z<td bgcolor="#A0A0FF">%d</td>z<td bgcolor="#A0A0FF">%.2f</td>z<td>%d</td>z<td>%.2f</td>z<td>%4.2f</td</tr>g      Y@z"<th rowspan=%d>Predicted</th></tr>z<tr><th>% Pure</th>z<td>%4.2f</td>z)<tr><td></td><th colspan=%d>Original</th>z</table></center>g        z1<p>%d of %d examples were misclassified (%%%4.2f)z.<p>                %d of %d overall: (%%%4.2f)z,<p>%d of %d examples were rejected (%%%4.2f)zG<p>The correctly classified examples had an average confidence of %6.4fzI<p>The incorrectly classified examples had an average confidence of %6.4fz;<p>The rejected examples had an average confidence of %6.4fzH<p>%.2f(%.2f) of %.2f(%.2f) examples were misclassified (%%%4.2f(%4.2f))z=<p>                %.2f(%.2f) of %d overall: (%%%4.2f(%4.2f))z;<p>%.2f(%.2f) of %d examples were rejected (%%%4.2f(%4.2f))zM<p>The correctly classified examples had an average confidence of %6.4f(%.4f)zO<p>The incorrectly classified examples had an average confidence of %6.4f(%.4f)zA<p>The rejected examples had an average confidence of %6.4f(%.4f)z</font>a	  
    <p><b>Definitions:</b>
    <ul>
    <li> <i>% Pure:</i>  The percentage of, for example, known positives predicted to be positive.
    <li> <i>% Accurate:</i>  The percentage of, for example, predicted positives that actually
      are positive.
    </ul>
    z</body></html>Ú
)r    ry   r#   rª   ÚjoinZsaver   r@   r   rA   r   r   )rH   rG   r¦   rM   rL   rK   Z	voteTableZimgDirZfullPageZskipImgZincludeDefsZ
multModelsZoutTxtr©   ZimgFileNameZnPossZ
pureCountsZ	accCountsZpureVectZaccVectr-   r*   ZnTotalZnClassZpctErrr¨   ZdevClassZ	devPctErrZ	devPctRejZtxtr   r   r   ÚScreenToHtml|  sÖ    

 





















r¯   c       #         s®  t | dƒr| jsdS |r tdƒ t| jdƒ}td| j dƒ}	‡ ‡fdd„|D ƒ}
t| j| j| j| jd}| 	¡ }|ˆ }t | d	ƒr”| j
r”| j
| jkrš|}nt| j| j
| j| jd}|r¾td
ƒ t|
d ƒtdƒtdƒgkrèdd„ |
D ƒ}
tjgt|
ƒ }t | dƒr| jr| j}n| 	¡ d }d|d |¡f }|jd||f ||
d}|rXtdƒ dgt|
ƒ }x(|D ] }|\}}|
 |¡}|||< qlW | d| ¡ xP|D ]H\}}}}|| }|dkrÌt|ƒ}nd}| d|
| |||f ¡ q¤W xP|D ]H\}}}}|| }|dkrt|ƒ}nd}| d|
| |||f ¡ qöW | ¡  t | dƒr^| jsd|}nd| }| dd¡}dt|f }|	 |¡ g }xPt|ƒD ]D}t | dƒr´| jsÊ| d| j|f ¡ n| d| j|f ¡ qœW |	 dd |¡ ¡ d }|	 |¡ |	 ¡  t | d!ƒrª| jrªy`ydd"lm}  W n tk
rR   td#ƒ‚Y nX | ƒ }!|!d$t ¡  ƒ |!d%| j ƒ t d&ƒ W n& t!k
r¨   ddl"}"|" #¡  Y nX dS )'a¥  

  **Arguments**

    - details:  a CompositeRun.RunDetails object

    - indices: a sequence of integer indices into _data_

    - data: the data set in question.  We assume that the ids for
      the data points are in the _idCol_ column

    - goodVotes/badVotes: predictions where the model was correct/incorrect.
      These are sequences of 4-tuples:
        (answer,prediction,confidence,index into _indices_)

  ÚpredPlotNz*
-> Constructing Prediction (Hanneke) Plotzw+z%s.gnuc                s   g | ]}ˆ | ˆ ‘qS r   r   )r;   r<   )r3   ÚidColr   r   r=   =  s    z MakePredPlot.<locals>.<listcomp>)ÚuserZpasswordÚpredActTablez	-> Pulling Activity Datar   r>   c             S   s   g | ]}t |ƒ‘qS r   )Ústr)r;   r<   r   r   r   r=   P  s    Ú
predActColr   z
%s in (%s)ú,z%s,%s)ÚfieldsÚwhereÚextrasz	-> Creating Plotz#ID Pred Conf %s
ÚNonez%s %d %.4f %f
ÚpredLogScalezlog(%s)Ú_ú zð# Generated by ScreenComposite.py version: %s
  set size square 0.7
  set yrange [:1]
  set data styl points
  set ylab 'confidence'
  set xlab '%s'
  set grid
  set nokey
  set term postscript enh color solid "Helvetica" 16
  set term X
  z'%s' us 4:($2==%d?$3:0/0)z#'%s' us (log10($4)):($2==%d?$3:0/0)zplot %s
z
  # EOF
  ÚpredShow)ÚGnuplotz)Functionality requires the Gnuplot modulezcd "%s"zload "%s.gnu"zpress return to continue...
)$r   r°   r   Úopenr   ÚdbNameÚ	tableNameZdbUserZ
dbPasswordZGetColumnNamesr³   r    r   ZplaceHolderr   rµ   r®   ÚGetDataÚindexr   r   Úcloser»   ÚreplaceÚ__VERSION_STRINGr   r#   r¾   r¿   ÚImportErrorÚosÚgetcwdÚinputÚ	ExceptionÚ	tracebackÚ	print_exc)#re   r%   r3   r6   r7   ÚnResr±   rE   ZoutFZgnuFZptIdsZorigConnZcolNamesZidNameZactConnZwhereLZ
actColNameZwhereTxtZrawDZactsrS   ZIDZactr.   rO   r/   r0   ZactLabelZgnuHdrZplotsr-   ZgnuTailr¿   r‘   rÍ   r   )r3   r±   r   ÚMakePredPlot$  s˜    






 

 

rÐ   c             C   s   d S )Nr   )re   r   r   r   ÚGo  s    rÑ   c             C   sF   | d krt } t | ¡ dg| _d| _d| _d| _d| _d| _d| _	| S )Ng        r   )
Ú_detailsr   ÚSetDefaultsÚscreenVoteTolÚdetailedScreenrZ   r[   ru   rE   rV   )re   r   r   r   rÓ   ¡  s    
rÓ   c               C   s   t tƒ t d¡ dS )z\ prints a list of arguments for when this is used from the
  command line and then exits

  r   N)r?   Ú__doc__r   Úexitr   r   r   r   ÚUsage°  s    rØ   c             C   s,   t dt ƒ | r(t dƒ t d tj¡ƒ dS )z- prints the version number of the program

  z%This is ScreenComposite.py version %szcommand line was:r½   N)r?   rÇ   r®   r   Úargv)ÚincludeArgsr   r   r   ÚShowVersion¹  s    rÛ   c       
      C   sö  dd l }y0|  tjdd … ddddddd	d
dg¡\}}W n* tk
rb   dd l}| ¡  tƒ  Y nX d| _d| _d| _	d| _
d| _d| _d| _d| _xF|D ]<\}}|dkr¶|| _qœ|dkrÆd| _qœ|dkr<d| _t|ƒ}t|ƒtg ƒtdƒgk rþ|g}x4|D ],}|dks|dk rtdƒ t d¡ qW || _qœ|dkrN|| _qœ|dkrfd| _d| _qœ|dkr~d| _d| _qœ|dkr–d| _d| _qœ|dkr®d| _d| _qœ|dkrÀd| _qœ|dkrÒd| _qœ|dkrätƒ  qœ|dkrút|ƒ| _qœ|dkrt|ƒ| _ qœ|dkr d}	qœ|dkr8d| _|| _qœ|d krJ|| _qœ|d!kr\|| _	qœ|d"krnd| _
qœ|d#kr€d| _qœ|d#kr’d| _qœ|d$kr¤d| _qœ|d%kr¾t!|ƒd | _qœ|d&krÔt!|ƒ| _qœtƒ  qœW t"|ƒdk ròtƒ  |S )'Nr   r
   zEDd:t:VN:HThSRF:v:AXz	predPlot=zpredActCol=zpredActTable=r»   r¾   ZOOBz
pickleCol=zenrich=r>   r   z-dz-Dz-t)r
   r
   z(Voting threshold must be between 0 and 1éþÿÿÿz-Nz-Hz-Tz-Ez-Az-Sz-Rz-hz-Fz-vz-Vz
--predPlotz--predActColz--predActTablez--predLogScalez
--predShowz--OOBz--pickleColz--enrich)#Úgetoptr   rÙ   rÌ   rÍ   rÎ   rØ   r°   rµ   r³   r»   r¾   r)   ri   rv   rÁ   rÕ   rV   Úevalr    r   r×   rÔ   Únoter[   rZ   ru   ÚshowAllrk   rl   r   r]   ra   rB   r   )
re   rÝ   Úargsr¹   rÍ   ÚargÚvalrW   ÚtolrE   r   r   r   Ú	ParseArgsÃ  s¬    






















rå   Ú__main__)rÚ   z"-> Retrieving models from databaserd   zwhere note='%s')r·   r¸   zModel load failedz-> Loading modelÚrbzNo composite models foundr   z-> Working with %d models.r>   )ri   rj   z'-> Constructing and screening data sets)r   )rm   rn   Fro   z'*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*zD******  WARNING: Shuffled model being screened with unshuffled data.rp   zB******  WARNING: Random model being screened with non-random data.)rE   )r)   zC
-----*****-----*****-----*****-----*****-----*****-----*****-----
zTolerance: %f)rE   r5   rà   r°   )rE   r5   r7   r8   r6   r)   rv   )r   z1-v-v-v-v-v-v-v-    All Votes      -v-v-v-v-v-v-v-z>id, prediction, confidence, flag(-1=skipped,0=wrong,1=correct)zbad point?: %s != %sz%s, %d, %.4f, 1z%s, %d, %.4f, 0z%s, %d, %.4f, -1z -^-^-^-^-^-^-^-  -^-^-^-^-^-^-^-z0-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*z	AVERAGES:r:   z9Misclassifications: 	%%%5.2f(%%%5.2f)   %4.1f(%4.1f) / %dr¡   z1	threshold: 	%%%5.2f(%%%5.2f)   %4.1f(%4.1f) / %dz0Number Skipped: %%%4.2f(%%%4.2f)    %4.2f(%4.2f)zConfidences:z	Correct: 	%4.2f(%4.2f)z	Incorrect: 	%4.2f(%4.2f)z	Skipped: 	%4.2f(%4.2f)zResults Table:z    z % 6.2fz     | % 4.2fg      Y@z-------z&   Enrichment of value %d: %.4f (%.4f)z0------------------------------------------------zBest Model: z&Misclassifications: 	%%%5.2f   %d / %dz	threshold: 	%%%5.2f   %d / %dzNumber Skipped: %%%4.2f    %dz	Correct: 	%4.2fz	Incorrect: 	%4.2fz	Skipped: 	%4.2fz2
*-*-*-*-*-*-*-*- ERROR ANALYSIS -*-*-*-*-*-*-*-*
z ---> Bad Vote Countsz ---> Skipped Compound Countsz%s,%dz ---> Good Vote Counts)r   )r
   )Nr   r   )r   NNNNNr   r   )r
   NNr   NNNr   )r   rU   r
   NNNN)r   )NNr   NNNNN)N)r«   r
   r   r
   )r   r   )N)r   )´rÖ   rÉ   r   r@   Zrdkitr   Zrdkit.Dbaser   Zrdkit.Dbase.DbConnectionr   Zrdkit.MLr   Zrdkit.ML.Datar   r   ÚpickleZPILr   r	   rÈ   r£   rÒ   rÇ   r   r   r   r2   r9   rT   rX   rY   rh   r    rª   r¯   rÐ   rÑ   rÓ   rØ   rÛ   rå   Ú__name__re   r¹   r   rß   rÁ   ZtblNameZconnrÃ   ZblobsZblobr#   Úloadsr´   rÌ   rÍ   rÎ   rÀ   Z	modelFileÚloadr   r×   ZfNamerÂ   rw   ri   rx   r3   ZBuildDataSetZGetVarNamesZ	descNamesr   r5   ZdataSetsrc   r   r`   rg   rf   ZmodelIdxZtmpDrd   rz   r‡   r_   r{   rk   rl   rm   rˆ   r|   r   ro   rp   r)   rÔ   rä   rA   r   rH   rG   rJ   r‚   rƒ   r„   rv   r€   ZgoodVoteDictr…   r†   rN   ZSetInputOrderZ	screenResrÕ   rV   rE   rŠ   r‹   r\   rŒ   r   rŽ   r   r$   rÏ   r}   r7   r8   rà   r°   r6   r   ru   r   r‘   r’   r.   r“   r"   r?   rO   r/   r0   ZptÚAssertionErrorr   r”   r~   r•   ZargsortZbestIdxr–   r—   r˜   r™   rš   r›   rœ   r   rž   rŸ   rI   rF   rC   rD   rP   rQ   r-   rR   r*   rS   ZmeanZdevZbestBadZbestGoodZbestSkipÚkeysZksÚkr   r   r   r   Ú<module>n   sš  



J 
@  
  
r
- 
 M
( 
 (
y
	

\











   
"

 
&*




&*


"

