B
    3RcM                 @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ ddlZe ZdZd	Zd
d Zd'ddZdd Zd(ddZd)ddZdd Zd*ddZdd Zedkreejdk re  deje_ee ee ed	d eeZ ee Z!e!d	krxe"e!D ]bZ#ej$%de#d	 e!f  eee e# d	dZ&ej'rxej(rxeee&Z)ne&gZ)xe)D ]Z*e*+  qW ej,reej-Z.e.d	krej,Z,e)d /e, nPxNe"e.D ]BZ#e0d ej-e#  Z1e)e# Z2d!ej,3d"d e1f Z,e2/e, qW ej4r,ej5r,ed#ej5ej4f  eej-d	krTed$ x8e"ee)D ](Z#e6e)e# e_2ej7ej5ej4d% qbW q,W nVe!d	kreee d d	dZ&ej'rej(reee&Z)ne&gZ)xe)D ]Z*e*+  qW ej,rpeej-Z.e.d	kr ej,Z,e)d /e, nPxNe"e.D ]BZ#e0d ej-e#  Z1e)e# Z2d!ej,3d"d e1f Z,e2/e, q*W ej4rej5red#ej5ej4f  ee)d	kred$ xBe"ee)D ](Z#e6e)e# e_2ej7ej5ej4d% qW ned& dS )+a   command line utility for growing composite models

**Usage**

  _GrowComposite [optional args] filename_

**Command Line Arguments**

  - -n *count*: number of new models to build

  - -C *pickle file name*:  name of file containing composite upon which to build.

  - --inNote *note*: note to be used in loading composite models from the database
      for growing

  - --balTable *table name*:  table from which to take the original data set
     (for balancing)

  - --balWeight *weight*: (between 0 and 1) weighting factor for the new data
     (for balancing). OR, *weight* can be a list of weights

  - --balCnt *count*: number of individual models in the balanced composite
     (for balancing)

  - --balH: use only the holdout set from the original data set in the balancing
     (for balancing)

  - --balT: use only the training set from the original data set in the balancing
     (for balancing)

  - -S: shuffle the original data set
     (for balancing)

  - -r: randomize the activities of the original data set
     (for balancing)

  - -N *note*: note to be attached to the grown composite when it's saved in the
     database

  - --outNote *note*: equivalent to -N

  - -o *filename*: name of an output file to hold the pickled composite after
     it has been grown.
     If multiple balance weights are used, the weights will be added to
     the filenames.

  - -L *limit*: provide an (integer) limit on individual model complexity

  - -d *database name*: instead of reading the data from a QDAT file,
     pull it from a database.  In this case, the _filename_ argument
     provides the name of the database table containing the data set.

  - -p *tablename*: store persistence data in the database
     in table *tablename*

  - -l: locks the random number generator to give consistent sets
     of training and hold-out data.  This is primarily intended
     for testing purposes.

  - -g: be less greedy when training the models.

  - -G *number*: force trees to be rooted at descriptor *number*.

  - -D: show a detailed breakdown of the composite model performance
     across the training and, when appropriate, hold-out sets.

  - -t *threshold value*: use high-confidence predictions for the final
     analysis of the hold-out data.

  - -q *list string*:  Add QuantTrees to the composite and use the list
     specified in *list string* as the number of target quantization
     bounds for each descriptor.  Don't forget to include 0's at the
     beginning and end of *list string* for the name and value fields.
     For example, if there are 4 descriptors and you want 2 quant bounds
     apiece, you would use _-q "[0,2,2,2,2,0]"_.
     Two special cases:
       1) If you would like to ignore a descriptor in the model building,
          use '-1' for its number of quant bounds.
       2) If you have integer valued data that should not be quantized
          further, enter 0 for that descriptor.

  - -V: print the version number and exit

    N)	DbConnect)CompositeRun)ScreenCompositeBuildComposite)AdjustComposite)	DataUtils	SplitDataz0.5.0   c             C   s   t rtjd|   dS )z emits messages to _sys.stdout_
    override this in modules which import this one to redirect output

    **Arguments**

      - msg: the string to be displayed

  z%s
N)_verbosesysstdoutwrite)msg r   5lib/python3.7/site-packages/rdkit/ML/GrowComposite.pymessaget   s    	r   c       )      C   s  t  | _|dkr|| j }| jdkr0|d | _| jdkrFt|}n6| j	g kr`|| _| 
 }ntj| j|| j| j| jd}|j}t| | jdkrtj|d| d n| jdkrtj|d| d | }|}	t|	}
td|
  td	t|	d d
   | }|j}ttd|d }| jrddlm}m} | j	g krRddlm} |j }nddlm!} |j"}|j#}|j}|r|$|%  |j&|	|dg| ||| j'| j(| j)d|| j	| j*| j+|t, d n0ddl-m} |j#}|j&|	|dg| | j'|dd |.  |/  |0 \}}}t12|}t12|}|% |_3x&tt|D ]}|| 4|j3 q<W || }t5|t5| }|| }|| }t16|| }t5|t5| }t,rtdd| d| f  | j7r|j8|	dd g }| j9s@t,rtd t:;|||}t,r*tdt|dt<t| t<t| f  t<t|t| t=_>| j9rt,rVtd t?@t|A |||d | jB}|\} }!}"}#}$}"}%t|}&| |! }'t<|!|' t=_>|#t=_C|$t=_DtE|%t=_F|'|& }(|(dkrt<|(|& t=_G|S )a   does the actual work of building a composite model

    **Arguments**

      - details:  a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - composite: the composite model to grow

      - progressCallback: (optional) a function which is called with a single
        argument (the number of models built so far) after each model is built.

      - saveIt: (optional) if this is nonzero, the resulting model will be pickled
        and dumped to the filename specified in _details.outName_

      - setDescNames: (optional) if nonzero, the composite's _SetInputOrder()_ method
        will be called using the results of the data set's _GetVarNames()_ method;
        it is assumed that the details object has a _descNames attribute which
        is passed to the composites _SetDescriptorNames()_ method.  Otherwise
        (the default), _SetDescriptorNames()_ gets the results of _GetVarNames()_.

      - data: (optional) the data set to be used.  If this is not provided, the
        data set described in details will be used.

    **Returns**

      the enlarged composite model


  N z.pkl)Z	quantNameuserZpasswordr	   )shuffle
runDetailsr   zTraining with %d examplesz	%d descriptors   )CrossValidate	PruneTree)BuildQuantTree)ID3)buildDriverprunernTriespruneIt
lessGreedyneedsQuantizationZtreeBuilderZnQuantBoundsstartAtZmaxDepthprogressCallbacksilent)r   )r   r   r    z># Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2fg      Y@)verbosezTesting all examplesz)%d examples (%% %5.2f) were misclassifiedz
Entire data set:)HtimeZasctimeZrundate	tableNamestripoutNamedbNamer   ZBuildQuantDataSetqBounds
GetDataSetZDBToQuantDataZ
qTableNameZdbUserZ
dbPassword_randomSeedInitRandomNumbersshuffleActivitiesRandomizeActivitiesrandomActivitiesGetNamedDatalenr   ZGetNVarsnPossibleValslistrangeuseTreesZrdkit.ML.DecTreer   r   r   ZQuantTreeBootr   ZID3BootZCrossValidationDriverZSetInputOrderGetVarNamesZGrownModelsr   r   r!   
limitDepthr
   Zrdkit.ML.NeuralZAverageErrorsZ
SortModelsZ
GetAllDatanumpyZarrayZ	_varNamesZ	NameModelsumZsqrtZ
bayesModelZTraindetailedResr   Ztestallfloat_runDetailsZoverall_errorr   ZShowVoteResultsZGetNPts	thresholdZoverall_correct_confZoverall_incorrect_confreprZoverall_result_matrixZoverall_fraction_dropped))details	compositer"   ZsaveItsetDescNamesdatafNameZseednamedExamplestrainExamplesZ	nExamplesZnVarsr4   Zattrsr   r   r   Zbuilderr   Zdriverr   Z	modelListZcountsZavgErrsiZweightedErrsZ
averageErrZdevsZavgDevZbadExamplesZwrongZresTupZnGoodZnBad_ZavgGoodZavgBadZvoteTabZnPtsZnClassZnRejr   r   r   GrowIt   s    














(

rK   c             C   s   g }| j r^| jr^t| j| j }|jdd| j d}xH|D ] }|d }|tt| q8W n| j	r||t
t| j	d |S )NZMODELzwhere note='%s')Zfieldswherer   rb)persistTblNameinNoter   r*   ZGetDataappendpickleloadsstrcomposFileNameloadopen)rB   resZconnZmdlsrowZrawDr   r   r   GetComposites  s    
rY   c                s  | j r| j t|kr|S td |dkrhtd | j }| j}|| _| j}| j| _|  }|| _|| _|dkrt|S |j	| _
|j| _t| j | jdkrtj|d| d n| jdkrtj|d| d |  | js| jrtjt | j
dd\}} fdd	|D  fd
d	|D }	| jdkrntj| j| jddd\}}
fdd	|D }|	fdd	|
D 7 }	|| jr|	 }	n }dd	 | D }d}|dkrtd |  }|dkr|S |j	| _
|j| _t| j | jdkrtj|d| d n| jdkrtj|d| d | }dd	 | D }d}g }| j}t|ttfs\|f}x:|D ]2}td|  |t j!||||| j ||d qbW |S )a   balances the composite using the parameters provided in details

   **Arguments**

     - details a _CompositeRun.RunDetails_ object

     - composite: the composite model to be balanced

     - data1: (optional) if provided, this should be the
       data set used to construct the original models

     - data2: (optional) if provided, this should be the
       data set used to construct the new individual models

  zBalancing CompositeNz	Reading First Data Setr	   )r   r   r   )r#   c                s   g | ]} | qS r   r   ).0x)rG   r   r   
<listcomp>I  s    z$BalanceComposite.<locals>.<listcomp>c                s   g | ]} | qS r   r   )rZ   r[   )rG   r   r   r\   J  s    g        r%   )ZindicesOnlyc                s   g | ]} | qS r   r   )rZ   r[   )rH   r   r   r\   N  s    c                s   g | ]} | qS r   r   )rZ   r[   )rH   r   r   r\   O  s    c             S   s   g | ]}|  qS r   )upper)rZ   r[   r   r   r   r\   V  s    z	Reading Second Data Setc             S   s   g | ]}|  qS r   )r]   )rZ   r[   r   r   r   r\   i  s    z	Balancing with Weight: %.4f)Znames1Znames2)"balCntr3   r   balTabler(   r'   r*   balDbr,   Z
_splitFracZ	splitFracr-   Z
randomSeedr   r.   r/   r0   r1   r2   balDoHoldout
balDoTrainr   ZSplitIndicesZ
filterFracZ
FilterDataZ	filterValr8   	balWeight
isinstancetupler5   rP   r   BalanceComposite)rB   rC   Zdata1Zdata2rF   Ztmpr*   ZtrainIdxZtestIdxZtestExamplesZtempZdataSet1Zcols1ZdataSet2Zcols2rW   Zweightsweightr   )rG   rH   r   rf     s    





rf   c             C   s,   t dt  | r(t d t dtj dS )z prints the version number

  z#This is GrowComposite.py version %szcommand line was: N)print__VERSION_STRINGjoinr   argv)includeArgsr   r   r   ShowVersiony  s    rn   c               C   s   t t td dS )zM provides a list of arguments for when this is used from the command line

  r%   N)ri   __doc__r   exitr   r   r   r   Usage  s    rq   c             C   s   | dkrt } t| S )a    initializes a details object with default values

      **Arguments**

        - details:  (optional) a _CompositeRun.CompositeRun_ object.
          If this is not provided, the global _runDetails will be used.

      **Returns**

        the initialized _CompositeRun_ object.


  N)r?   r   SetDefaults)r   r   r   r   rr     s    rr   c             C   s  ddl }| tjdd dddddd	d
ddg\}}d| _d| _d| _d| _d| _d| _d| _	d| _
x|D ]\}}|dkrt|| _ql|dkr|| _ql|dkr|| _ql|dkrt|| _t| jttfs| jf| _ql|dkrt|| _ql|dk rd| _ql|dkrd| _	ql|dkr"|| _
ql|dkr4|| _ql|dksH|dkrP|| _ql|dkrb|| _ql|dkrt|| _ql|dkrd| _ql|dkrd| _ql|dkrt  ql|dkrd| _ql|d krd| _ql|d!krt|| _ql|d"kr|| _ql|d#krd| _ql|d$krt|| _ql|d%kr0d| _ql|d&krFt|| _ql|d'kr~t|}t|ttfspt d(|| _!|| _"ql|d)krt|}t#|t#g t#d*gkst d+|| _$|| _%ql|d,krt&  t'd qlt(d-|tj)d. t  qlW |d | _*| j
s| j| _
dS )/z parses command line arguments and updates _runDetails_

      **Arguments**

        - runDetails:  a _CompositeRun.CompositeRun_ object.

  r   Nr	   z)P:o:n:p:b:sf:F:v:hlgd:rSTt:Q:q:DVG:L:C:N:zinNote=zoutNote=z	balTable=z
balWeight=zbalCnt=ZbalHZbalTzbalDb=r   )g      ?z-nz-Cz
--balTablez--balWeightz--balCntz--balHz--balTz--balDbz--inNotez-Nz	--outNotez-oz-pz-rz-Sz-hz-lz-gz-Gz-dz-Tz-tz-Dz-Lz-qz4bad argument type for -q, specify a list as a stringz-Qr   z4bad argument type for -Q, specify a list as a stringz-Vzbad argument:)file)+getoptr   rl   rO   rT   r_   rc   r^   ra   rb   r`   intr9   evalrd   re   r5   Znoter)   rN   r1   r/   rq   Z
lockRandomr   r!   r*   r7   r>   r@   r=   r:   AssertionErrorZqBoundCountr+   typeZactivityBoundsZactivityBoundsValsrn   rp   ri   stderrr'   )r   rt   argsZextraargvalr+   r   r   r   	ParseArgs  s    






















r}   __main__r   rh   )rm   zT---------------------------------
	Doing %d of %d
---------------------------------
)rD   d   z	%s.%d.pklz.pklzUpdating results table %s:%szDWARNING: updating results table with models having different weights)ZdbtablezNo models found)Nr	   r   N)NN)r   )N)8ro   r   r&   r;   Zrdkit.Dbase.DbConnectionr   Zrdkit.MLr   r   r   Zrdkit.ML.Compositer   Zrdkit.ML.Datar   r   rQ   r?   rj   r
   r   rK   rY   rf   rn   rq   rr   r}   __name__r3   rl   rk   cmdZ
initModelsr9   r6   rI   ry   r   rC   r_   r^   Z
compositesZmdlZClearModelExamplesr)   rc   ZnWeightsZPickleru   rg   ZmodelsplitrN   r*   dumpsZStorer   r   r   r   <module>^   s   
 
^


b





"



