B
    ž3RcÎ²  ã               @   sR  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dl
mZ d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZ d dl Z d dl m!Z! d dl"Z#d dl$Z$d dl%T d dlm&Z& dZ'dsdd„Z(G dd„ de)ƒZ*dtdd„Z+dudd„Z,G dd „ d ƒZ-e.d!krNd"d#„ Z/d$Z0ej1e0d%Z2e2j3d&d'd(d)d*d+d, e2j3d-d.d/d0d1d2 e2j3d3d.d4d5d1d2 e2j3d6d.d7d8d1d2 e2j3d9d'd(d:d;d+d, e2j3d<d'd(d=d>d?d, e2j3d@d'dAdBdCdDd, e2j3dEd'd(dFdGd+d, e2j3dHd'd(dIdJd+d, e2j3dKd'd(dLdMd+d, e2j3dNd.dOdOd1d2 e2j3dPd'd(dQdRd+d, e2 4¡ \Z5Z6e5j7 8dS¡Z9e:e9ƒdkrÞe:e9d  ƒd krÞe;dTƒ e;e0ƒ e <dU¡ n\e:e9ƒdVkr:e:e9d  ƒd kr:e:e9d ƒd kr:e= 7e9d  e9d ¡Z>e;dWe> ƒ e <d ¡ e5j? 8dX¡Z@e:e@ƒdkr^e@d  Z?ne@d Z?e5j?d+kr˜e5jAd+kr˜e;dYƒ e;dZƒ e <dU¡ e5jBsÂe5jCsÂe;d[ƒ e;dZƒ e <dU¡ e-e?e5jDd\ZEe5jFeE_Fe5jAd+kre;d]e5jA ƒ eE Ge5jA¡ZHeHsDe/d^ƒ e <dU¡ n,e;d_e? ƒ eE I¡ ZHeHsDe/d`ƒ e <dU¡ eE J¡ ZHeHsde/daƒ e <dU¡ e5jCrŒeE K¡ ZHeHsŒe/dbƒ e <dU¡ eE L¡ ZHeHs¬e/dcƒ e <dU¡ e5jMd+kr.eE Ne5jM 8dS¡¡ZHeHsâe/ddƒ e <dU¡ e;deƒ x8eOe:eEjPƒƒD ]&ZQeE ReQ¡\ZSZTe;dfeQeSeTf ƒ qúW e <d ¡ eE Ue5jV¡ZHeHsRe/dgƒ e <dU¡ eE W¡ ZHeHsre/dhƒ e <dU¡ eE X¡ ZHeHs’e/diƒ e <dU¡ eE Y¡ ZHeHs²e/djƒ e <dU¡ eEjZdkdldm eEjZdndodm xDeOe:eEjPƒƒD ]2ZQdpe?e5jVeQf Z[eE \e[eQ¡ e;dqeQe[f ƒ qÞW x8eOe:eEjPƒƒD ]&ZQeE ReQ¡\ZSZTe;dreQeSeTf ƒ q$W dS )vé    N)ÚChem)ÚAllChem)ÚSDWriter)ÚDescriptors)ÚMoleculeDescriptors)Úinterp)Ústats)Úcross_validation)ÚRandomForestClassifier)Úmetrics)Útrain_test_split)Ú	roc_curveÚauc)Úprecision_scoreÚrecall_score)Úpreprocessing)Ú	Unpickler)Ú*)Úmake_scorerak  %(kind)s Kappa Coefficient
--------------------------------
Kappa %(kappa)6.4f
ASE %(std_kappa)6.4f
%(alpha_ci)s%% Lower Conf Limit %(kappa_low)6.4f
%(alpha_ci)s%% Upper Conf Limit %(kappa_upp)6.4f

Test of H0: %(kind)s Kappa = 0

ASE under H0 %(std_kappa0)6.4f
Z %(z_value)6.4f
One-sided Pr > Z %(pvalue_one_sided)6.4f
Two-sided Pr > |Z| %(pvalue_two_sided)6.4f
é   é   c             C   s@   t t| ƒƒ}t t ||  ¡¡dk r0|d| fS | d|  fS dS )a  helper function for creating result string for int or float

only dec=1 and width=4 is implemented

Parameters
----------
x : int or float
value to format
dec : 1
number of decimals to print if x is not an integer
width : 4
width of string

Returns
-------
xint : int or float
x is converted to int if it is within 1e-14 of an integer
x_string : str
x formatted as string, either '%4d' or '%4.1f'
g›+¡†›„=z%4dz%4.1fN)ÚintÚroundÚnpÚmaxÚabs)ÚxZdecÚwidthZxint© r   ú share/RDKit/Contrib/pzc/p_con.pyÚint_ifclose:   s    r    c               @   s   e Zd Zdd„ Zdd„ ZdS )ÚKappaResultsc             K   sÂ   |   |¡ d| kr*d| d< tdƒd | d< t | d ¡| d< t | d ¡| d	< | d
 | d	  | d< tj | d ¡| d< | d d | d< tj | d ¡| d  }| d
 | | d< | d
 | | d< d S )NZalphagš™™™™™™?g     ÀW@r   Zalpha_ciÚ	var_kappaÚ	std_kappaÚ
var_kappa0Z
std_kappa0ÚkappaZz_valueZpvalue_one_sidedé   Zpvalue_two_sidedZ	kappa_lowZ	kappa_upp)Úupdater    r   Zsqrtr   ZnormZsfZisf)ÚselfÚkwdsZdeltar   r   r   Ú__init__X   s    
zKappaResults.__init__c             C   s   t |  S )N)Úkappa_template)r(   r   r   r   Ú__str__j   s    zKappaResults.__str__N)Ú__name__Ú
__module__Ú__qualname__r*   r,   r   r   r   r   r!   V   s   r!   Tc              C   sò  t  | t¡} t  | ¡ ¡ }|  ¡ }| | }|}t  |¡}|  d¡| }	|  d¡| }
|
|	dd…df  }t  | ¡ d¡s|t‚t  |¡ ¡ }|dkr˜|dkr˜d}|| | d|  }|r°|d|	|
 d|   d  }| ¡ }||
dd…df |	 d  }t  | jd ¡}d|||f< d| d | ¡  }||d|   d }|| | d| d  | }|
|	 |
|	  }||d  | ¡  }|d| d |  }n|dkr²t  | jd ¡}d}t  |t¡}|j	dkrf|dkrt  
|dd…df | ¡|d |d   }n`|d	kr<|dd…df | d |d |d  d  }n(|d
kr\ddlm} ||ƒ}ntdƒ‚n | j\}}| j|jkr†tdƒ‚d||   ¡ | ||  ¡   }|r°t j}t j}d| }|
|  d¡}|	dd…df |  d¡}||
 |	dd…df   ¡ }||||dd…df  d|   d  }dd| d |  }| ¡ ||d|   d  }||9 }|
|	dd…df  }||||dd…df   d   ¡ }||d 8 }||9 }t  |	|
¡ ¡ | d|  }|rêt||||||d}|S |S dS )a‡
  Compute Cohen's kappa with variance and equal-zero test

Parameters
----------
table : array_like, 2-Dim
square array with results of two raters, one rater in rows, second
rater in columns
weights : array_like
The interpretation of weights depends on the wt argument.
If both are None, then the simple kappa is computed.
see wt for the case when wt is not None
If weights is two dimensional, then it is directly used as a weight
matrix. For computing the variance of kappa, the maximum of the
weights is assumed to be smaller or equal to one.
TODO: fix conflicting definitions in the 2-Dim case for
wt : None or string
If wt and weights are None, then the simple kappa is computed.
If wt is given, but weights is None, then the weights are set to
be [0, 1, 2, ..., k].
If weights is a one-dimensional array, then it is used to construct
the weight matrix given the following options.

wt in ['linear', 'ca' or None] : use linear weights, Cicchetti-Allison
actual weights are linear in the score "weights" difference
wt in ['quadratic', 'fc'] : use linear weights, Fleiss-Cohen
actual weights are squared in the score "weights" difference
wt = 'toeplitz' : weight matrix is constructed as a toeplitz matrix
from the one dimensional weights.

return_results : bool
If True (default), then an instance of KappaResults is returned.
If False, then only kappa is computed and returned.

Returns
-------
results or kappa
If return_results is True (default), then a results instance with all
statistics is returned
If return_results is False, then only kappa is calculated and returned.

Notes
-----
There are two conflicting definitions of the weight matrix, Wikipedia
versus SAS manual. However, the computation are invariant to rescaling
of the weights matrix, so there is no difference in the results.

Weights for 'linear' and 'quadratic' are interpreted as scores for the
categories, the weights in the computation are based on the pairwise
difference between the scores.
Weights for 'toeplitz' are a interpreted as weighted distance. The distance
only depends on how many levels apart two entries in the table are but
not on the levels themselves.

example:

weights = '0, 1, 2, 3' and wt is either linear or toeplitz means that the
weighting only depends on the simple distance of levels.

weights = '0, 0, 1, 1' and wt = 'linear' means that the first two levels
are zero distance apart and the same for the last two levels. This is
the sampe as forming two aggregated levels by merging the first two and
the last two levels, respectively.

weights = [0, 1, 2, 3] and wt = 'quadratic' is the same as squaring these
weights and using wt = 'toeplitz'.

References
----------
Wikipedia
SAS Manual

r   r   NZSimpler&   ZWeighted)ZcaZlinearNéÿÿÿÿ)ZfcZ	quadraticÚtoeplitz)r1   zwt option is not knownzweights are not squareg      ð?)Úkindr%   Ú	kappa_maxÚweightsr"   r$   )r   ÚasarrayÚfloatZdiagÚsumZallcloseÚAssertionErrorÚarangeÚshapeÚndimr   Zscipy.linalgr1   Ú
ValueErrorÚnanZminimumr!   ) Útabler4   Zreturn_resultsZwtZagreeZnobsZprobsZfreqsZ
probs_diagZfreq_rowZfreq_colZprob_expZ	agree_expr2   r%   Zterm_aZterm_bZd_idxZterm_cr"   r$   r1   ZrowsZcolsÚwZw_rowZw_colZ
agree_wexpZfacZfreqser3   Zresr   r   r   Úcohens_kappan   s†    I







 ($


r@   c       
      C   s¢   t  | ¡} | j\}}|dkrZt j|  ¡ dd\}}t|ƒ}| | j¡}t  |d ¡d }n*t  |¡r|t  |d ¡d }| }n|}| }t  	||f| ¡}	|	d |fS )aZ  convert raw data with shape (subject, rater) to (rater1, rater2)

    brings data into correct format for cohens_kappa

    Parameters
    ----------
    data : array_like, 2-Dim
        data containing category assignment with subjects in rows and raters
        in columns.
    bins : None, int or tuple of array_like
        If None, then the data is converted to integer categories,
        0,1,2,...,n_cat-1. Because of the relabeling only category levels
        with non-zero counts are included.
        If this is an integer, then the category levels in the data are already
        assumed to be in integers, 0,1,2,...,n_cat-1. In this case, the
        returned array may contain columns with zero count, if no subject
        has been categorized with this level.
        If bins are a tuple of two array_like, then the bins are directly used
        by ``numpy.histogramdd``. This is useful if we want to merge categories.

    Returns
    -------
    arr : nd_array, (n_cat, n_cat)
        Contingency table that contains counts of category level with rater1
        in rows and rater2 in columns.

    Notes
    -----
    no NaN handling, delete rows with missing values

    This works also for more than two raters. In that case the dimension of
    the resulting contingency table is the same as the number of raters
    instead of 2-dimensional.

    NT)Zreturn_inverser   g      à?r   )
r   r5   r:   ÚuniqueZravelÚlenZreshaper9   ZisscalarZhistogramdd)
ÚdataZbinsZn_rowsZn_colsZcat_uniZcat_intZn_catZdata_Zbins_Zttr   r   r   Úto_table  s    %


rD   c               @   sž   e Zd ZdZdi fdd„Zdd„ Zdd„ Zd	d
„ Zdd„ Zdd„ Z	d'dd„Z
dd„ Zdd„ Zdd„ Zd(dd„Zdd„ Zd)dd„Zd*d!d"„Zd#d$„ Zd%d&„ ZdS )+Úp_conznClass to create Models to classify Molecules active or inactive
    using threshold for value in training-dataNc             C   s(   ||dœ| _ || _|| _g | _d| _dS )z8Constructor to initialize Object, use proxy if necessary)Úacc_idÚproxyFN)Úrequest_datarF   rG   ÚmodelÚverbous)r(   rF   rG   r   r   r   r*   Q  s
    zp_con.__init__c             C   sF   t | jƒ| jd< d}x(| j ¡ D ]}|d|| j| f 7 }q W | ¡ S )z String-Representation for ObjectZ
cmpd_countÚ z%s: %s
)rB   Ú
sd_entriesrH   ÚkeysÚrstrip)r(   Z	retStringÚkeyr   r   r   r,   Y  s
    zp_con.__str__c                sÆ  dd„ ‰ | j  d¡dkr8tjd | j ¡| jd ¡ | _n i | _i | jd< | j | jd d< | jd d | _| jd d | j	d	< tjd
 | jd d ¡| jd ¡ | _
d}d}d}d}d}i | _d}t| j
d ƒ}xL‡ fdd„| j
d D ƒD ].}|d dkr$tj dt|ƒ d t|ƒ d ¡ n|d d dkrBtj d¡ tj ¡  |d7 }|d dkrl|d7 }qêt d|d ¡r˜|d dkrÒ|d7 }qên:t d|d ¡r¶|d7 }qênt d|d ¡rê|d7 }nqêtjd |d ¡| jd ¡ | _| jd  d! }	|	|d"< || j|< |d7 }qêW | jd  ¡ }
d}g | _xŠtt| jƒƒD ]x}| j| }t t|d" ƒ¡}t |¡ | d#t|ƒ¡ |d7 }x&|
D ]}| t|ƒt|| ƒ¡ qŽW | j |¡ qFW d$S )%zYDownload Compound-Data for self.acc_id, these are available in self.sd_entries afterwardsc             S   s&   yt | ƒ dS  tk
r    dS X dS )zCheck for proper Float-ValueTFN)r6   r<   )r   r   r   r   Úlooks_like_numberd  s
    z7p_con.step_0_get_chembl_data.<locals>.looks_like_numberZCHEMBLr0   z6https://www.ebi.ac.uk/chemblws/targets/uniprot/{}.json)ZproxiesÚtargetZchemblIdÚ	chembl_idz<https://www.ebi.ac.uk/chemblws/targets/{}/bioactivities.jsonr   Zbioactivitiesc                s   g | ]}ˆ |d  ƒr|‘qS )Úvaluer   )Ú.0Úrecord)rP   r   r   ú
<listcomp>‡  s    z0p_con.step_0_get_chembl_data.<locals>.<listcomp>éd   úú/z >          <é
   ú|r   ÚorganismzHomo sapiensÚIC50Úbioactivity_typeÚunitsZnMZKiZ
Inhibitionz0https://www.ebi.ac.uk/chemblws/compounds/{}.jsonÚingredient_cmpd_chemblidZcompoundÚsmilesÚSmilesÚ_NameT)rF   ÚfindÚrequestsÚgetÚformatrG   ÚjsonZtarget_datarR   rH   Zbioactivity_dataZdrrB   ÚsysÚstdoutÚwriteÚstrÚflushÚreÚsearchZ	cmpd_datarM   rL   Úranger   ZMolFromSmilesr   ZCompute2DCoordsÚSetPropÚappend)r(   Z	ic50_skipZki_skipZ	inhb_skipÚcountZnon_homoÚir   ZbioactivityZ	my_smilesZSDtagsZcpd_counterÚentryÚcpdÚtagr   )rP   r   Ústep_0_get_chembl_dataa  sx    

"&





zp_con.step_0_get_chembl_datac             C   sl   g }x\| j D ]R}tj|dd}g }x|D ]}| | ¡ ¡ q(W | t|ƒ¡}|| }| |¡ qW || _ dS )z@remove all smaller Fragments per compound, just keep the largestT)ZasMols)rL   r   ZGetMolFragsrr   ZGetNumAtomsÚindexr   )r(   Úresultrv   Z	fragmentsZlist_cpds_fragsizeZfragZlargest_frag_indexZlargest_fragr   r   r   Ústep_1_keeplargestfrag¶  s    
zp_con.step_1_keeplargestfragc             C   s¤   g }i }xH| j D ]>}t |¡ tj|dd}|| ¡ kr@g ||< ||  |¡ qW xF| ¡ D ]:}t|| ƒdkr\|| d  d|¡ | || d ¡ q\W || _ dS )z&remove duplicates from self.sd_entriesT)Ú	canonicalr   r   Úcansmirdkit)rL   r   ÚRemoveHsÚMolToSmilesrM   rr   rB   rq   )r(   rz   Zall_struct_dictrv   Úcansmiru   r   r   r   Ústep_2_remove_duplÆ  s    
zp_con.step_2_remove_duplc          	   C   s  t jdd}dd„ }dd„ }g }i }xT| jD ]J}d| ¡ kr`t |¡ tj|dd	}| d|¡ t| 	d¡ƒ}i ||< q,W xN| jD ]D}t| 	d¡ƒ}y||  
|¡ W q‚ tk
rÄ   |g||< Y q‚X q‚W x"|D ]}t||| ƒƒ}	||| ƒ\}
}|| d
  dt|
ƒ¡ || d
  d|	¡ t|	ƒdt|
ƒ  }t|	ƒdt|
ƒ  }t|
dƒdkrp| 
|| d
 ¡ qÒ|
t|	ƒkr¸g }xh|| D ]&}| 
| 	d¡¡ td|||	|
ƒ qŒW qÒt  |¡|k sìt  |¡|krÚqÒ| 
|| d
 ¡ qÒW || _t j|d |d |d |d d dS )zµmerge IC50 of duplicates into one compound using mean of all values if:
        min(IC50) => IC50_avg-3*IC50_stddev && max(IC50) <= IC50_avg+3*IC50_stddev && IC50_stddev <= IC50_avgÚignore)Úinvalidc          	   S   sb   d}d}xH| D ]@}y|t | d¡ƒ7 }W q tk
rL   td| d¡ƒ Y qX qW |t| ƒ }|S )Nr   rS   zno IC50 reportedrc   )r6   ÚGetPropÚ	ExceptionÚprintrB   )Úmol_listr]   ÚIC50_avgZblar   r   r   Úget_mean_IC50Þ  s    
z.p_con.step_3_merge_IC50.<locals>.get_mean_IC50c          	   S   sl   g }xP| D ]H}y|  tt| d¡ƒdƒ¡ W q
 tk
rP   td| d¡ƒ Y q
X q
W tj|dd}||fS )NrS   r&   zno IC50 reportedrc   r   )Zddof)rr   r   r6   r„   r…   r†   r   Ústd)r‡   Ú	IC50_listÚmolÚIC50_stddevr   r   r   Úget_stddev_IC50é  s    
z0p_con.step_3_merge_IC50.<locals>.get_stddev_IC50r}   T)r|   r   Úvalue_stddevrS   é   r   g        rc   zstddev larger than meanÚoverÚdividerƒ   Úunder)r‘   r’   rƒ   r“   )r   ZseterrrL   ÚGetPropNamesr   r~   r   rq   rl   r„   rr   r…   r6   r   r†   Úminr   )r(   Znp_old_settingsr‰   rŽ   rz   Z	IC50_dictrv   r€   ru   rˆ   r   r‹   ZminimumvalueZmaximumvalueZrunawaylistÚer   r   r   Ústep_3_merge_IC50Ù  sN    

 zp_con.step_3_merge_IC50rS   c             C   s†   g }d\}}xX| j D ]N}t| |¡ƒt|ƒkrD| dd¡ |d7 }n| dd¡ |d7 }| |¡ qW || _ | jr‚td||f ƒ dS )ztset Property "TL"(TrafficLight) for each compound:
        if ic50_tag (default:"value") > threshold: TL = 0, else 1)r   r   ÚTLÚ0r   Ú1z## act: %d, inact: %dT)rL   r6   r„   rq   rr   rJ   r†   )r(   Z	thresholdZic50_tagrz   rt   Újrv   r   r   r   Ústep_4_set_TL  s    
zp_con.step_4_set_TLc          )   C   s¬   ddddddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)g)d*g }g }x@| j D ]6}| ¡ }x|D ]}||krz| |¡ qzW | |¡ qhW || _ d+S ),zzremove list of Properties from each compound (hardcoded)
        which would corrupt process of creating Prediction-ModelsZactivity__commentZalogpZassay__chemblidZassay__descriptionZassay__typeZbioactivity__typeZactivity_commentZassay_chemblidZassay_descriptionZ
assay_typer^   r}   Zingredient__cmpd__chemblidr`   Z	knownDrugZmedChemFriendlyZmolecularFormulaZname__in__referenceZname_in_referenceZnumRo5ViolationsÚoperatorr\   Zparent__cmpd__chemblidZparent_cmpd_chemblidZpassesRuleOfThreeZpreferredCompoundNameZ	referenceZrotatableBondsra   rb   ZstdInChiKeyZsynonymsZtarget__chemblidZtarget_chemblidZtarget__confidenceZtarget__nameZtarget_confidenceZtarget_namer_   Z	value_avgr   rS   T)rL   r”   Z	ClearProprr   )r(   Zsd_tagsrz   rŒ   Z
propertiesrw   r   r   r   Ústep_5_remove_descriptors/  s&    



zp_con.step_5_remove_descriptorsc             C   s~   dd„ t jD ƒ}t |¡}x^tt| jƒƒD ]L}| | j| ¡}x6tt|ƒƒD ]&}| j|  t	|| ƒt	|| ƒ¡ qLW q*W dS )zKcalculate descriptors for each compound, according to Descriptors._descListc             S   s   g | ]}|d  ‘qS )r   r   )rT   r   r   r   r   rV   J  s    z1p_con.step_6_calc_descriptors.<locals>.<listcomp>T)
r   Ú	_descListr   ÚMolecularDescriptorCalculatorrp   rB   rL   ÚCalcDescriptorsrq   rl   )r(   ZnmsZcalcrt   Zdescrsr›   r   r   r   Ústep_6_calc_descriptorsH  s    
*zp_con.step_6_calc_descriptorsc       8      C   s4  ddddddddd	d
dg}|g| _ g }g }t ¡  d¡dd… }d |¡}xÒ| jD ]È}g }g }| ¡ }	x¨|	D ] }
|
dkräy2t| |
¡ƒ}t	 
|¡sœt	 |¡r¨td|
 ƒ W n" tk
rÌ   td|
 ƒ wlY nX | |¡ | |
¡ ql|
dkr| t| |
¡ƒ¡ qlt|
ƒ qlW | |¡ qRW t |¡}t |¡}xætddƒD ]Ö}| jrbtdƒ td| ƒ tj||d|d\}}}}td|d}| ||¡}d}tj||||dd}t| ¡ dƒ}t| ¡ dƒ}ttjdd d!}tj|||||d}t| ¡ dƒ}t| ¡ dƒ}tj||||dd}d"d#„ |D ƒ}t| ¡ dƒ}t| ¡ dƒ}tj||||dd}d$d#„ |D ƒ}t| ¡ dƒ}t| ¡ dƒ}tj||||dd}d%d#„ |D ƒ}t| ¡ dƒ} t| ¡ dƒ}!tj||||d&d}d'd#„ |D ƒ}t| ¡ dƒ}"t| ¡ dƒ}#|  |¡}$t !||$¡}%t"|%ƒ}&t|&d dƒ}'t|&d( dƒ}(|%d) d) })|%d d }*|%d d) }+|%d) d },|*|+ }-|)|, }.ttt#|)|* ƒƒt|-ƒ dƒ}/ttt#|+|, ƒƒt|-ƒ dƒ}0| jrttd*ƒ td+ƒ td,|)|*f ƒ td-|+|,f ƒ t|%ƒ td.ƒ |  |¡}1t !||1¡}2|2d) d) }3|2d d }4|2d d) }5|2d) d }6td+ƒ td,|3|4f ƒ td-|5|6f ƒ t|2ƒ |t$|ƒd/ t$|ƒ t$|ƒd/ t$|ƒ t$|ƒd/ t$|ƒ t$| ƒd/ t$|!ƒ t$|ƒd/ t$|ƒ t$|"ƒd/ t$|#ƒ t$|'ƒd/ t$|(ƒ |/|0d0g}7| j% |¡ | j  |7¡ q@W t&| j%ƒd)kr0dS d S )1zãtrain models according to trafficlight using sklearn.ensamble.RandomForestClassifier
        self.model contains up to 10 models afterwards, use save_model_info(type) to create csv or html
        containing data for each modelú#ZaccuracyZMCCZ	precisionZrecallÚf1r   r%   Z
prevalenceZbiaszpickel-FilerY   éþÿÿÿNú;)r˜   rS   zinvalid: %szvalerror: %sr˜   r   é   z ################################ztry to calculate seed %dgš™™™™™Ù?)Z	test_sizeÚrandom_staterW   )Zn_estimatorsr¨   é   )ZcvZscoringr   TF)Zgreater_is_betterZneeds_thresholdc             S   s   g | ]}t |d ƒ‘qS )r   )r   )rT   r   r   r   r   rV   ’  s    z-p_con.step_7_train_models.<locals>.<listcomp>c             S   s   g | ]}t |d ƒ‘qS )r   )r   )rT   r   r   r   r   rV   ˜  s    c             S   s   g | ]}t |d ƒ‘qS )r   )r   )rT   r   r   r   r   rV   ž  s    Zroc_aucc             S   s   g | ]}t |d ƒ‘qS )r   )r   )rT   r   r   r   r   rV   ¤  s    r#   r   ztest:z	pos	negz
true	%d	%dzfalse	%d	%dz
train:Ú_zmodel_file.pkl)'Úcsv_textÚosÚgetcwdÚsplitÚjoinrL   r”   r6   r„   ÚmathZisnanZisinfr†   r<   rr   r   r   r5   Zarrayrp   rJ   r	   r   r
   ZfitZcross_val_scorer   ZmeanrŠ   r   r   Zmatthews_corrcoefÚpredictZconfusion_matrixr@   r   rl   rI   rB   )8r(   Z
title_lineZTL_listZproperty_list_listZ	directoryZ
dir_stringrv   Zproperty_listZproperty_name_listZ	prop_nameÚpropertyÚfÚdataDescrs_arrayZdataActs_arrayZrandomseedcounterZX_trainZX_testZy_trainZy_testÚclf_RFZ
cv_counterZscoresZaccuracy_CVZaccuracy_std_CVZcalcMCCZMCC_CVZ
MCC_std_CVZscores_roundedZf1_CVZ	f1_std_CVZprecision_CVZprecision_std_CVZ	recall_CVZrecall_std_CVZauc_CVZ
auc_std_CVÚ	y_predictZconf_matrixZ	coh_kappar%   Zkappa_stdevÚtpZtnÚfpÚfnÚnÚpZkappa_prevalenceZ
kappa_biasZ
y_predict2Zconf_matrix2Ztp2Ztn2Úfp2Zfn2Zresult_string_cutr   r   r   Ústep_7_train_modelsR  sÒ    







$$&zp_con.step_7_train_modelsÚhtmlc                sì   |dkr`|  d¡s|d7 }t|dƒ}tj|ddd}xˆjD ]}| |¡ q<W | ¡  | ¡  nˆ|dkrè|  d¡sz|d7 }d	d
„ }‡ ‡fdd„}dd„ }dd„ ‰dd„ }	‡‡fdd„‰ |	ˆjƒ\}
}ˆj}||ƒ}|||
|ƒ}|||ƒ dS )zGcreate html- or csv-File for models according to mode (default: "html")Úcsvz.csvÚwbr¦   ú )Z	delimiterZ	quotecharr¾   z.htmlc             S   s   | S )Nr   )Úlinesr   r   r   Ú
lines2listë  s    z)p_con.save_model_info.<locals>.lines2listc                s4  d}d}d}d}d}d}d}	d}
d	}d}ˆ| ƒ\}}g }|  |¡ |  |||f ¡ |  |¡ |  |t| d
 ƒ ¡ d
}xš| dt| ƒ… D ]†}g }x@|D ]8}t|ƒ}| d¡dkrÆ|  | dd¡¡ q˜|  |¡ q˜W d}||kräd}||krðd}|  |t|g| ƒ ¡ |d7 }qŠW |  |¡ |  |	¡ ˆ | ƒ |S )Na¤  <html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title></title>
<style type="text/css">
table {
  max-width: 100%;
  background-color: transparent;
}

th {
  text-align: left;
}

.table {
  width: 100%;
  margin-bottom: 20px;
}

.table > thead > tr > th,
.table > tbody > tr > th,
.table > tfoot > tr > th,
.table > thead > tr > td,
.table > tbody > tr > td,
.table > tfoot > tr > td {
  padding: 8px;
  line-height: 1.428571429;
  vertical-align: top;
  border-top: 1px solid #dddddd;
}

.table > thead > tr > th {MSC1013123
  vertical-align: bottom;
  border-bottom: 2px solid #dddddd;
}

.table > caption + thead > tr:first-child > th,
.table > colgroup + thead > tr:first-child > th,
.table > thead:first-child > tr:first-child > th,
.table > caption + thead > tr:first-child > td,
.table > colgroup + thead > tr:first-child > td,
.table > thead:first-child > tr:first-child > td {
  border-top: 0;
}

.table > tbody + tbody {
  border-top: 2px solid #dddddd;
}

.table .table {
  background-color: #ffffff;
}

.table-condensed > thead > tr > th,
.table-condensed > tbody > tr > th,
.table-condensed > tfoot > tr > th,
.table-condensed > thead > tr > td,
.table-condensed > tbody > tr > td,
.table-condensed > tfoot > tr > td {
  padding: 5px;
}

.table-bordered {
  border: 1px solid #dddddd;
}

.table-bordered > thead > tr > th,
.table-bordered > tbody > tr > th,
.table-bordered > tfoot > tr > th,
.table-bordered > thead > tr > td,
.table-bordered > tbody > tr > td,
.table-bordered > tfoot > tr > td {
  border: 1px solid #dddddd;
}

.table-bordered > thead > tr > th,
.table-bordered > thead > tr > td {
  border-bottom-width: 2px;
}

.table-striped > tbody > tr:nth-child(odd) > td,
.table-striped > tbody > tr:nth-child(odd) > th {
  background-color: #f9f9f9;
}

.table-hover > tbody > tr:hover > td,
.table-hover > tbody > tr:hover > th {
  background-color: #f5f5f5;
}

table col[class*="col-"] {
  position: static;
  display: table-column;
  float: none;
}

table td[class*="col-"],
table th[class*="col-"] {
  display: table-cell;
  float: none;
}

.table > thead > tr > .active,
.table > tbody > tr > .active,
.table > tfoot > tr > .active,
.table > thead > .active > td,
.table > tbody > .active > td,
.table > tfoot > .active > td,
.table > thead > .active > th,
.table > tbody > .active > th,
.table > tfoot > .active > th {
  background-color: #f5f5f5;
}

.table-hover > tbody > tr > .active:hover,
.table-hover > tbody > .active:hover > td,
.table-hover > tbody > .active:hover > th {
  background-color: #e8e8e8;
}

.table > thead > tr > .success,
.table > tbody > tr > .success,
.table > tfoot > tr > .success,
.table > thead > .success > td,
.table > tbody > .success > td,
.table > tfoot > .success > td,
.table > thead > .success > th,
.table > tbody > .success > th,
.table > tfoot > .success > th {
  background-color: #dff0d8;
}

.table-hover > tbody > tr > .success:hover,
.table-hover > tbody > .success:hover > td,
.table-hover > tbody > .success:hover > th {
  background-color: #d0e9c6;
}

.table > thead > tr > .danger,
.table > tbody > tr > .danger,
.table > tfoot > tr > .danger,
.table > thead > .danger > td,
.table > tbody > .danger > td,
.table > tfoot > .danger > td,
.table > thead > .danger > th,
.table > tbody > .danger > th,
.table > tfoot > .danger > th {
  background-color: #f2dede;
}

.table-hover > tbody > tr > .danger:hover,
.table-hover > tbody > .danger:hover > td,
.table-hover > tbody > .danger:hover > th {
  background-color: #ebcccc;
}

.table > thead > tr > .warning,
.table > tbody > tr > .warning,
.table > tfoot > tr > .warning,
.table > thead > .warning > td,
.table > tbody > .warning > td,
.table > tfoot > .warning > td,
.table > thead > .warning > th,
.table > tbody > .warning > th,
.table > tfoot > .warning > th {
  background-color: #fcf8e3;
}

.table-hover > tbody > tr > .warning:hover,
.table-hover > tbody > .warning:hover > td,
.table-hover > tbody > .warning:hover > th {
  background-color: #faf2cc;
}

@media (max-width: 767px) {
  .table-responsive {
    width: 100%;
    margin-bottom: 15px;
    overflow-x: scroll;
    overflow-y: hidden;
    border: 1px solid #dddddd;
    -ms-overflow-style: -ms-autohiding-scrollbar;
    -webkit-overflow-scrolling: touch;
  }
  .table-responsive > .table {
    margin-bottom: 0;
  }
  .table-responsive > .table > thead > tr > th,
  .table-responsive > .table > tbody > tr > th,
  .table-responsive > .table > tfoot > tr > th,
  .table-responsive > .table > thead > tr > td,
  .table-responsive > .table > tbody > tr > td,
  .table-responsive > .table > tfoot > tr > td {
    white-space: nowrap;
  }
  .table-responsive > .table-bordered {
    border: 0;
  }
  .table-responsive > .table-bordered > thead > tr > th:first-child,
  .table-responsive > .table-bordered > tbody > tr > th:first-child,
  .table-responsive > .table-bordered > tfoot > tr > th:first-child,
  .table-responsive > .table-bordered > thead > tr > td:first-child,
  .table-responsive > .table-bordered > tbody > tr > td:first-child,
  .table-responsive > .table-bordered > tfoot > tr > td:first-child {
    border-left: 0;
  }
  .table-responsive > .table-bordered > thead > tr > th:last-child,
  .table-responsive > .table-bordered > tbody > tr > th:last-child,
  .table-responsive > .table-bordered > tfoot > tr > th:last-child,
  .table-responsive > .table-bordered > thead > tr > td:last-child,
  .table-responsive > .table-bordered > tbody > tr > td:last-child,
  .table-responsive > .table-bordered > tfoot > tr > td:last-child {
    border-right: 0;
  }
  .table-responsive > .table-bordered > tbody > tr:last-child > th,
  .table-responsive > .table-bordered > tfoot > tr:last-child > th,
  .table-responsive > .table-bordered > tbody > tr:last-child > td,
  .table-responsive > .table-bordered > tfoot > tr:last-child > td {
    border-bottom: 0;
  }
}
</style>
</head>
<body>
<p style="padding-left:10px;padding-top:10px;font-size:200&#37;">Data for Models</p>
<p style="padding-left:10px;padding-right:10px;">zè<table style="vertical-align:top; background-color=#CCCCCC">
<tr align="left" valign="top"><td><img src="pieplot.png"></td><td><H3>Distribution</H3><font color="#00C000">active %d</font><br><font color="#FF0000">inactive %d</td><td>z</td></tr></table>zÜ<table class="table table-bordered table-condensed">
<thead>
<tr>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
</tr>
</thead>
<tbody>z¶
<tr bgcolor = "%s">
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td><a href="%s">model.pkl</a></td>
</tr>z-</tbody>
</table>
<img src="barplot.png"><br>z
</p>
</body>
</html>zô<table class="table table-bordered table-condensed">
<thead>
<tr>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
<th>%s</th>
</tr>
</thead>
<tbody>zÍ<tr bgcolor = "%s">
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td>%s</td>
<td><a href="%s">model.pkl</a></td>
</tr>r   r   Zpklr0   rª   õ   Â±rK   z#9CC089z#FF3333)rr   ÚtuplerB   rl   rd   Úreplace)rC   ÚactÚinactZ	html_headZhtml_topPlot_startZhtml_topPlot_bottomZhtml_tableStartZhtml_tElementsZhtml_bottomPlotZ	html_footZhtml_kappa_table_headZhtml_kappa_table_elementZhtml_kappa_table_bottomÚbestÚworstr¾   rt   ÚlZ
l_replacedÚelemZelem_stringÚc)ÚcreateBarPlotÚfindBestWorstr   r   Ú	list2htmlî  sH     c




z(p_con.save_model_info.<locals>.list2htmlc             S   s6   t |dƒ}x| D ]}| |¡ qW | ¡  | ¡  d S )Nr?   )Úopenrk   rm   Úclose)r¾   ZoutfZoutf_hÚblockr   r   r   Ú	writeHtmlF  s    

z(p_con.save_model_info.<locals>.writeHtmlc             S   s<   dd„ | dd … D ƒ}|  t|ƒ¡|  t|ƒ¡ }}||fS )Nc             S   s"   g | ]}t |d   d¡d ƒ‘qS )é   rª   r   )r6   r®   )rT   r   r   r   r   rV   O  s    z@p_con.save_model_info.<locals>.findBestWorst.<locals>.<listcomp>r   )ry   r   r•   )rC   r   Z	max_indexZ	min_indexr   r   r   rÏ   N  s    z,p_con.save_model_info.<locals>.findBestWorstc             S   sX   dd„ }|| ƒ\}}t d||f ƒ tjdd}tj||gdd}|jdd	d
 ||fS )Nc             S   sB   d\}}x0| D ](}t | d¡ƒdkr.|d7 }q|d7 }qW ||fS )N)r   r   r˜   r   r   )r   r„   )ÚcpdsrÇ   rÈ   rv   r   r   r   ÚgetActInactU  s    

zAp_con.save_model_info.<locals>.createPiePlot.<locals>.getActInactzact/inact from TL's %d/%d)r&   r&   )Zfigsize)ÚrÚg)Zcolorszpieplot.pngT)Útransparent)r†   ÚpltZfigureÚpieÚsavefig)rÖ   r×   Z	act_countZinact_countÚfigrÜ   r   r   r   ÚcreatePiePlotS  s    	z,p_con.save_model_info.<locals>.createPiePlotc                sb  dd„ ‰‡ ‡‡‡‡fdd„}t  ¡ \}‰ | dd¡ t ddd	¡‰tˆjƒd
kr^t ddd¡‰d‰|dƒ}ˆ  ˆd ¡ ˆ  dd„ t	d
dd
ƒD ƒ¡ ˆ  
d¡ ˆ  d¡ ˆ  dd¡ ˆ  dd	¡ ˆ  t|ƒdd„ ˆd d
d… D ƒd¡ ˆˆƒ\}}tˆjƒd
krPˆ jdˆ| dfˆ| d dfdd  ˆ jd!ˆ| dfˆ| d dfd"d  |jd#d$d% d S )&Nc             S   s€   g }g }xn| dd … D ]^}||   d¡dkr.q||   d¡dkrBq||  d¡}| t|d ƒ¡ | t|d ƒ¡ qW ||fS )Nr   rª   r0   z.pklr   )rd   r®   rr   r6   )rC   ÚcolZaccListÚerrListr   Zsplr   r   r   ÚgetListsg  s    z>p_con.save_model_info.<locals>.createBarPlot.<locals>.getListsc          
      sf   g }dddddddg}xJt d| ƒD ]<}ˆˆ|ƒ\}}| ˆ jˆˆ|  |ˆ||d  |d	¡ q"W |S )
Nz#DD1E2Fz#EBB035z#06A2CBz#218559z#D0C6B1z#192823z#DDAACCr   )ÚcolorZyerr)rp   rr   Zbar)Zcntrz   Zclrrt   Úlistrá   )ÚaxrC   râ   Úticksr   r   r   Ú	plotListst  s    .z?p_con.save_model_info.<locals>.createBarPlot.<locals>.plotListsé   rÕ   g        g      (@g333333ó?r   g      ð?g      ø?g333333Ã?é   g      è?c             S   s   g | ]}t |ƒ‘qS r   )rl   )rT   r   r   r   r   rV   ‡  s    z@p_con.save_model_info.<locals>.createBarPlot.<locals>.<listcomp>r§   ZAccuracyz# modelg333333Ó¿é   gš™™™™™¹¿c             S   s   g | ]}|‘qS r   r   )rT   r   r   r   r   rV   Œ  s    r   zupper rightrÉ   g333333ë?g      Ð?gš™™™™™ñ?Zgreen)ZxyZxytextrã   rÊ   Zredzbarplot.pngT)rÚ   )rÛ   ZsubplotsZset_size_inchesr   r9   rB   rI   Z
set_xticksZset_xticklabelsrp   Z
set_ylabelZ
set_xlabelZset_xlimZset_ylimZlegendrÅ   ZannotaterÝ   )rC   rç   rÞ   ZplotsrÉ   rÊ   )rÏ   r(   )rå   rC   râ   ræ   r   r   rÎ   e  s0    

(z,p_con.save_model_info.<locals>.createBarPlotT)	ÚendswithrÑ   r¿   Úwriterr«   Zwriterowrm   rÒ   rL   )r(   ÚoutfileÚmodeZcsv_fileZcsv_file_writerÚlinerÃ   rÐ   rÔ   rß   rÇ   rÈ   rÂ   rC   r¾   r   )rÎ   rÏ   r(   r   Úsave_model_infoÜ  s4    



  Z1
zp_con.save_model_infoc             C   sD   |  d¡s|  d¡r&t t |¡¡}n
t |¡}dd„ |D ƒ| _dS )z)load SD-File from .sdf, .sdf.gz or .sd.gzz.sdf.gzz.sd.gzc             S   s   g | ]}|‘qS r   r   )rT   rŒ   r   r   r   rV   £  s    z#p_con.load_mols.<locals>.<listcomp>T)rë   r   ZForwardSDMolSupplierÚgziprÑ   ZSDMolSupplierrL   )r(   Zsd_fileZSDFiler   r   r   Ú	load_mols  s
    
zp_con.load_molsTc             C   sž   t  |d ¡}x| jD ]}| |¡ qW | ¡  | ¡  |sPt |d |¡ dS t|d dƒ}| |d¡}| 	|¡ | ¡  | ¡  | ¡  t 
|d ¡ dS )z6create SD-File of current molecules in self.sd_entriesz.tmpNÚrbrÀ   )r   r   rL   rk   rm   rÒ   r¬   ÚrenamerÑ   Ú
writelinesÚremove)r(   rí   rñ   ZsdwrŒ   Zf_inZf_outr   r   r   Ú	save_mols¦  s     
zp_con.save_molsr   c             C   s   t  | j| t|dƒ¡ dS )z$save Model to file using pickle.dumpzwb+N)ÚpickleÚdumprI   Úfile)r(   rí   Úmodel_numberr   r   r   Ú
save_model¹  s    zp_con.save_modelc             C   s^   t |ƒtkr|g}d}xB|D ]:}t|dƒ}t|ƒ}| ¡ }| j |¡ | ¡  |d7 }qW |S )z,load model or list of models into self.modelr   rØ   r   )Útyperl   rÑ   r   ÚloadrI   rr   rÒ   )r(   Zmodel_filesrt   Zmod_filerI   Z	unPickledrµ   r   r   r   Úload_models¾  s    

zp_con.load_modelsc          	   C   s  t | jƒ|kr2tj d|t | jƒf ¡ t d¡ g }d\}}xtjD ]}| |d ¡ qFW t	 
|¡}| j| }xš| jD ]}d}	y| |¡}
d}	W n$ tk
r¶   tj d| ¡ Y nX |	rxt |
¡}t| |¡d ƒ}|dkrè|d7 }|dkrø|d7 }| d	t|ƒ¡ qxW ||fS )
z>try to predict activity of compounds using giving model-Numberz9
Model-Number %d doesn't exist, there are just %d Models
r0   )r   r   r   FTz(Error computing descriptors for %s, skipr   ZTL_prediction)rB   rI   ri   Ústderrrk   Úexitr   rŸ   rr   r   r    rL   r¡   r–   r   r5   r   r±   rq   rl   )r(   rû   ZdescriptorsZactiveZinactiveÚDZ
calculatorrµ   ZsampleZuseÚpatternr´   r¶   r   r   r   r±   Ì  s4    




zp_con.predict)rS   )r¾   )T)r   )r-   r.   r/   Ú__doc__r*   r,   rx   r{   r   r—   rœ   rž   r¢   r½   rð   rò   r÷   rü   rÿ   r±   r   r   r   r   rE   M  s*   UC

 
   D	

rE   Ú__main__c             C   s   t j d|  ¡ d S )NzError in Step: %s)ri   r   rk   )Ústepr   r   r   Ú
step_errorï  s    r  a  usage: python master.py [--accession=<Acc_ID>] [--sdf=<sdf-File>] --dupl/--uniq [--rof] [--combine=<file1>,<file2>] [--IC50=<IC50_tag>] [--cutoff=<value>] [--remove_descr=<txt_file>] [--proxy=<https://user:pass@proxy.de:portnumber] [--verbous] [--check_models=<model.pkl>])Úusagez--accessionZstoreÚstringÚ	accessionzPAccession ID of Protein (hint: P43088 is Vitamin_D_Receptor with ~200 compounds)rK   )Úactionrý   ÚdestÚhelpÚdefaultz--rofÚ
store_trueZonefilezremove obsolete FilesF)r  r  r  r  z--duplÚduplzuse only duplicatesz--uniqÚuniqzuse only uniquesz	--combineÚcombinezCombine 2 SDF/SDF.GZ Filesz--IC50ZSD_tagz&name of IC50 field, default is 'value'rS   z--cutoffr   Úcutoffz5cutoff-value for hERG-trafficlight, default is '5000'iˆ  z--remove_descrZremove_descrz:file with SDtags2remove, line-wise default:<internal list>z--proxyrG   zUse this Proxyz--sdfÚsdfzload this SDF-Filez	--verbousrJ   z--check_modelsÚ	modelfilezcheck compounds with this modelú,zneed 2 files to combiner0   r&   zFile: %sú:z)please offer Accession-Number or SDF-Filez-h for helpz&Please select uniq or dupl -h for help)rG   zload sdf from File: %szload SDF-Filez!gather Data for Accession-ID '%s'zdownload ChEMBL-Datazkeep largest Fragmentzremove duplicatesz!merge IC50-Values for same SmileszLoad Model-Filesz
#Model	Active	Inactivez%d	%d	%dzset Trafficlight for cutoffzremove descriptorszcalculate DescriptorszTraining of Modelszmodel_info.csvr¿   )rî   zmodel_info.htmlr¾   z%s_%dnm_model_%d.pklzModel %d saved into File: %sz Model %d active: %d	inactive: %d)r   r   )NTN)N)]r¬   rn   rñ   rh   re   ri   Zoptparser¿   Zrdkitr   Z
rdkit.Chemr   r   r   Zrdkit.ML.Descriptorsr   Zscipyr   r   Zsklearnr	   Zsklearn.ensembler
   r   Zsklearn.cross_validationr   Zsklearn.metricsr   r   r   r   r   rø   r   Znumpyr   r°   Zpylabr   r+   r    Údictr!   r@   rD   rE   r-   r  r  ZOptionParserÚparserZ
add_optionÚ
parse_argsZoptionsÚargsr  r®   ZcombineItemsrB   r†   r  Z_04Zcur_filer
  Úcoder  r  r  rG   ZpcorJ   rò   rz   rx   r{   r   r—   r  rÿ   rp   rI   rt   r±   rÇ   rÈ   rœ   r  rž   r¢   r½   rð   Úfilenamerü   r   r   r   r   Ú<module>   s$  @

 #
=       '





 2













