-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathjmultidk.py
651 lines (511 loc) · 17.5 KB
/
jmultidk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
# Python 3
"""
Codes for MultiDK are included.
Author: (James) Sung-Jin Kim
Date: April 2, 2016 ~ Now
Editorial notes
* April2, 2016
I will use pd.DataFrame as base class to make new class for R2_DF,
which is specialized for r^2 scores and support MultiDK.
Hence, it can be copied to jpandas later on.
"""
from time import time
import pandas as pd
from functools import reduce
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Jan 27, 2017
#cross_validation is deprecated and replace it to model_selection
from sklearn import model_selection #cross_validation
from sklearn import metrics
import tensorflow.contrib.learn as skflow
import jpandas as jpd
import jchem, jgrid
import j3x.jpyx
import jseaborn as jsns
import kutil
def list_agg_n( n_folds):
"""
Generate a function to aggregate multiple lists
for functools.reduce()
"""
f = lambda s, x: s + [x] * n_folds
return f
class R2_Scores( object):
def __init__(self, fname = None, Nm = 10, n_alphas = 10, n_folds = 20):
"""
Make a class for r^2 scores based on pd.DataFrame
[Input]
fname: file name for r^2 dataframe
Nm: the number of methods
E.g.
fname = "sheet/wang3705_MDMK2to23_{}methods.csv".format(Nm)
"""
if fname is not None:
self.fname = fname
self.df = pd.read_csv( fname)
else:
self.df = pd.DataFrame()
self.Nm = Nm
self.n_alphas = n_alphas
self.n_folds = n_folds
def updata_for_multidk(self, fname_out = None):
"""
1. alpha_ID is added on the DataFrame since float point values such as alpha
can not be found by value. An index of the best alpha value for each method
will be stored as well as the best alpha so that the index will be used
to filter the best values of mean(r2) and std(r2).
"""
n_alphas = self.n_alphas
n_folds = self.n_folds
# Step 1: adding alpha_ID
self.df["alpha_ID"] = reduce( list_agg_n( n_folds), range( n_alphas), []) * self.Nm
# Step 2: change names MDMKx to MultiDKx
rn_d = {'MDMK2to11':"MultiDK2-11", 'MDMK2to21': "MultiDK2-21",
'MDMK2to23': "MultiDK2-23", 'MDMK2to13': "MultiDK2-13", "MDMK1to10":"MultiDK1-10",
"MDMK": "MultiDK"} # MDMK is included for legacy cases such as redox potential prediction
df_method_l = self.df["Method"].tolist()
rn_l = []
for m in df_method_l:
if m in rn_d.keys():
rn_l.append( rn_d[m])
else:
rn_l.append( m)
self.df["Method"] = rn_l
if fname_out is not None:
self.df.to_csv( fname_out, index = False)
elif self.fname is not None:
fname_out = self.fname[:-4] + '_refine.csv'
print( "Default: self.df is saved to", fname_out)
self.df.to_csv( fname_out, index = False)
return self.df
def mean_std(self, fname_out = None):
df_g = self.df.groupby(["Method", "alpha"])
self.df_gr = df_g.agg({"r2":[np.mean, np.std]})["r2"]
# Index should be stored.
if fname_out is not None:
"""
index should be saved so 'index = True' as default
"""
self.df_gr.to_csv( fname_out)
elif self.fname is not None:
fname_out = self.fname[:-4] + '_mean_std.csv'
print( "Default: self.df_gr is saved to", fname_out)
self.df_gr.to_csv( fname_out)
return self.df_gr
def max_mean_r2( self, fname_out = None):
"""
Extact all method names and get a set with only unique name
"""
self.method_l = set(self.df_gr.index.get_level_values(0))
pdi_l = list()
for m in self.method_l:
p_m = self.df_gr.loc[ m]
alpha_l = p_m.index.tolist()
m_r2 = p_m["mean"].values
std_r2 = p_m["std"].values
i_max = m_r2.argmax()
pdi = pd.DataFrame( [[m, i_max, alpha_l[i_max], m_r2[i_max], std_r2[i_max]]],
columns=["Method", "best_alpha_ID", "best_alpha", "E[r2]", "std(r2)"])
pdi_l.append( pdi)
pdo_best = pd.concat( pdi_l, ignore_index=True).sort_values("Method")
self.df_best = pdo_best.set_index("Method")
if fname_out is not None:
self.df_best.to_csv( fname_out) # index should be stored.
elif self.fname is not None:
fname_out = self.fname[:-4] + '_best4bar.csv'
print( 'Default: self.df_best is saved to', fname_out)
self.df_best.to_csv( fname_out) # index should be stored.
return self.df_best
def get_box_data( self, fname_out = None):
"""
DataFrame is arranged for box plot.
"""
pdo = self.df
cond = None
for m in self.method_l:
best_alpha_ID = self.df_best.loc[ m]["best_alpha_ID"]
if cond is None:
cond = (pdo.Method == m) & (pdo.alpha_ID == best_alpha_ID)
else:
cond |= (pdo.Method == m) & (pdo.alpha_ID == best_alpha_ID)
self.df_best_expand = self.df[ cond].reset_index( drop = True)
if fname_out is not None:
self.df_best_expand.to_csv( fname_out) # index should be stored.
elif self.fname is not None:
fname_out = self.fname[:-4] + '_best4box.csv'
print( 'Default: self.df_best_expand is saved to', fname_out)
self.df_best_expand.to_csv( fname_out) # index should be stored.
return self.df_best_expand
def run( self):
self.updata_for_multidk()
self.mean_std()
self.max_mean_r2()
self.get_box_data()
return self.df_best, self.df_best_expand
def plot_bar( self, fname_out = None):
self.df_best.plot( kind = 'bar', y = "E[r2]", yerr="std(r2)", legend=False)
if fname_out is not None:
plt.savefig( fname_out) # index should be stored.
elif self.fname is not None:
fname_out = self.fname[:-4] + '_bar.eps'
print( 'Default: the figure of self.df_best is saved to', fname_out)
plt.savefig( fname_out)
def plot_box( self, fname_out = None):
sns.boxplot(x="Method", y="r2", data=self.df_best_expand, palette="PRGn")
sns.despine(offset=10, trim=True)
plt.ylabel( r"$r^2$")
plt.xlabel( "Methods")
if fname_out is not None:
plt.savefig( fname_out) # index should be stored.
elif self.fname is not None:
fname_out = self.fname[:-4] + '_box.eps'
print( 'Default: the figure of self.df_best_expand is saved to', fname_out)
plt.savefig( fname_out)
def run_with_plot( self):
self.run()
self.plot_bar()
plt.show()
self.plot_box()
plt.show()
def set_X_23( s_l, xM_logP):
# s_l = self.s_l
# xM_logP = self.xM_logP
# Body
xM_d = dict()
xM_d["MFP"] = jchem.get_xM( s_l, radius=4, nBits=2048)
xM_d["MACCS"] = jchem.get_xM_MACCSkeys( s_l)
xM_d["MolW"] = jchem.get_xM_molw( s_l)
xM_d["LASA"] = jchem.get_xM_lasa( s_l)
xM_d["logP"] = xM_logP
for d_s in ["MolW", "LASA", "logP"]:
# x_a = xM_d[ d_s]
# x_a = np.divide( x_a, np.std( x_a, axis = 0)) # Normalize
xM_d[ d_s] = np.divide( xM_d[ d_s], np.std( xM_d[ d_s], axis = 0))
xM_2 = np.concatenate( [xM_d["MFP"], xM_d["MACCS"]], axis = 1)
xM_p = np.concatenate( [xM_d[ d_s] for d_s in ["MolW", "LASA", "logP"]], axis = 1) # Concatenation of associated properties
print( xM_2.shape, xM_p.shape)
# Output processing
#self.xM_d = xM_d
#self.xM_2 = xM_2
#self.xM_p = xM_p
return xM_d, xM_2, xM_p
def set_X_23_M2( s_l, xM_logP):
# s_l = self.s_l
# xM_logP = self.xM_logP
# Body
xM_d = dict()
xM_d["MFP"] = jchem.get_xM( s_l, radius=4, nBits=2048)
xM_d["MACCS"] = jchem.get_xM_MACCSkeys( s_l)
xM_d["MolW"] = jchem.get_xM_molw( s_l)
xM_d["MolW2"] = np.power( jchem.get_xM_molw( s_l), 2)
xM_d["LASA"] = jchem.get_xM_lasa( s_l)
xM_d["LASA2"] = jchem.get_xM_lasa( s_l)
xM_d["logP"] = xM_logP
for d_s in ["MolW", "MolW2", "LASA", "LASA2", "logP"]:
xM_d[ d_s] = np.divide( xM_d[ d_s], np.std( xM_d[ d_s], axis = 0))
xM_2 = np.concatenate( [xM_d["MFP"], xM_d["MACCS"]], axis = 1)
xM_p = np.concatenate( [xM_d[ d_s] for d_s in ["MolW", "LASA", "logP"]], axis = 1) # Concatenation of associated properties
print( xM_2.shape, xM_p.shape)
# Output processing
#self.xM_d = xM_d
#self.xM_2 = xM_2
#self.xM_p = xM_p
return xM_d, xM_2, xM_p
def set_A_2( xM_d, xM_2):
# Input processing
#xM_d = self.xM_d
#xM_2 = self.xM_2
# Body
A_d = dict()
for key in ["MFP", "MACCS"]:
print( key)
A_d[ key] = j3x.jpyx.calc_tm_sim_M( xM_d[key])
A_2 = j3x.jpyx.calc_tm_sim_M( xM_2)
# Output processing
#self.A_d = A_d
#self.A_2 = A_2
return A_d, A_2
def set_alpha_log( a_st = -2, a_ed = 2, a_n = 2):
"""
Generate alpha_log with a range from a_st to s_ed
with a_n for each unit.
"""
a_N = (a_ed - a_st)*a_n + 1
return (a_st, a_ed, a_N)
class MultiDK():
def __init__(self, fname = 'sheet/wang3705_with_logP.csv'):
self.fname_core = fname[:-14]
self.pdr = pd.read_csv( fname)
self.alphas_log = set_alpha_log( -2, 2, 2)
def set_xy(self):
"""
if self is changed self will be a return value
for feedback all outputs using only a variable
"""
pdr = self.pdr
self.s_l = self.pdr.SMILES.tolist()
self.xM_logP = np.mat(self.pdr.logP.values).T
self.yV = jpd.pd_get_yV( self.pdr, y_id="exp")
return self
def set_X( self):
# Input processing
s_l = self.s_l
xM_logP = self.xM_logP
# BOdy
xM_d, xM_2, xM_p = set_X_23( s_l, xM_logP)
# Output processing
self.xM_d = xM_d
self.xM_2 = xM_2
self.xM_p = xM_p
return self
def set_X_M2( self):
# Input processing
s_l = self.s_l
xM_logP = self.xM_logP
# BOdy
xM_d, xM_2, xM_p = set_X_23_M2( s_l, xM_logP)
# Output processing
self.xM_d = xM_d
self.xM_2 = xM_2
self.xM_p = xM_p
return self
def set_A( self):
# Input processing
xM_d = self.xM_d
xM_2 = self.xM_2
# Body
A_d, A_2 = set_A_2( xM_d, xM_2)
# Output processing
self.A_d = A_d
self.A_2 = A_2
return self
def grid_search_sd( self):
# input processing
xM_d = self.xM_d
xM_p = self.xM_p
yV = self.yV
A_d = self.A_d
A_2 = self.A_2
#Body
t = time()
pdi_d = dict()
pdi_d["SD"] = jsns.pdi_gs_full( "SD", [xM_d["MFP"]], yV, expension = True, n_jobs = 1)
print('Elasped time is', time() - t, 'sec')
pdi_d["MD21"] = jsns.pdi_gs_full( "MD21", [xM_d[ d_s] for d_s in ["MFP", "MACCS", "MolW"]], yV,
expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MD23"] = jsns.pdi_gs_full( "MD23", list(xM_d.values()), yV, expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MDMK1to10"] = jsns.pdi_gs_full( "MDMK1to10", [A_d["MFP"]], yV,
mode = "BIKE_Ridge", expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MDMK2to11"] = jsns.pdi_gs_full( "MDMK2to11", [A_2], yV, X_concat = xM_d["MolW"],
mode = "BIKE_Ridge", expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MDMK2to13"] = jsns.pdi_gs_full( "MDMK2to13", [A_2], yV, X_concat = xM_p,
mode = "BIKE_Ridge", expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MDMK2to21"] = jsns.pdi_gs_full( "MDMK2to21", [A_d["MFP"], A_d["MACCS"]], yV, X_concat = xM_d["MolW"],
mode = "BIKE_Ridge", expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MDMK2to23"] = jsns.pdi_gs_full( "MDMK2to23", [A_d["MFP"], A_d["MACCS"]], yV, X_concat = xM_p,
mode = "BIKE_Ridge", expension = True, n_jobs = 1)
print('Elasped time is', time() - t, 'sec')
pdo = pd.concat( pdi_d.values())
print( pdo.shape)
Nm = len(pdi_d)
#print( "The number of methods now is", Nm)
fname_out = self.fname_core + "_MDMK2to23_{}methods.csv".format(Nm)
print("The performance data are save to", fname_out)
pdo.to_csv( fname_out, index = False)
self.pdo = pdo
return self
def grid_search( self):
# input processing
xM_d = self.xM_d
xM_p = self.xM_p
yV = self.yV
A_d = self.A_d
A_2 = self.A_2
#Body
t = time()
pdi_d = dict()
for k in xM_d:
s = "SD({})".format( k)
pdi_d[s] = jsns.pdi_gs_full( s, [xM_d[k]], yV, expension = True, n_jobs = 1)
print('Elasped time is', time() - t, 'sec')
# pdi_d["SD(MFP)"] = jsns.pdi_gs_full( "SD(MFP)", [xM_d["MFP"]], yV, expension = True, n_jobs = 1)
# print('Elasped time is', time() - t, 'sec')
# pdi_d["SD(MACCS)"] = jsns.pdi_gs_full( "SD(MACCS)", [xM_d["MACCS"]], yV, expension = True, n_jobs = 1)
# print('Elasped time is', time() - t, 'sec')
# pdi_d["SD(MolW)"] = jsns.pdi_gs_full( "SD(MolW)", [xM_d["MolW"]], yV, expension = True, n_jobs = 1)
# print('Elasped time is', time() - t, 'sec')
# pdi_d["SD(LASA)"] = jsns.pdi_gs_full( "SD(LASA)", [xM_d["MolW"]], yV, expension = True, n_jobs = 1)
# print('Elasped time is', time() - t, 'sec')
# pdi_d["SD(logP)"] = jsns.pdi_gs_full( "SD(logP)", [xM_d["logP"]], yV, expension = True, n_jobs = 1)
# print('Elasped time is', time() - t, 'sec')
pdi_d["MD21"] = jsns.pdi_gs_full( "MD21", [xM_d[ d_s] for d_s in ["MFP", "MACCS", "MolW"]], yV,
expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MD23"] = jsns.pdi_gs_full( "MD23", list(xM_d.values()), yV, expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MDMK1to10"] = jsns.pdi_gs_full( "MDMK1to10", [A_d["MFP"]], yV,
mode = "BIKE_Ridge", expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MDMK2to11"] = jsns.pdi_gs_full( "MDMK2to11", [A_2], yV, X_concat = xM_d["MolW"],
mode = "BIKE_Ridge", expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MDMK2to13"] = jsns.pdi_gs_full( "MDMK2to13", [A_2], yV, X_concat = xM_p,
mode = "BIKE_Ridge", expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MDMK2to21"] = jsns.pdi_gs_full( "MDMK2to21", [A_d["MFP"], A_d["MACCS"]], yV, X_concat = xM_d["MolW"],
mode = "BIKE_Ridge", expension = True)
print('Elasped time is', time() - t, 'sec')
pdi_d["MDMK2to23"] = jsns.pdi_gs_full( "MDMK2to23", [A_d["MFP"], A_d["MACCS"]], yV, X_concat = xM_p,
mode = "BIKE_Ridge", expension = True, n_jobs = 1)
print('Elasped time is', time() - t, 'sec')
pdo = pd.concat( pdi_d.values())
print( pdo.shape)
Nm = len(pdi_d)
#print( "The number of methods now is", Nm)
fname_out = self.fname_core + "_MDMK2to23_{}methods.csv".format(Nm)
print("The performance data are save to", fname_out)
pdo.to_csv( fname_out, index = False)
self.pdo = pdo
return self
def cv_MultiDK23( self, alpha, n_jobs = 1):
"""
Return
--------
yV_pred: np.array(), mostly 1D
return prediction results.
"""
self.set_xy()
self.set_X()
self.set_A()
xM_d = self.xM_d
xM_p = self.xM_p
yV = self.yV
# A_d = self.A_d
A_2 = self.A_2
#Body
t = time()
yV_pred = jgrid.cv_BIKE_Ridge( [A_2], yV, alpha = alpha, XX = xM_p, n_folds = 20, n_jobs = n_jobs, grid_std = None)
print('Elasped time is', time() - t, 'sec')
return yV_pred
def plot( self):
sns.tsplot(data=self.pdo, time="alpha", unit="unit", condition="Method", value="r2")
plt.xscale('log')
plt.ylabel( r'$r^2$')
def _run_r0( self):
self.set_xy()
self.set_X()
self.set_A()
self.grid_search()
self.plot()
def run( self, SDx = False):
self.set_xy()
self.set_X()
self.set_A()
if SDx:
self.grid_search()
else:
self.grid_search_sd()
self.plot()
return self
def set_XA( self):
self.set_xy()
self.set_X()
self.set_A()
return self
#############################
# April 24, 2016
#############################
class MultiDK_DL( MultiDK):
"""
Deep learning version of MultiDK
Kernels are, however, not applied.
"""
def __init__( self, fname = 'sheet/wang3310_with_logP.csv', graph = False):
"""
alpha_l is fixed now.
"""
super().__init__( fname = fname)
self.graph = graph
def learning( self):
X = self.X
y = self.y
print( "Shape of X and y are", X.shape, y.shape)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train,
test_size=0.2, random_state=42)
val_monitor = skflow.monitors.ValidationMonitor(X_val, y_val,
early_stopping_rounds=200)
model = skflow.TensorFlowDNNRegressor(hidden_units=[100, 50, 10], steps=5000)
model.fit(X_train, y_train, val_monitor)
yP = model.predict(X_test)
score_r2 = metrics.r2_score(y_test, yP)
score_MedAE = metrics.median_absolute_error(y_test, yP)
print('Accuracy')
print('--------')
print('R2: {0:f}, MedAE: {1:f}'.format(score_r2, score_MedAE))
if self.graph:
kutil.regress_show4( y_test, yP)
def _set_X_r0( self):
super().set_X()
print( [self.xM_d[key].shape for key in self.xM_d])
self.X = np.array( np.concatenate( list( self.xM_d.values()), axis = 1))
self.y = np.array( self.yV)
def set_X( self, ds_mode = "11111"):
"""
It combines only selected items by ds_mode
Each bit in a bit string indecate whether the coressponding descriptor
is included or not.
Default case, all descriptors will be aggreagated.
"""
self.set_X_M2()
#mode_l = [int(x) for x in ds_mode]
mode_key = ["MFP", "MACCS", "MolW2", "LASA", "logP"]
assert len(mode_key) == len(ds_mode)
X_l = list()
for idx in range( len(ds_mode)):
if ds_mode[ idx] == '1':
X_l.append( self.xM_d[ mode_key[ idx]])
self.X = np.array( np.concatenate( X_l, axis = 1))
self.y = np.array( self.yV)[:,0]
def run_mode( self, ds_mode):
self.set_X( ds_mode)
self.learning()
def run( self):
self.set_xy()
ds_mode_l = ["00111", "00110", "00101"]
for ds_mode in ds_mode_l:
print()
print("======================")
print("ds_mode:", ds_mode)
self.run_mode( ds_mode)
return self
#############################
# August 1, 2016
#############################
class MultiDK_DLA( MultiDK): # Underdevelopment
"""
use setA appropriately
I changed the base matrix as MultiDK instead of MultiDK_DL
since MultiDK is more simlar to the target class.
"""
def __init__( self, fname = 'sheet/wang3310_with_logP.csv'):
"""
- A will be used instead of A
- Need to change learning since training and testing should be
differently calculated for A.
"""
super().__init__( fname = fname)
def set_data(self):
self.set_xy()
self.set_X()
self.set_A()