-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcalc_stat.py
76 lines (62 loc) · 3.28 KB
/
calc_stat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
#==============================================================================
# author : Pavel Polishchuk
# date : 15-02-2019
# version :
# python_version :
# copyright : Pavel Polishchuk 2019
# license :
#==============================================================================
import argparse
import pandas as pd
import numpy as np
import warnings
from forest import hit_rate, enrichment
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Calculate statistics.')
parser.add_argument('-y', metavar='activity.txt', required=True,
help='text file with activity values 0/1/NA (tab-separated).'
'Header is present. The first column contains compound names.')
parser.add_argument('-p', '--predictions', metavar='predictions.txt', required=True,
help='text file with predictions returned by forest_predict.py.')
parser.add_argument('-s', '--overall_stat', metavar='overall_stat.txt', required=False, default=None,
help='text file with calculated statistics. Default: None.')
parser.add_argument('-a', '--assay_stat', metavar='assay_stat.txt', required=False, default=None,
help='text file with calculated statistics for each assay. Default: None.')
parser.add_argument('-t', '--threshold', metavar='NUMBER', required=False, default=1,
help='floating point number which defines the threshold of compounds to select. Default: 1.')
args = vars(parser.parse_args())
for o, v in args.items():
if o == "y": y_fname = v
if o == "predictions": pred_fname = v
if o == "overall_stat": stat_fname = v
if o == "assay_stat": astat_fname = v
if o == "threshold": thr = float(v)
if stat_fname is None and astat_fname is None:
raise ValueError('at least one of outputs should be specified: overall_stat or assay_stat.')
y = pd.read_table(y_fname, sep="\t", index_col=0)
ref_hit_rate = np.apply_along_axis(hit_rate, 0, y)
pred = pd.read_table(pred_fname, sep="\t", index_col=0)
if stat_fname:
f_stat = open(stat_fname, 'wt')
if astat_fname:
f_astat = open(astat_fname, 'wt')
for j in range(pred.shape[1]):
ids = pred.iloc[:, j] >= thr
if stat_fname:
if j == 0:
f_stat.write('tree\tcompounds\tcoverage\tmedian enrichment\tmean enrichment\n')
e_median = enrichment(y.loc[ids, :], ref_hit_rate, np.nanmedian)
e_mean = enrichment(y.loc[ids, :], ref_hit_rate, np.nanmean)
f_stat.write('\t'.join(map(str, (pred.columns[j], sum(ids), round(sum(ids) / y.shape[0], 3), round(e_median, 3), round(e_mean, 3)))) + '\n')
if astat_fname:
if j == 0:
f_astat.write('tree\t' + '\t'.join(y.columns) + '\n')
with warnings.catch_warnings():
warnings.simplefilter('ignore', RuntimeWarning)
e_assay = np.apply_along_axis(hit_rate, 0, y.loc[ids, :]) / ref_hit_rate
f_astat.write(str(pred.columns[j]) + '\t' + '\t'.join(map(str, np.round(e_assay, 3))) + '\n')
if stat_fname:
f_stat.close()
if astat_fname:
f_astat.close()