-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathplot_domain_histogram.py
executable file
·80 lines (67 loc) · 2.17 KB
/
plot_domain_histogram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
# # -*- coding: utf-8 -*-
"""
@Authors Max Tong & Huy Bui
Reads all the *.domains in the domain_info_dir and plot histogram of domain residues number
"""
from scipy.stats import norm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime
import os, sys
import subprocess
script_dir=os.path.dirname(os.path.realpath(__file__))
# Syntax
def print_usage ():
print("usage: plot_domain_histogram.py domain_info_dir" )
print("usage: plot_domain_histogram.py domain_info" )
exit(0)
# Ensures that there are 1 arguments
if len(sys.argv) < 2:
print_usage()
else:
domain_info_dir = sys.argv[1]
# d is the array contains the number of residues per domain
d = []
# Read all the domain info file & calculate number of residues
for file in os.listdir(domain_info_dir):
if file.endswith(".domains"):
df = pd.read_csv(domain_info_dir + '/' + file, sep="\t", header=None)
domain_names = df[0]
domain_ranges = df[1]
for chainNo in range(len(domain_names)):
noResidues = 0
if ',' in domain_ranges[chainNo]:
ranges = [x for x in domain_ranges[chainNo].split(',') if x.strip()]
for x in ranges:
r = [int(x) for x in x.split('-') if x.strip()]
noResidues += r[1] - r[0]
else:
ranges = [int(x) for x in domain_ranges[chainNo].split('-') if x.strip()]
noResidues = ranges[1] - ranges[0]
print(file + " Chain " + str(chainNo) + " noRes " + str(noResidues))
d.append(noResidues)
# Visualization part
(mu, sigma) = norm.fit(d)
#print(mu)
#print(sigma)
#print(d)
# An "interface" to matplotlib.axes.Axes.hist() method
n, bins, patches = plt.hist(x=d, bins=range(min(d), max(d) + 20, 20), color='#0504aa',
alpha=0.7, rwidth=0.85)
#print(n)
#print(bins)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('Domain size (aa)')
plt.ylabel('Number of Domains')
plt.title('Histogram of domain size')
# plt.text(23, 45, r'$\mu=15, b=3$')
maxfreq = n.max()
# Set a clean upper y-axis limit.
y = norm.pdf( bins, mu, sigma)
y = y*n.sum()*10
# plt.plot(bins, y, 'r--', linewidth=2)
#plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
#plt.xlim(0,800)
plt.show()