-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_scaffold_peptide_summary.py
101 lines (65 loc) · 2.34 KB
/
parse_scaffold_peptide_summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python3
## Script metadata: name string, last-updated date, and version string.
## NOTE(review): 'scaffolds' (plural) in `name` differs from the spelling of the
## function parse_scaffold_peptide_summary defined in this file -- confirm which is intended.
name = 'parse_scaffolds_peptide_summary.py'
updated = '2023-08-02'
version = '0.1.0'
def parse_scaffold_peptide_summary(peptide_report=False,outdir="PARSED_REPORT"):
    """Split a tab-delimited peptide report into per-sample accession lists and FASTA files.

    The report is scanned line by line. Everything up to and including the
    first line that has exactly 37 tab-separated fields (after stripping
    surrounding whitespace) is skipped; that line is treated as the table
    header. Each following line with more than one field is read as a row:

        field 1   -> sample name
        field 5   -> accession (only the text before the first comma is kept)
        field 9   -> identification probability, e.g. "95%"
        field 15  -> peptide sequence

    Rows whose probability is strictly greater than 80 are kept. The output
    written under `outdir` is:

        <outdir>/accessions.list
            every retained accession, sorted, one per line
        <outdir>/<sample>/accessions.list
            retained accessions for that sample, sorted, one per line
        <outdir>/<sample>/sequences/<accession>.faa
            the sample's peptides for that accession, sorted, as FASTA
            records with headers ">{accession}:{n}" (n starts at 1)

    Spaces in sample names are replaced by underscores in directory names.

    Parameters:
        peptide_report: path to the report file. A falsy value or a path
            that is not an existing file aborts the run.
        outdir: directory that receives the output (created if missing).

    Raises:
        SystemExit: with status 1, after printing an error message, when
            `peptide_report` is missing or is not a file.
        IndexError / ValueError: if a row after the header has fewer than
            16 fields or a probability field that is not numeric.
    """
    from os.path import isfile
    from os import makedirs

    ## Verify that file is provided
    if not peptide_report or not isfile(peptide_report):
        print(f"\n [E] peptide_report ({peptide_report}) is not accessible. Be sure the file provided exists. Exiting...\n")
        ## Non-zero status so calling shells/pipelines can detect the failure
        raise SystemExit(1)

    ## Layout of the report table
    header_width = 37
    sample_col = 1
    accession_col = 5
    id_prob_col = 9
    seq_col = 15
    min_id_prob = 80

    data = {}            ## sample -> accession -> list of peptide sequences
    accessions = set()   ## every accession retained in any sample
    in_table = False

    ## Read from the function argument (not a module-level name) so the
    ## function also works when imported from another module
    with open(peptide_report,'r') as REPORT:
        for raw_line in REPORT:
            line = raw_line.strip()
            fields = line.split("\t")

            if not in_table:
                ## The header row itself is not parsed; it only marks where rows begin
                if len(fields) == header_width:
                    in_table = True
                continue

            ## Blank lines and single-field lines carry no row data
            if len(fields) <= 1:
                continue

            sample = fields[sample_col]
            accession = fields[accession_col].split(",")[0]
            id_prob = float(fields[id_prob_col].replace("%",""))
            seq = fields[seq_col]

            if id_prob > min_id_prob:
                accessions.add(accession)
                data.setdefault(sample,{}).setdefault(accession,[]).append(seq)

    makedirs(outdir,mode=0o755,exist_ok=True)

    with open(f"{outdir}/accessions.list",'w') as MASTER_ACCESSION:
        for accession in sorted(accessions):
            MASTER_ACCESSION.write(f"{accession}\n")

    for sample,by_accession in data.items():
        sample_name = sample.replace(" ","_")
        temp_dir = f"{outdir}/{sample_name}"

        ## Created one level at a time so each directory gets the requested mode
        makedirs(temp_dir,mode=0o755,exist_ok=True)
        makedirs(f"{temp_dir}/sequences",mode=0o755,exist_ok=True)

        with open(f"{temp_dir}/accessions.list",'w') as ACCESSIONS:
            for accession in sorted(by_accession):
                ACCESSIONS.write(f"{accession}\n")
                with open(f"{temp_dir}/sequences/{accession}.faa",'w') as SEQ:
                    for number,seq in enumerate(sorted(by_accession[accession]),start=1):
                        SEQ.write(f">{accession}:{number}\n{seq}\n")
if __name__ == "__main__":
    from argparse import ArgumentParser

    ## Command-line entry point: -p/--peptide_report is required,
    ## -o/--outdir falls back to "PARSED_REPORT"
    cli = ArgumentParser()
    cli.add_argument("-p","--peptide_report",required=True)
    cli.add_argument("-o","--outdir",default="PARSED_REPORT")

    ## parse_known_args() returns (namespace, leftover argv); unrecognized
    ## options are tolerated and the leftovers are discarded
    known,_leftover = cli.parse_known_args()

    ## Bind the parsed values to the module-level names `report` and `outdir`
    report,outdir = known.peptide_report,known.outdir
    parse_scaffold_peptide_summary(report,outdir)