-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcombine_busco_table_results.py
executable file
·99 lines (83 loc) · 3.63 KB
/
combine_busco_table_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
#
# combine_busco_table_results.py created 2023-01-30
"""combine_busco_table_results.py last modified 2023-01-30
./combine_busco_table_results.py -s species_to_dir_list.tab > group_combined_busco_table.tsv
species list is tab delimited of species and directory, like:
Aphrocallistes_vastus Avas.v1.29_annotations.prot.fr.fasta.busco5
Amphimedon_queenslandica Aqu2.1_Genes_proteins.fasta.busco5
Ephydatia_muelleri Emu_augustus_sysnames_prots.fasta.busco5
Oopsacas_minuta JAKMXF01.1.gbff.prot.fa.busco5
"""
import sys
import os
import time
import argparse
from collections import defaultdict
def get_species_list(species_file):
"""read species list file and return list in order"""
species_list = []
main_dir_list = []
sys.stderr.write("# Reading species list from {} {}\n".format( species_file, time.asctime() ) )
for line in open(species_file,'r'):
line = line.strip()
if line:
lsplits = line.split("\t")
if len(lsplits) != 2:
sys.stderr.write("# ERROR: line does not have two columns\n{}\n".format( line ) )
species_list.append(lsplits[0])
main_dir_list.append(lsplits[1])
sys.stderr.write("# Found {} species {}\n".format( len(species_list), time.asctime() ) )
return species_list, main_dir_list
def main(argv, wayout):
if not len(argv):
argv.append('-h')
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-s','--species-list', help="tab-delimited file of species and BUSCO directories, one species per line, folders should be named XX.busco5 or similar")
args = parser.parse_args(argv)
species_list, busco_dir_list = get_species_list(args.species_list)
genes_by_species = defaultdict( lambda: defaultdict(list) ) # key is species, then key is gene ID, then value is completeness
gene_order = [] # recorded once
species_count = 0
for species, busco_dir in zip(species_list,busco_dir_list):
full_busco_path = os.path.join(busco_dir, "run_metazoa_odb10", "full_table.tsv")
species_count += 1
sys.stderr.write("# Reading result table {} files for species {} {}\n".format( full_busco_path, species, time.asctime() ) )
# BUSCO version is: 5.4.2
# The lineage dataset is: metazoa_odb10 (Creation date: 2021-02-17, number of genomes: 65, number of BUSCOs: 954)
# Busco id Status Sequence Score Length
#5951at33208 Complete Aqu2.1.29871_001 4199.2 2006
#9639at33208 Complete Aqu2.1.41859_001 2346.4 1766
#13816at33208 Complete Aqu2.1.36111_001 1509.6 1450
for line in open(full_busco_path, 'r'):
line = line.strip()
if line and line[0]!="#": # skip comments
lsplits = line.split("\t")
gene_id = lsplits[0]
if species_count < 2:
gene_order.append(gene_id)
found_status = lsplits[1]
if found_status=="Complete":
found_score = "1"
elif found_status=="Duplicated":
found_score = "100"
elif found_status=="Fragmented":
found_score = "0.01"
elif found_status=="Missing":
found_score = "0"
else: # should never happen
found_score = "-1"
genes_by_species[species][gene_id] = found_score
output_header = "gene_id\t{}\n".format( "\t".join( species_list ) )
sys.stdout.write( output_header )
write_count = 0
for gene_id in gene_order:
out_scores_list = []
for species in species_list:
out_score = genes_by_species.get(species).get(gene_id,"NA")
out_scores_list.append(out_score)
output_line = "{}\t{}\n".format( gene_id, "\t".join( out_scores_list ) )
write_count += 1
sys.stderr.write("# Wrote {} genes for {} species {}\n".format( write_count, len(species_list), time.asctime() ) )
if __name__ == "__main__":
main(sys.argv[1:], sys.stdout)