-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathplot_homolog_output_logs.R
167 lines (140 loc) · 7.86 KB
/
plot_homolog_output_logs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# plot homolog output logs
#
# last modified 2022-12-02
#
# based on output files from makehomologs.py
# https://bitbucket.org/wrf/sequences/src/master/makehomologs.py
#
# R script can be run as:
# Rscript plot_homolog_output_logs.R something.mh.log fasta_clusters.something.tab
#
# PDF will be automatically generated
args = commandArgs(trailingOnly=TRUE)
input_logfile = args[1]
#input_logfile = "~/project/lactobacillus_genomics/lactobacillus_v3.2022-02-28-223648.mh.log"
outputfile = gsub("([\\w/]+)\\....$","\\1.pdf",input_logfile,perl=TRUE)
# read log, and then parse subparts of the log file
print(paste("Reading log file", input_logfile, ", writing to", outputfile))
log_data = read.table(input_logfile, stringsAsFactors = FALSE,skip=2, header=FALSE, sep="\t")
clustcounts = log_data[log_data$V1=="N_seqs",]
taxacounts = log_data[log_data$V1=="N_taxa",]
genes_in_clusters_by_sp = log_data[log_data$V1=="N_genes",]
ortho_counts_by_sp = log_data[log_data$V1=="N_ortho",]
# these might be empty is blast processing step was skipped
total_prots_by_sp = log_data[log_data$V1=="T_genes",]
blast_hits_by_sp = log_data[log_data$V1=="N_bhits",]
prots_no_hits = as.integer(total_prots_by_sp$V3) - as.integer(blast_hits_by_sp$V3)
hits_no_clusters = as.integer(blast_hits_by_sp$V3) - as.integer(genes_in_clusters_by_sp$V3)
hits_w_clusters = as.integer(genes_in_clusters_by_sp$V3) - as.integer(ortho_counts_by_sp$V3)
ortho_clusters = as.integer(ortho_counts_by_sp$V3)
print( paste(sum(as.integer(total_prots_by_sp$V3)),"total proteins in the analysis") )
print( paste(sum(prots_no_hits), "proteins had no blast hits" ) )
#
input_cluster_file = args[2]
#input_cluster_file = "~/project/lactobacillus_genomics/fasta_clusters.lactobacillus_v3.tab"
print(paste("Reading orthogroups from", input_cluster_file, ", writing to", outputfile))
# should autodetect if makehomologs.py used the -c option, making the header fasta_clusters.H.
cf_skip = grepl("fasta_clusters.H.", input_cluster_file)
# meaning skip the header line
cluster_data = read.table(input_cluster_file, header=cf_skip, sep="\t", stringsAsFactors = FALSE )
num_sequences = as.integer(cluster_data[,2])
num_taxa = as.integer(cluster_data[,3])
min_per_taxon = as.integer(cluster_data[,4])
med_per_taxon = as.integer(cluster_data[,5])
max_per_taxon = as.integer(cluster_data[,6])
# get vector for 1-1 orthologs
is_avg_one = (num_sequences==num_taxa)
ortho_only_sizes = table(num_sequences[is_avg_one])
# make PDF
print("Making PDF")
pdf(file=outputfile, width=8, height=11)
# page 1
par(mfrow=c(2,1), mar=c(4.5,4.5,2.5,1))
# upper plot
#first_xlim = min(rev(as.integer(clustcounts$V2))[cumsum(as.integer(rev(clustcounts$V3))) > 0.995*sum(as.integer(rev(clustcounts$V3)))])
plot(clustcounts$V2, clustcounts$V3, type="n", xlab="Cluster size", ylab="Number of clusters",
main=paste("Sizes of",sum(as.integer(clustcounts$V3)),"candidate clusters"), # xlim=c(1,first_xlim),
cex.axis=1.4, cex.lab=1.4 )
lines(clustcounts$V2, clustcounts$V3 , lwd=5, col="#016c5999" )
lines(taxacounts$V2, taxacounts$V3 , lwd=5, col="#fc4e2a99" )
lines( as.integer(unlist(dimnames(ortho_only_sizes))), ortho_only_sizes , lwd=5, col="#0000b399")
legend( "topright", legend=c("N sequences", "N taxa", "N orthologs"),
col=c("#016c59","#fc4e2a", "#0000b3"), lwd=8, cex=1.3 )
# lower plot
species_count = nrow(genes_in_clusters_by_sp)
bar_las = ifelse( species_count < 35, 3, 0)
if (species_count < 35){
barplot_names = genes_in_clusters_by_sp$V2
par(mar=c(8,4.5,2.5,1))
bordcolor = "#000000"
} else {
barplot_names = NULL
par(mar=c(3,4.5,2.5,1))
bordcolor = NA
}
if ( nrow(total_prots_by_sp)==nrow(genes_in_clusters_by_sp) ) {
matrix_for_bars = t(as.matrix(data.frame( ortho_clusters, hits_w_clusters , hits_no_clusters , prots_no_hits )))
barplot( matrix_for_bars , ylab="Number of genes in clusters",
main=paste("Total counts of prots, those with hits, those in orthogroups for",species_count,"species"),
col=c("#0000b3", "#9ecae1","#8c6bb1","#7f2704"), border = bordcolor,
cex.axis=1.4, cex.lab=1.4 , names.arg=barplot_names, las=bar_las)
# here could also go on bottom, but often better fits on top of plot
legend(x="top", legend=c("Single-copy", "Multi-copy", "Not clustered", "No blast"),
col=c("#0000b3", "#9ecae1","#8c6bb1","#7f2704"), pch=15, pt.cex=2, horiz=TRUE, xpd = TRUE )
} else {
barplot( as.integer(genes_in_clusters_by_sp$V3) , ylab="Number of genes in clusters",
main=paste("Total counts of genes in orthogroups for",species_count,"species"),
col="#9ecae1", border = bordcolor,
cex.axis=1.4, cex.lab=1.4 , names.arg=barplot_names, las=bar_las)
}
# end page 1
# page 2
par(mar=c(4.5,4.5,2.5,1))
# upper plot
point_color_vec = ifelse(is_avg_one,"#0000b344","#ed627844")
plot(num_sequences, num_taxa, xlab="Total sequences in cluster", ylab="Number of taxa in cluster",
main=paste("Total sequences and taxon counts for",nrow(cluster_data),"orthogroups"),
cex.axis=1.4, cex.lab=1.4, #xlim=c(0,30), ylim=c(0,12),
cex=2, col=point_color_vec, pch=16)
abline(v=min(num_sequences), lwd=1, lty=2, col="#00000044" ) # limit of -z
text( max(num_sequences)*0.08, max(num_taxa), ":", cex=1.3)
text( max(num_sequences)*0.09, max(num_taxa), table(is_avg_one)[["TRUE"]], adj=0, col="#0000b3")
text( max(num_sequences)*0.07, max(num_taxa), table(is_avg_one)[["FALSE"]], adj=1, col="#ed6278")
# orthologs are blue, paralogs are orange
point_color_vec = ifelse(is_avg_one,"#0000b344","#ec701444")
average_seqs_per_sp = num_sequences/num_taxa
plot(num_sequences, average_seqs_per_sp, xlab="Total sequences in cluster", ylab="Average seqs per species",
main=paste("Average number of seqs per taxon of",nrow(cluster_data),"orthogroups"),
cex.axis=1.4, cex.lab=1.4, #xlim=c(0,30), ylim=c(0,6),
cex=2, col=point_color_vec, pch=16)
abline(a=0, b=1/species_count , lwd=1, lty=2, col="#00000044" ) # limit of number of species
abline(a=0, b=1/min(num_taxa) , lwd=1, lty=2, col="#00000044" ) # limit of -s , minimum taxa per cluster
text( max(num_sequences)*0.08, max(average_seqs_per_sp), ":", cex=1.3)
text( max(num_sequences)*0.09, max(average_seqs_per_sp), table(is_avg_one)[["TRUE"]], adj=0, col="#0000b3")
text( max(num_sequences)*0.07, max(average_seqs_per_sp), table(is_avg_one)[["FALSE"]], adj=1, col="#ec7014")
# lower plot
# change color of paralogs to pink
point_color_vec = ifelse(is_avg_one,"#0000b344","#dd349744")
plot(num_sequences, max_per_taxon, xlab="Total sequences in cluster", ylab="Most seqs in a species",
main=paste("Most seqs from a single taxon for",nrow(cluster_data),"orthogroups"),
cex.axis=1.4, cex.lab=1.4, #xlim=c(0,30), ylim=c(0,12),
cex=2, col=point_color_vec, pch=16)
abline(a=1-min(num_taxa), b=1, lwd=1, lty=2, col="#00000044" ) # limit of -s
abline(v=min(num_sequences), lwd=1, lty=2, col="#00000044" ) # vertical line limit of -z
text( max(num_sequences)*0.08, max(max_per_taxon), ":", cex=1.3)
text( max(num_sequences)*0.09, max(max_per_taxon), table(is_avg_one)[["TRUE"]], adj=0, col="#0000b3")
text( max(num_sequences)*0.07, max(max_per_taxon), table(is_avg_one)[["FALSE"]], adj=1, col="#dd3497")
# change color of paralogs to orange
has_more_than_two = (num_taxa > 2) # exclude clusters with only 2 taxa, as median is meaningless
median_vs_min = (med_per_taxon - min_per_taxon)[has_more_than_two]
max_vs_median = (max_per_taxon - med_per_taxon)[has_more_than_two]
point_color_vec = ifelse(is_avg_one,"#0000b344","#dd973444")[has_more_than_two]
plot( jitter(median_vs_min), jitter(max_vs_median), xlab="Median minus minimum", ylab="Maximum minus median",
main=paste("Relative differences of min-med and med-max for",sum(has_more_than_two),"orthogroups"),
cex.axis=1.4, cex.lab=1.4, #xlim=c(0,30), ylim=c(0,12),
cex=2, col=point_color_vec, pch=16)
#d = data.frame(num_sequences, max_per_taxon)
#image(as.matrix(log10(table(d))), col=terrain.colors(10,rev = TRUE))
dev.off()
print("Done")
#