Skip to content

Commit

Permalink
Merge pull request #3 from l-mansouri/analysis
Browse files Browse the repository at this point in the history
Analysis Revision Update
  • Loading branch information
luisas authored Aug 23, 2024
2 parents ec93e05 + d65ea4a commit c0f970f
Show file tree
Hide file tree
Showing 29 changed files with 1,122 additions and 52 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ work
results
analysis/source_data
analysis/tables
analysis/plots
53 changes: 53 additions & 0 deletions analysis/00_extract_percid.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
aligners=c('mTMalign')
trimming=c('untrimmed')
source_data = '/home/luisasantus/Desktop/crg_cluster/projects/Phylo-IMD/analysis/source_data/'
plots = paste(source_data, '../plots/', sep = '')

fl=read.table(paste(source_data, 'list_of_families_with_all_rep_in_3d', sep = '/'))[,1]

folder_percid = '/home/luisasantus/Desktop/crg_cluster/projects/Phylo-IMD/results/mTMalign_percid/'

df = data.frame()
# iterate all files and extract line TOP
for (fam in fl){
print(fam)
file = paste(folder_percid, fam, '.txt', sep = '')
# read in txt file line by line
con <- file(file, "r")

# Read the file line by line
while(TRUE) {
line <- readLines(con, n = 1, warn = FALSE)

# Break the loop if we have reached the end of the file
if (length(line) == 0) {
break
}

# # Check if the line starts with "TOP"
if (grepl("^TOP", line)) {
# Split the line into columns based on whitespace
cols <- strsplit(line, "\\s+")[[1]]

# Extract the last three columns
extracted_cols <- cols[(length(cols)-2):length(cols)]
# add family name to the data frame
extracted_cols = c(extracted_cols, fam)

# Append the extracted columns to the data frame
df <- rbind(df, as.data.frame(t(extracted_cols), stringsAsFactors = FALSE))

}
# add family name to the data frame

}
# close the file
close(con)
}

# change column names
colnames(df) = c("seq1", "seq2", "percid", "family")
# make sure percid is numeric
df$percid = as.numeric(df$percid)
# save the data frame
write.table(df, paste(source_data, 'percids.txt', sep = ''), row.names = FALSE, quote = FALSE)
2 changes: 1 addition & 1 deletion analysis/03_prep_treelikeness_sourcedata.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,4 @@ for (al in aligners){
}

}
}
}
34 changes: 22 additions & 12 deletions analysis/04_treelikeness_plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,27 @@ tr="untrimmed"

# Square everything
mtm = mtm^2
# keep only the columns of interest
mtm = mtm[,c("COR1dme", "CORtmme", "COR3dme")]
# save in souce_data
write.table(mtm, paste('source_data/Fig1A.csv', sep = ""), row.names = F, col.names = T)

median_seq = round(median(mtm$COR1dme),3)
median_imd = round(median(mtm$COR3dme),3)
median_tm = round(median(mtm$CORtmme),3)
label_seq = paste("Seq-ME \n (median: ",median_seq, ")", sep = "")
label_imd = paste("IMD-ME \n (median: ",median_imd, ")", sep = "")
label_tm = paste("TM-ME \n (median: ",median_tm, ")", sep = "")
label_seq = "Seq-ME"
label_tm = "TM-ME"
label_imd = "IMD-ME"
label_seq = paste("ME \n (median: ",median_seq, ")", sep = "")
label_imd = paste("IMD \n (median: ",median_imd, ")", sep = "")
label_tm = paste("TM \n (median: ",median_tm, ")", sep = "")
label_seq = "ME"
label_tm = "TM"
label_imd = "IMD"
df111=data.frame(correlations=c(mtm$COR1dme, mtm$CORtmme, mtm$COR3dme),
type=factor(rep(c(label_seq, label_tm, label_imd), each=length(mtm[,1])), levels=c(label_seq, label_tm, label_imd)))

palette = c("#27A592", "#2191FB", 158 )
palette = c("#A331D8","#27A592", "#2191FB" )
palette = c("#A331D8","#ffaf1c", "#2191FB" )
palette = c("#f8766d", "#A331D8", "#00ba39")
#palette = c(158,"#27A592", "#2191FB" )

col_seq = palette[1]
Expand All @@ -40,9 +46,9 @@ p_sat1=ggplot(df111, aes(x=type, y=correlations, fill=type, col = type, alpha =
geom_boxplot(outlier.size=0.1, lwd=0.6, linetype = "solid" )+
xlab('')+
ylab('R² (input distances, patristic distances)')+
annotate('text', x=df111$type[1], y=1.1, label=label_seq, vjust=1.7, col = col_seq, size = 6)+
annotate('text', x=df111$type[513], y=1.1, label=label_tm, vjust=1.7, col = col_tm, size = 6)+
annotate('text', x=df111$type[1025], y=1.1, label=label_imd, vjust=1.7, col = col_imd, size = 6)+
annotate('text', x=df111$type[1], y=1.15, label=label_seq, vjust=1.7, col = col_seq, size = 6)+
annotate('text', x=df111$type[513], y=1.15, label=label_tm, vjust=1.7, col = col_tm, size = 6)+
annotate('text', x=df111$type[1025], y=1.15, label=label_imd, vjust=1.7, col = col_imd, size = 6)+
ylim(0,1.19)+
scale_fill_manual(values=palette)+
scale_color_manual(values=palette)+
Expand All @@ -60,10 +66,14 @@ p_sat1=ggplot(df111, aes(x=type, y=correlations, fill=type, col = type, alpha =
geom_segment(aes(x = 0, xend = 0, y = 0, yend = 1), linetype = "solid", color = "black")+
annotate('text', x=df111$type[1], y=0.4, label=( paste("median =", median_seq)), vjust=1.7, col = "black", size = 3.5)+
annotate('text', x=df111$type[513], y=0.53, label=(paste("median =", median_tm)), vjust=1.7, col = "black", size = 3.5)+
annotate('text', x=df111$type[1025], y=0.53, label=(paste("median =", median_imd)), vjust=1.7, col = "black", size = 3.5)+
annotate('text', x=df111$type[1025], y=0.58, label=(paste("median =", median_imd)), vjust=1.7, col = "black", size = 3.5)+
theme(panel.background = element_rect(fill = "transparent", color = NA))+ theme(plot.background = element_rect(color = NA))


p_sat1 <- p_sat1 & plot_annotation(tag_levels = list("A"))
ggsave('plots/main/Fig1_correlation_input_patristic_mtmuntrimmed_508.png', plot=p_sat1, width = 6, height = 7, dpi = 300)


# mtm df columns COR1dme, CORtmme, COR3dme and rename as Seq-ME, TM-ME, IMD-ME
mtm = mtm[,c("COR1dme", "CORtmme", "COR3dme")]
colnames(mtm) = c("SeqME", "TMME", "IMDME")
Expand Down Expand Up @@ -92,8 +102,8 @@ plot_correlation_patristic <- function(al,tr, tag = ""){

median_seq = round(median(mtm$COR1dme),3)
median_imd = round(median(mtm$COR3dme),3)
label_seq = paste("Seq-ME \n (median: ",median_seq, ")", sep = "")
label_imd = paste("IMD-ME \n (median: ",median_imd, ")", sep = "")
label_seq = paste("ME \n (median: ",median_seq, ")", sep = "")
label_imd = paste("IMD \n (median: ",median_imd, ")", sep = "")

df111=data.frame(correlations=c(mtm$COR1dme, mtm$COR3dme),
type=factor(rep(c(label_seq, label_imd), each=length(mtm[,1])), levels=c(label_seq, label_imd)))
Expand Down
Loading

0 comments on commit c0f970f

Please sign in to comment.