Merge pull request #3 from l-mansouri/analysis

Analysis Revision Update
l-mansouri · Aug 23, 2024 · c0f970f · c0f970f
2 parents ec93e05 + d65ea4a
commit c0f970f
Show file tree

Hide file tree

Showing 29 changed files with 1,122 additions and 52 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@ work
 results
 analysis/source_data
 analysis/tables
+analysis/plots
diff --git a/analysis/00_extract_percid.R b/analysis/00_extract_percid.R
@@ -0,0 +1,53 @@
+aligners=c('mTMalign')
+trimming=c('untrimmed')
+source_data = '/home/luisasantus/Desktop/crg_cluster/projects/Phylo-IMD/analysis/source_data/'
+plots = paste(source_data, '../plots/', sep = '')
+
+fl=read.table(paste(source_data, 'list_of_families_with_all_rep_in_3d', sep = '/'))[,1]
+
+folder_percid = '/home/luisasantus/Desktop/crg_cluster/projects/Phylo-IMD/results/mTMalign_percid/'
+
+df = data.frame()
+# iterate all files and extract line TOP 
+for (fam in fl){
+    print(fam)
+    file = paste(folder_percid, fam, '.txt', sep = '')
+    # read in txt file line by line 
+    con <- file(file, "r")
+
+    # Read the file line by line
+    while(TRUE) {
+    line <- readLines(con, n = 1, warn = FALSE)
+
+    # Break the loop if we have reached the end of the file
+    if (length(line) == 0) {
+      break
+    }
+
+    # # Check if the line starts with "TOP"
+    if (grepl("^TOP", line)) {
+        # Split the line into columns based on whitespace
+        cols <- strsplit(line, "\\s+")[[1]]
+
+        # Extract the last three columns
+        extracted_cols <- cols[(length(cols)-2):length(cols)]
+        # add family name to the data frame
+        extracted_cols = c(extracted_cols, fam)
+
+        # Append the extracted columns to the data frame
+        df <- rbind(df, as.data.frame(t(extracted_cols), stringsAsFactors = FALSE))
+
+    }
+    # add family name to the data frame
+
+    }
+    # close the file
+    close(con)
+}
+
+# change column names
+colnames(df) = c("seq1", "seq2", "percid", "family")
+# make sure percid is numeric
+df$percid = as.numeric(df$percid)
+# save the data frame
+write.table(df, paste(source_data, 'percids.txt', sep = ''), row.names = FALSE, quote = FALSE)
diff --git a/analysis/03_prep_treelikeness_sourcedata.R b/analysis/03_prep_treelikeness_sourcedata.R
@@ -153,4 +153,4 @@ for (al in aligners){
         }
 
     }
-}
+}
diff --git a/analysis/04_treelikeness_plots.R b/analysis/04_treelikeness_plots.R
@@ -13,21 +13,27 @@ tr="untrimmed"
 
 # Square everything 
 mtm = mtm^2
+# keep only the columns of interest
+mtm = mtm[,c("COR1dme", "CORtmme", "COR3dme")]
+# save in souce_data
+write.table(mtm, paste('source_data/Fig1A.csv', sep = ""), row.names = F, col.names = T)
 
 median_seq = round(median(mtm$COR1dme),3)
 median_imd = round(median(mtm$COR3dme),3)
 median_tm  = round(median(mtm$CORtmme),3)
-label_seq  = paste("Seq-ME \n (median: ",median_seq, ")", sep = "")
-label_imd  = paste("IMD-ME \n (median: ",median_imd, ")", sep = "")
-label_tm  = paste("TM-ME \n (median: ",median_tm, ")", sep = "")
-label_seq = "Seq-ME"
-label_tm = "TM-ME"
-label_imd = "IMD-ME"
+label_seq  = paste("ME \n (median: ",median_seq, ")", sep = "")
+label_imd  = paste("IMD \n (median: ",median_imd, ")", sep = "")
+label_tm  = paste("TM \n (median: ",median_tm, ")", sep = "")
+label_seq = "ME"
+label_tm = "TM"
+label_imd = "IMD"
 df111=data.frame(correlations=c(mtm$COR1dme, mtm$CORtmme, mtm$COR3dme),
     type=factor(rep(c(label_seq, label_tm, label_imd), each=length(mtm[,1])), levels=c(label_seq, label_tm, label_imd)))
 
 palette = c("#27A592", "#2191FB", 158 )
 palette = c("#A331D8","#27A592", "#2191FB" )
+palette = c("#A331D8","#ffaf1c", "#2191FB" )
+palette = c("#f8766d", "#A331D8", "#00ba39")
 #palette = c(158,"#27A592", "#2191FB" )
 
 col_seq = palette[1]
@@ -40,9 +46,9 @@ p_sat1=ggplot(df111, aes(x=type, y=correlations, fill=type, col = type, alpha =
     geom_boxplot(outlier.size=0.1, lwd=0.6, linetype = "solid" )+
     xlab('')+
     ylab('R² (input distances, patristic distances)')+
-    annotate('text', x=df111$type[1], y=1.1, label=label_seq, vjust=1.7, col = col_seq, size = 6)+
-    annotate('text', x=df111$type[513], y=1.1, label=label_tm, vjust=1.7, col = col_tm, size = 6)+
-    annotate('text', x=df111$type[1025], y=1.1, label=label_imd, vjust=1.7, col = col_imd, size = 6)+
+    annotate('text', x=df111$type[1], y=1.15, label=label_seq, vjust=1.7, col = col_seq, size = 6)+
+    annotate('text', x=df111$type[513], y=1.15, label=label_tm, vjust=1.7, col = col_tm, size = 6)+
+    annotate('text', x=df111$type[1025], y=1.15, label=label_imd, vjust=1.7, col = col_imd, size = 6)+
     ylim(0,1.19)+
     scale_fill_manual(values=palette)+
     scale_color_manual(values=palette)+
@@ -60,10 +66,14 @@ p_sat1=ggplot(df111, aes(x=type, y=correlations, fill=type, col = type, alpha =
     geom_segment(aes(x = 0, xend = 0, y = 0, yend = 1), linetype = "solid", color = "black")+
     annotate('text', x=df111$type[1], y=0.4, label=( paste("median =", median_seq)), vjust=1.7, col = "black", size = 3.5)+
     annotate('text', x=df111$type[513], y=0.53, label=(paste("median =", median_tm)), vjust=1.7, col = "black", size = 3.5)+
-    annotate('text', x=df111$type[1025], y=0.53, label=(paste("median =", median_imd)), vjust=1.7, col = "black", size = 3.5)+
+    annotate('text', x=df111$type[1025], y=0.58, label=(paste("median =", median_imd)), vjust=1.7, col = "black", size = 3.5)+
   theme(panel.background = element_rect(fill = "transparent", color = NA))+ theme(plot.background = element_rect(color = NA))
+
+
+p_sat1 <- p_sat1 & plot_annotation(tag_levels = list("A"))
 ggsave('plots/main/Fig1_correlation_input_patristic_mtmuntrimmed_508.png', plot=p_sat1, width = 6, height = 7, dpi = 300)
 
+
 # mtm df columns COR1dme, CORtmme, COR3dme and rename as Seq-ME, TM-ME, IMD-ME
 mtm = mtm[,c("COR1dme", "CORtmme", "COR3dme")]
 colnames(mtm) = c("SeqME", "TMME", "IMDME")
@@ -92,8 +102,8 @@ plot_correlation_patristic <- function(al,tr, tag = ""){
 
   median_seq = round(median(mtm$COR1dme),3)
   median_imd = round(median(mtm$COR3dme),3)
-  label_seq  = paste("Seq-ME \n (median: ",median_seq, ")", sep = "")
-  label_imd  = paste("IMD-ME \n (median: ",median_imd, ")", sep = "")
+  label_seq  = paste("ME \n (median: ",median_seq, ")", sep = "")
+  label_imd  = paste("IMD \n (median: ",median_imd, ")", sep = "")
 
   df111=data.frame(correlations=c(mtm$COR1dme, mtm$COR3dme),
                    type=factor(rep(c(label_seq, label_imd), each=length(mtm[,1])), levels=c(label_seq, label_imd)))
-Original file line number
+Diff line change
@@ Expand Up / @@ -153,4 +153,4 @@ for (al in aligners){ @@
             }
         }
-    }
+    }