From ca1a27f2ec97c1768bd4d7884f51d87146bfdb95 Mon Sep 17 00:00:00 2001 From: Brian Palmer Date: Tue, 7 Mar 2023 10:19:12 -0800 Subject: [PATCH] paused append number of mapped amplicons to the sample coverage file (#69) sample coverage work (#69) --- R_code/postdada_rearrange.R | 95 ++++++++++++++++++++++++------------- main.nf | 19 ++++---- 2 files changed, 71 insertions(+), 43 deletions(-) diff --git a/R_code/postdada_rearrange.R b/R_code/postdada_rearrange.R index 0c33361..89ff73b 100644 --- a/R_code/postdada_rearrange.R +++ b/R_code/postdada_rearrange.R @@ -8,7 +8,8 @@ parser$add_argument('--masked-fasta', type="character") parser$add_argument('--dada2-output', type="character", required = TRUE) parser$add_argument('--alignment-threshold', type="integer", default = 60) parser$add_argument('--parallel', action='store_true') -parser$add_argument('--n-cores', type = 'integer', default = -1) +parser$add_argument('--n-cores', type = 'integer', default = -1, help = "Number of cores to use. Ignored if running parallel flag is unset.") +parser$add_argument('--sample-coverage', type="character", help = "Sample coverage file from QC to append sample coverage statistics that are P. 
falciparum specific.") args <- parser$parse_args() print(args) @@ -22,19 +23,21 @@ library(muscle) library(BSgenome) library(tidyr) library(doMC) - +library(tibble) ## Postprocessing QC +### Print allele table before masking (for debugging) + seqtab.nochim <- readRDS(args$dada2_output) seqtab.nochim.df = as.data.frame(seqtab.nochim) seqtab.nochim.df$sample = rownames(seqtab.nochim) seqtab.nochim.df[seqtab.nochim.df==0]=NA pat="-1A_|-1B_|-1_|-2_|-1AB_|-1B2_" -seqtab.nochim.df = seqtab.nochim.df %>% - pivot_longer(cols = seq(1,ncol(seqtab.nochim)),names_to = "asv",values_to = "reads",values_drop_na=TRUE) %>% - mutate(locus = paste0(sapply(strsplit(sample,"_"),"[",1),"_",sapply(strsplit(sample,"_"),"[",2),"_",sapply(strsplit(sample,"_"),"[",3)))%>% - mutate(sampleID = sapply(strsplit(sapply(strsplit(sample,pat),"[",2),"_trimmed"),"[",1)) %>% +seqtab.nochim.df = seqtab.nochim.df %>% + pivot_longer(cols = seq(1,ncol(seqtab.nochim)),names_to = "asv",values_to = "reads",values_drop_na=TRUE) %>% + mutate(locus = paste0(sapply(strsplit(sample,"_"),"[",1),"_",sapply(strsplit(sample,"_"),"[",2),"_",sapply(strsplit(sample,"_"),"[",3)))%>% + mutate(sampleID = sapply(strsplit(sapply(strsplit(sample,pat),"[",2),"_trimmed"),"[",1)) %>% select(sampleID,locus,asv,reads) temp = seqtab.nochim.df %>% select(locus,asv) %>% distinct() @@ -42,7 +45,7 @@ loci =unique(temp$locus) k=1 allele.sequences = data.frame(locus = seq(1,nrow(temp)),allele = seq(1,nrow(temp)),sequence = seq(1,nrow(temp))) for(i in seq(1,length(loci))){ - temp2 = temp %>% filter(locus==loci[i]) + temp2 = temp %>% filter(locus==loci[i]) for(j in seq(1,nrow(temp2))){ allele.sequences$locus[k+j-1] = loci[i] allele.sequences$allele[k+j-1] = paste0(loci[i],".",j) @@ -51,12 +54,12 @@ for(i in seq(1,length(loci))){ k=k+nrow(temp2) } -allele.data = seqtab.nochim.df %>% - left_join(allele.sequences %>% select(-locus),by=c("asv"="sequence")) %>% +allele.data = seqtab.nochim.df %>% + left_join(allele.sequences %>% 
select(-locus),by=c("asv"="sequence")) %>% group_by(sampleID,locus,allele) %>% - mutate(norm.reads.allele = reads/sum(reads))%>% + mutate(norm.reads.allele = reads/sum(reads))%>% group_by(sampleID,locus) %>% - mutate(norm.reads.locus = reads/sum(reads))%>% + mutate(norm.reads.locus = reads/sum(reads))%>% mutate(n.alleles = n()) saveRDS(allele.data,file="pre_processed_allele_table.RDS") @@ -170,12 +173,6 @@ if (length(non_overlaps_idx) > 0) { combined <- c(combined, paste(c(df_non_overlap[idx, ]$processed_left_sequences, df_non_overlap[idx, ]$processed_right_sequences), collapse = "")) } df_non_overlap$combined <- combined - # df_non_overlap, - # combined, - # c( - # processed_left_sequences, - # processed_right_sequences - # ), sep="+") saveRDS(df_non_overlap,file="non_overlapping_seqs.RDS") write.table(df_non_overlap,file="non_overlapping_seqs.txt",quote=F,sep="\t",col.names=T,row.names=F) @@ -342,22 +339,33 @@ if (!is.null(args$homopolymer_threshold) && args$homopolymer_threshold > 0) { seqtab.nochim.df$original <- base::rownames(seqtab.nochim.df) df_seqs <- inner_join(df_seqs, seqtab.nochim.df, by = "original") - seqtab.nochim.df <- df_seqs %>% - select(-c(original, hapseq, refseq, refid, score, indels)) %>% - distinct() %>% - pivot_longer( - cols = -c(asv_prime), - names_to = "sample", - values_to = "counts" - ) %>% - group_by(sample, asv_prime) %>% - summarise(counts = sum(counts)) %>% - pivot_wider(names_from = asv_prime, values_from = counts) %>% - # filter(counts != 0) %>% - ungroup() + # seqtab.nochim.df <- df_seqs %>% + # select(-c(original, hapseq, refseq, refid, score, indels)) %>% + # distinct() %>% + # pivot_longer( + # cols = -c(asv_prime), + # names_to = "sample", + # values_to = "counts" + # ) %>% + # group_by(sample, asv_prime) %>% + # summarise(counts = sum(counts)) %>% + # pivot_wider(names_from = asv_prime, values_from = counts) %>% + # # filter(counts != 0) %>% + # ungroup() + + seqtab.nochim.df <- df_seqs %>% + group_by(refid, asv_prime) %>% 
+ summarise(across(-c(original, hapseq, refseq, score, indels), sum)) %>% + ungroup() %>% + select(-c(refid)) + + seqtab.nochim.df <- column_to_rownames(seqtab.nochim.df, var = "asv_prime") + seqtab.nochim.df <- as.data.frame(t(seqtab.nochim.df)) + seqtab.nochim.df$sample = rownames(seqtab.nochim.df) seqtab.nochim.df[seqtab.nochim.df==0]=NA + } else { seqtab.nochim.df = as.data.frame(seqtab.nochim) seqtab.nochim.df$sample = rownames(seqtab.nochim) @@ -398,11 +406,30 @@ allele.data = seqtab.nochim.df %>% saveRDS(allele.data,file="allele_data.RDS") write.table(allele.data,file="allele_data.txt",quote=F,sep="\t",col.names=T,row.names=F) +## QC Postprocessing + +if (!is.null(args$sample_coverage) && file.exists(args$sample_coverage)) { + sample.coverage <- read.table(args$sample_coverage, header = FALSE, sep = "\t") %>% + pivot_wider(names_from = V2, values_from = V3) + + qc.postproc <- allele.data %>% + left_join(sample.coverage, by = c("sampleID" = "V1")) %>% + group_by(sampleID) %>% + select(sampleID, Input, `No Dimers`, Amplicons, reads) %>% + group_by(sampleID) %>% + mutate(Amplicons.Pf = sum(reads)) %>% + select(-c(reads)) %>% + distinct() %>% + pivot_longer(cols = c(Input, `No Dimers`, Amplicons, Amplicons.Pf)) + + write.table(qc.postproc, quote=F,sep='\t',col.names = F, row.names = F, file = args$sample_coverage) +} + # get memory footprint of environment -# out <- as.data.frame(sort( sapply(ls(),function(x){object.size(get(x))}))) -# colnames(out) <- "bytes" -# out <- out %>% dplyr::mutate(MB = bytes / 1e6, GB = bytes / 1e9) -# write.csv(out, "postproc_memory_profile.csv") +out <- as.data.frame(sort( sapply(ls(),function(x){object.size(get(x))}))) +colnames(out) <- "bytes" +out <- out %>% dplyr::mutate(MB = bytes / 1e6, GB = bytes / 1e9) +write.csv(out, "postproc_memory_profile.csv") # saveRDS(df_seqs, file = "df_seqs.RDS") # saveRDS(df_final, file = "df_final.RDS") diff --git a/main.nf b/main.nf index 70dbf20..c3bebae 100644 --- a/main.nf +++ b/main.nf @@ 
-41,13 +41,12 @@ workflow { // All workflows will produce a QC report (cutadapt needed by QC for length statistics) CUTADAPT(read_pairs_ch, params.amplicon_info, params.cutadapt_minlen, qualfilter) - // QUALITY_CHECK(read_pairs_ch, CUTADAPT.out[1].collect(), params.amplicon_info) + QUALITY_CHECK(read_pairs_ch, CUTADAPT.out[1].collect(), params.amplicon_info) if (params.QC_only == false) { DADA2_ANALYSIS(CUTADAPT.out[0].collect(), params.amplicon_info) - // Create reference sequences from genome if (params.refseq_fasta == null) { @@ -59,7 +58,7 @@ workflow { MASK_SEQUENCES(CREATE_REFERENCE_SEQUENCES.out[0], 0, 0) } - DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, CREATE_REFERENCE_SEQUENCES.out[0], MASK_SEQUENCES.out[0], params.parallel) + DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, CREATE_REFERENCE_SEQUENCES.out[0], MASK_SEQUENCES.out[0], params.parallel, QUALITY_CHECK.out[0].collect()) RESISTANCE_MARKERS(DADA2_POSTPROC.out[0], CREATE_REFERENCE_SEQUENCES.out[0], params.codontable, params.resmarkers_amplicon) @@ -71,13 +70,13 @@ workflow { MASK_SEQUENCES(params.refseq_fasta, 0, 0) } - DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, params.refseq_fasta, MASK_SEQUENCES.out[0], params.parallel) + DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, params.refseq_fasta, MASK_SEQUENCES.out[0], params.parallel, QUALITY_CHECK.out[0].collect()) RESISTANCE_MARKERS(DADA2_POSTPROC.out[0], params.refseq_fasta, params.codontable, params.resmarkers_amplicon) } else { - DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, params.refseq_fasta, params.masked_fasta, params.parallel) + DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, params.refseq_fasta, params.masked_fasta, params.parallel, QUALITY_CHECK.out[0].collect()) RESISTANCE_MARKERS(DADA2_POSTPROC.out[0], params.refseq_fasta, params.codontable, params.resmarkers_amplicon) @@ -307,6 +306,7 @@ process DADA2_POSTPROC 
{ path refseq_fasta path masked_fasta val parallel + path coverage_files output: path '*.{RDS,txt,csv}' @@ -315,8 +315,7 @@ process DADA2_POSTPROC { if (parallel == true) """ - echo $parallel triggered true - + sample_coverage="\$(echo $coverage_files | tr ' ' '\n' | grep sample_coverage.txt)" seqtab_nochim_rds="\$(echo $rdatafile | tr ' ' '\n' | grep *.RDS)" Rscript ${params.scriptDIR}/postdada_rearrange.R \ @@ -325,11 +324,12 @@ process DADA2_POSTPROC { --refseq-fasta ${refseq_fasta} \ --masked-fasta ${masked_fasta} \ --n-cores ${params.n_cores} \ + --sample-coverage \${sample_coverage} \ --parallel """ else """ - echo $parallel triggered false + sample_coverage="\$(echo $coverage_files | tr ' ' '\n' | grep sample_coverage.txt)" seqtab_nochim_rds="\$(echo $rdatafile | tr ' ' '\n' | grep *.RDS)" Rscript ${params.scriptDIR}/postdada_rearrange.R \ @@ -337,7 +337,8 @@ process DADA2_POSTPROC { --homopolymer-threshold ${homopolymer_threshold} \ --refseq-fasta ${refseq_fasta} \ --masked-fasta ${masked_fasta} \ - --n-cores ${params.n_cores} + --n-cores ${params.n_cores} \ + --sample-coverage \${sample_coverage} """ }