diff --git a/Apptainer b/Apptainer index 3159424..28e1d49 100644 --- a/Apptainer +++ b/Apptainer @@ -3,7 +3,7 @@ From: rocker/r-ubuntu:22.04 %post -apt-get update && apt-get install -y build-essential python3-pip libbz2-dev libsdl1.2-dev liblzma-dev libcurl4-openssl-dev zlib1g-dev libxml2-dev libssl-dev r-cran-tidyverse trf bwa bcftools samtools default-jre && rm -rf /var/lib/apt/lists/* +apt-get update && apt-get install -y build-essential python3-pip libbz2-dev libsdl1.2-dev liblzma-dev libcurl4-openssl-dev zlib1g-dev libxml2-dev libssl-dev trf bwa bcftools samtools default-jre pandoc && rm -rf /var/lib/apt/lists/* pip install cutadapt==4.1 mkdir -p /usr/local/lib/R/etc/ /usr/lib/R/etc/ @@ -12,6 +12,7 @@ R -e 'install.packages("remotes")' # Update apt-get Rscript -e 'install.packages("remotes", version = "2.4.2")' +Rscript -e 'remotes::install_cran("tidyverse", version = "2.0.0")' Rscript -e 'remotes::install_cran("ggbeeswarm", upgrade="never", version="0.6.1")' Rscript -e 'remotes::install_cran("gridExtra",upgrade="never", version = "2.3")' Rscript -e 'remotes::install_cran("rmarkdown",upgrade="never", version = "2.17")' diff --git a/Dockerfile b/Dockerfile index da57f28..0afc861 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ FROM rocker/r-ubuntu:22.04 -RUN apt-get update && apt-get install -y --no-install-recommends build-essential python3-pip libssl-dev libbz2-dev libsdl1.2-dev liblzma-dev libcurl4-openssl-dev zlib1g-dev libxml2-dev r-cran-tidyverse trf bwa bcftools samtools default-jre && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -y --no-install-recommends build-essential python3-pip libssl-dev libbz2-dev libsdl1.2-dev liblzma-dev libcurl4-openssl-dev zlib1g-dev libxml2-dev trf bwa bcftools samtools default-jre pandoc && rm -rf /var/lib/apt/lists/* RUN pip install cutadapt==4.1 RUN mkdir -p /usr/local/lib/R/etc/ /usr/lib/R/etc/ @@ -8,6 +8,7 @@ RUN R -e 'install.packages("remotes")' # Update apt-get RUN Rscript -e 'install.packages("remotes", version = "2.4.2")' +RUN Rscript -e 'remotes::install_cran("tidyverse", version = "2.0.0")' RUN Rscript -e 'remotes::install_cran("ggbeeswarm", upgrade="never", version="0.6.1")' RUN Rscript -e 'remotes::install_cran("gridExtra",upgrade="never", version = "2.3")' RUN Rscript -e 'remotes::install_cran("rmarkdown",upgrade="never", version = "2.17")' diff --git a/R_code/dada_overlaps.R b/R_code/dada_overlaps.R index 8fe168f..9f05e03 100644 --- a/R_code/dada_overlaps.R +++ b/R_code/dada_overlaps.R @@ -12,7 +12,6 @@ parser$add_argument('--band-size', type='integer', default=16) parser$add_argument('--omega-a', type='double', default=1e-120) parser$add_argument('--concat-non-overlaps', action='store_true') parser$add_argument('--use-quals', type="character", default="false") -parser$add_argument('--homop-gap-penalty', type='integer') # null if not set, which is the default for dada2 parser$add_argument('--maxEE', type="integer", default=2) @@ -61,16 +60,14 @@ pool=switch( "pseudo" = "pseudo" ) -use_quals=switch( - args$use_quals, - "true" = TRUE, - "false" = FALSE -) - -homop_gap_penalty <- ifelse(args$homop_gap_penalty <= 0, NULL, args$homop_gap_penalty) +# use_quals=switch( +# args$use_quals, +# "true" = TRUE, +# "false" = FALSE +# ) -dadaFs <- dada(derepFs, err=errF, selfConsist=TRUE, multithread=TRUE, verbose=TRUE, pool=pool, BAND_SIZE=args$band_size, OMEGA_A=args$omega_a, USE_QUALS=use_quals, HOMOPOLYMER_GAP_PENALTY=homop_gap_penalty) -dadaRs <- dada(derepRs, err=errR, selfConsist=TRUE, multithread=TRUE, verbose=TRUE, pool=pool, BAND_SIZE=args$band_size, OMEGA_A=args$omega_a, USE_QUALS=use_quals, HOMOPOLYMER_GAP_PENALTY=homop_gap_penalty) +dadaFs <- dada(derepFs, err=errF, selfConsist=TRUE, multithread=TRUE, verbose=TRUE, pool=pool, BAND_SIZE=args$band_size, OMEGA_A=args$omega_a) +dadaRs <- dada(derepRs, err=errR, selfConsist=TRUE, multithread=TRUE, verbose=TRUE, pool=pool, BAND_SIZE=args$band_size, OMEGA_A=args$omega_a) if(args$concat_non_overlaps){ diff --git a/R_code/postdada_rearrange.R b/R_code/postdada_rearrange.R index 36ca819..916bf33 100644 --- a/R_code/postdada_rearrange.R +++ b/R_code/postdada_rearrange.R @@ -11,16 +11,66 @@ parser$add_argument('--parallel', action='store_true') parser$add_argument('--n-cores', type = 'integer', default = -1) args <- parser$parse_args() +print(args) library(stringr) library(dplyr) library(dada2) library(foreach) library(parallel) -library(doMC) library(muscle) library(BSgenome) library(tidyr) +library(doMC) +library(tibble) + + +# FOR DEBUGGING +# args <- list() +# args$homopolymer_threshold <- 5 +# args$refseq_fasta <- "v4_refseq.fasta" +# args$masked_fasta <- "v4_refseq.fasta.2.7.7.80.10.25.3.mask" +# args$dada2_output <- "seqtab.nochim.RDS" +# args$parallel <- FALSE +# args$alignment_threshold <- 60 + +## Postprocessing QC + +seqtab.nochim <- readRDS(args$dada2_output) +seqtab.nochim.df = as.data.frame(seqtab.nochim) +seqtab.nochim.df$sample = rownames(seqtab.nochim) +seqtab.nochim.df[seqtab.nochim.df==0]=NA +pat="-1A_|-1B_|-1_|-2_|-1AB_|-1B2_" +seqtab.nochim.df = seqtab.nochim.df %>% + pivot_longer(cols = seq(1,ncol(seqtab.nochim)),names_to = "asv",values_to = "reads",values_drop_na=TRUE) %>% + mutate(locus = paste0(sapply(strsplit(sample,"_"),"[",1),"_",sapply(strsplit(sample,"_"),"[",2),"_",sapply(strsplit(sample,"_"),"[",3)))%>% + mutate(sampleID = sapply(strsplit(sapply(strsplit(sample,pat),"[",2),"_trimmed"),"[",1)) %>% + select(sampleID,locus,asv,reads) + +temp = seqtab.nochim.df %>% select(locus,asv) %>% distinct() +loci =unique(temp$locus) +k=1 +allele.sequences = data.frame(locus = seq(1,nrow(temp)),allele = seq(1,nrow(temp)),sequence = seq(1,nrow(temp))) +for(i in seq(1,length(loci))){ + temp2 = temp %>% filter(locus==loci[i]) + for(j in seq(1,nrow(temp2))){ + allele.sequences$locus[k+j-1] = loci[i] + allele.sequences$allele[k+j-1] = paste0(loci[i],".",j) + allele.sequences$sequence[k+j-1] = temp2$asv[j] + } + k=k+nrow(temp2) +} + +allele.data = seqtab.nochim.df %>% + left_join(allele.sequences %>% select(-locus),by=c("asv"="sequence")) %>% + group_by(sampleID,locus,allele) %>% + mutate(norm.reads.allele = reads/sum(reads))%>% + group_by(sampleID,locus) %>% + mutate(norm.reads.locus = reads/sum(reads))%>% + mutate(n.alleles = n()) + +saveRDS(allele.data,file="pre_processed_allele_table.RDS") +write.table(allele.data,file="pre_processed_allele_table.txt",quote=F,sep="\t",col.names=T,row.names=F) seqtab.nochim <- readRDS(args$dada2_output) @@ -35,14 +85,6 @@ non_overlaps_idx <- which(str_detect(sequences, paste(rep("N", 10), collapse = " if (length(non_overlaps_idx) > 0) { - # Calculate the difference in length between - # real sequences, and see if the delta sequence - # has no conflicting bases at each position. If - # they do, overwrite with N's and collapse the - # resolved sequences. Else if each position is unique, - # assume that the sequences are part of the resolved - # sequence. - non_overlaps <- sequences[non_overlaps_idx] non_overlaps_split = strsplit(non_overlaps, paste(rep("N", 10), collapse = "")) @@ -130,12 +172,6 @@ if (length(non_overlaps_idx) > 0) { combined <- c(combined, paste(c(df_non_overlap[idx, ]$processed_left_sequences, df_non_overlap[idx, ]$processed_right_sequences), collapse = "")) } df_non_overlap$combined <- combined - # df_non_overlap, - # combined, - # c( - # processed_left_sequences, - # processed_right_sequences - # ), sep="+") saveRDS(df_non_overlap,file="non_overlapping_seqs.RDS") write.table(df_non_overlap,file="non_overlapping_seqs.txt",quote=F,sep="\t",col.names=T,row.names=F) @@ -144,6 +180,21 @@ if (length(non_overlaps_idx) > 0) { sequences[df_non_overlap$column_indexes] <- df_non_overlap$combined } +## Reference table for sequences - this is to reduce memory usage in intermediate files +df.sequences <- data.frame( + sid = sprintf("S%d", 1:length(sequences)), + sequences = sequences +) + +## Setup parallel backend if asked + +if (args$parallel) { + n_cores <- ifelse(args$n_cores <= 0, detectCores(), args$n_cores) + registerDoMC(n_cores) +} else { + registerDoSEQ() +} + ## II. Check for homopolymers. @@ -154,17 +205,15 @@ if (!is.null(args$homopolymer_threshold) && args$homopolymer_threshold > 0) { sigma <- nucleotideSubstitutionMatrix(match = 2, mismatch = -1, baseOnly = TRUE) - n_cores <- ifelse(args$n_cores <= 0, detectCores(), args$n_cores) - registerDoMC(n_cores) + # This object contains the aligned ASV sequences df_aln <- NULL df_aln <- foreach(seq1 = 1:length(sequences), .combine = "rbind") %dopar% { - seq_1 <- sequences[seq1] - aln <- pairwiseAlignment(ref_sequences, seq_1, substitutionMatrix = sigma, gapOpening = -8, gapExtension = -5, scoreOnly = FALSE) + aln <- pairwiseAlignment(ref_sequences, sequences[seq1], substitutionMatrix = sigma, gapOpening = -8, gapExtension = -5, scoreOnly = FALSE) num <- which.max(score(aln)) patt <- c(alignedPattern(aln[num]), alignedSubject(aln[num])) ind <- sum(str_count(as.character(patt),"-")) data.frame( - original = seq_1, + sid = df.sequences[seq1,]$sid, hapseq = as.character(patt)[2], refseq = as.character(patt)[1], refid = names(patt)[1], @@ -173,287 +222,159 @@ if (!is.null(args$homopolymer_threshold) && args$homopolymer_threshold > 0) { ) } - saveRDS(df_aln,file="alignments.RDS") - write.table(df_aln,file="alignments.txt",quote=F,sep="\t",col.names=T,row.names=F) + # saveRDS(df_aln,file="alignments.RDS") + # write.table(df_aln,file="alignments.txt",quote=F,sep="\t",col.names=T,row.names=F) df_aln <- df_aln %>% filter(score > args$alignment_threshold) masked_sequences <- readDNAStringSet(args$masked_fasta) df_masked <- NULL - if (args$parallel) { - df_masked <- foreach(seq1 = 1:nrow(df_aln), .combine = "rbind") %dopar% { - - seq_1 <- DNAString(df_aln$refseq[seq1]) - - asv_prime <- DNAString(df_aln$hapseq[seq1]) - ref_rle <- Rle(as.vector(seq_1)) + df_masked <- foreach(seq1 = 1:nrow(df_aln), .combine = "rbind") %dopar% { - # additionally, look for potential homopolymers that would exist in our - # pairwise alignment reference sequence if it were not for insertions - # in the sample haplotype in those regions. + seq_1 <- DNAString(df_aln$refseq[seq1]) - ref_rge <- ranges(ref_rle) - mask_ranges <- IRanges() - for (dna_base in DNA_ALPHABET[1:4]) { - dna_ranges <- reduce(ref_rge[runValue(ref_rle) == dna_base | runValue(ref_rle) == "-"]) + asv_prime <- DNAString(df_aln$hapseq[seq1]) + ref_rle <- Rle(as.vector(seq_1)) - vb <- NULL - for (i in 1:length(dna_ranges)) { - vb <- c(vb, sum(as.vector(seq_1[dna_ranges[i]]) %in% dna_base) > args$homopolymer_threshold) - } - - mask_ranges <- append(mask_ranges, dna_ranges[vb]) - } - - maskseq <- getSeq(masked_sequences, df_aln$refid[seq1]) - trseq <- DNAString(as.character(maskseq)) - tr_rle <- Rle(as.vector(trseq)) - - tr_rge <- ranges(tr_rle)[runValue(tr_rle) == "N"] - gap_range <- ref_rge[runValue(ref_rle) == "-"] - - if (length(tr_rge) > 0) { - for (i in 1:length(tr_rge)) { - n_inserts <- sum(as.vector(seq_1[1:start(tr_rge[i])]) == '-') - gaps_over_tr_rge <- gap_range[gap_range %over% shift(tr_rge[i], n_inserts)] - n_over_range_gaps <- ifelse(length(gaps_over_tr_rge) == 0, 0, width(gaps_over_tr_rge)) - new_range <- IRanges(start = start(tr_rge[i]) + n_inserts, end = end(tr_rge[i]) + n_inserts + n_over_range_gaps) - - seq_1[new_range] <- "N" # mask refseq prime - if (length(gaps_over_tr_rge) > 0) { - for (j in 1:length(gaps_over_tr_rge)) { - seq_1[gaps_over_tr_rge[j]] <- "-" - } - } - } - } + # additionally, look for potential homopolymers that would exist in our + # pairwise alignment reference sequence if it were not for insertions + # in the sample haplotype in those regions. - tr_masks <- IRanges() - ref_rle <- Rle(as.vector(seq_1)) - ref_rge <- ranges(ref_rle) - tr_ranges <- reduce(ref_rge[runValue(ref_rle) == 'N' | runValue(ref_rle) == "-"]) + ref_rge <- ranges(ref_rle) + mask_ranges <- IRanges() + for (dna_base in DNA_ALPHABET[1:4]) { + dna_ranges <- reduce(ref_rge[runValue(ref_rle) == dna_base | runValue(ref_rle) == "-"]) - if (length(tr_ranges) > 0) { - for (i in 1:length(tr_ranges)) { - vseq <- as.vector(seq_1[tr_ranges[i]]) - if (all(c('N', '-') %in% vseq) || 'N' %in% vseq) { - mask_ranges <- append(mask_ranges, tr_ranges[i]) - } - } + vb <- NULL + for (i in 1:length(dna_ranges)) { + vb <- c(vb, sum(as.vector(seq_1[dna_ranges[i]]) %in% dna_base) > args$homopolymer_threshold) } - mask_ranges <- unique(sort(mask_ranges)) - mask_ranges <- reduce(mask_ranges) - reference_ranges <- mask_ranges - - pos <- c(DNA_ALPHABET[1:4], "N") - if (length(mask_ranges) > 0) { - for (i in 1:length(mask_ranges)) { + mask_ranges <- append(mask_ranges, dna_ranges[vb]) + } - vec_subseq <- as.vector(seq_1[reference_ranges[i]]) - ibase <- which(DNA_ALPHABET[1:4] %in% vec_subseq) - dna_base <- ifelse(length(ibase) == 0, "N", DNA_ALPHABET[1:4][ibase]) + maskseq <- getSeq(masked_sequences, df_aln$refid[seq1]) + trseq <- DNAString(as.character(maskseq)) + tr_rle <- Rle(as.vector(trseq)) - n_base <- sum(vec_subseq == dna_base) - n_gaps <- sum(vec_subseq == '-') - mask_ranges[i] <- mask_ranges[i] + sum(dna_base != 'N') + tr_rge <- ranges(tr_rle)[runValue(tr_rle) == "N"] + gap_range <- ref_rge[runValue(ref_rle) == "-"] - replacement <- NULL + if (length(tr_rge) > 0) { + for (i in 1:length(tr_rge)) { + n_inserts <- sum(as.vector(seq_1[1:start(tr_rge[i])]) == '-') + gaps_over_tr_rge <- gap_range[gap_range %over% shift(tr_rge[i], n_inserts)] + n_over_range_gaps <- ifelse(length(gaps_over_tr_rge) == 0, 0, width(gaps_over_tr_rge)) + new_range <- IRanges(start = start(tr_rge[i]) + n_inserts, end = end(tr_rge[i]) + n_inserts + n_over_range_gaps) - # for homopolymers, replacement should be N * length of the mask - # for tandem repeats, replacement should be length of original N mask - trun <- end(mask_ranges[i]) - length(asv_prime) - trun <- ifelse(trun < 0, 0, trun) - if (dna_base %in% DNA_ALPHABET[1:4]) { - replacement <- paste(as(Rle('N', width(mask_ranges[i]) - n_gaps - trun), 'character'), collapse = "") - } else { - replacement <- paste(as(Rle('N', n_base - trun), 'character'), collapse = "") + seq_1[new_range] <- "N" # mask refseq prime + if (length(gaps_over_tr_rge) > 0) { + for (j in 1:length(gaps_over_tr_rge)) { + seq_1[gaps_over_tr_rge[j]] <- "-" } - - left_str <- ifelse(start(mask_ranges[i]) <= 1, "", as(subseq(asv_prime, 1, start(mask_ranges[i]) - 1), "character")) - right_str <- ifelse(end(mask_ranges[i]) >= length(asv_prime), "", as(subseq(asv_prime, end(mask_ranges[i]) + 1, length(asv_prime)), "character")) - asv_prime <- DNAString(paste(c(left_str, replacement, right_str), collapse = "")) - - mask_ranges <- shift(mask_ranges, -n_gaps) } } - - # if there is nothing to mask, just return - if (length(mask_ranges) == 0) { - return ( - data.frame( - original = df_aln[seq1, ]$original, - refid = df_aln[seq1, ]$refid, - refseq = df_aln[seq1, ]$refseq, - hapseq = df_aln[seq1, ]$hapseq, - asv_prime = as.character(asv_prime) - ) - ) - } - - data.frame( - original = df_aln[seq1, ]$original, - refid = df_aln$refid[seq1], - refseq = df_aln[seq1, ]$refseq, - hapseq = df_aln[seq1, ]$hapseq, - asv_prime = as.character(asv_prime) - ) } - } else { - for (seq1 in 1:nrow(df_aln)) { - - seq_1 <- DNAString(df_aln$refseq[seq1]) - - asv_prime <- DNAString(df_aln$hapseq[seq1]) - ref_rle <- Rle(as.vector(seq_1)) - # additionally, look for potential homopolymers that would exist in our - # pairwise alignment reference sequence if it were not for insertions - # in the sample haplotype in those regions. + tr_masks <- IRanges() + ref_rle <- Rle(as.vector(seq_1)) + ref_rge <- ranges(ref_rle) + tr_ranges <- reduce(ref_rge[runValue(ref_rle) == 'N' | runValue(ref_rle) == "-"]) - ref_rge <- ranges(ref_rle) - mask_ranges <- IRanges() - for (dna_base in DNA_ALPHABET[1:4]) { - dna_ranges <- reduce(ref_rge[runValue(ref_rle) == dna_base | runValue(ref_rle) == "-"]) - - vb <- NULL - for (i in 1:length(dna_ranges)) { - vb <- c(vb, sum(as.vector(seq_1[dna_ranges[i]]) %in% dna_base) > args$homopolymer_threshold) + if (length(tr_ranges) > 0) { + for (i in 1:length(tr_ranges)) { + vseq <- as.vector(seq_1[tr_ranges[i]]) + if (all(c('N', '-') %in% vseq) || 'N' %in% vseq) { + mask_ranges <- append(mask_ranges, tr_ranges[i]) } - - mask_ranges <- append(mask_ranges, dna_ranges[vb]) } + } - maskseq <- getSeq(masked_sequences, df_aln$refid[seq1]) + mask_ranges <- unique(sort(mask_ranges)) + mask_ranges <- reduce(mask_ranges) + reference_ranges <- mask_ranges - trseq <- DNAString(as.character(maskseq)) - tr_rle <- Rle(as.vector(trseq)) + pos <- c(DNA_ALPHABET[1:4], "N") + if (length(mask_ranges) > 0) { + for (i in 1:length(mask_ranges)) { - tr_rge <- ranges(tr_rle)[runValue(tr_rle) == "N"] - gap_range <- ref_rge[runValue(ref_rle) == "-"] + vec_subseq <- as.vector(seq_1[reference_ranges[i]]) + ibase <- which(DNA_ALPHABET[1:4] %in% vec_subseq) + dna_base <- ifelse(length(ibase) == 0, "N", DNA_ALPHABET[1:4][ibase]) - if (length(tr_rge) > 0) { - for (i in 1:length(tr_rge)) { - n_inserts <- sum(as.vector(seq_1[1:start(tr_rge[i])]) == '-') + n_base <- sum(vec_subseq == dna_base) + n_gaps <- sum(vec_subseq == '-') + mask_ranges[i] <- mask_ranges[i] + sum(dna_base != 'N') - gaps_over_tr_rge <- gap_range[gap_range %over% shift(tr_rge[i], n_inserts)] - n_over_range_gaps <- ifelse(length(gaps_over_tr_rge) == 0, 0, width(gaps_over_tr_rge)) - new_range <- IRanges(start = start(tr_rge[i]) + n_inserts, end = end(tr_rge[i]) + n_inserts + n_over_range_gaps) + replacement <- NULL - seq_1[new_range] <- "N" # mask refseq prime - if (length(gaps_over_tr_rge) > 0) { - for (j in 1:length(gaps_over_tr_rge)) { - seq_1[gaps_over_tr_rge[j]] <- "-" - } - } + # for homopolymers, replacement should be N * length of the mask + # for tandem repeats, replacement should be length of original N mask + trun <- end(mask_ranges[i]) - length(asv_prime) + trun <- ifelse(trun < 0, 0, trun) + if (dna_base %in% DNA_ALPHABET[1:4]) { + replacement <- paste(as(Rle('N', width(mask_ranges[i]) - n_gaps - trun), 'character'), collapse = "") + } else { + replacement <- paste(as(Rle('N', n_base - trun), 'character'), collapse = "") } - } - tr_masks <- IRanges() - ref_rle <- Rle(as.vector(seq_1)) - ref_rge <- ranges(ref_rle) - tr_ranges <- reduce(ref_rge[runValue(ref_rle) == 'N' | runValue(ref_rle) == "-"]) + left_str <- ifelse(start(mask_ranges[i]) <= 1, "", as(subseq(asv_prime, 1, start(mask_ranges[i]) - 1), "character")) + right_str <- ifelse(end(mask_ranges[i]) >= length(asv_prime), "", as(subseq(asv_prime, end(mask_ranges[i]) + 1, length(asv_prime)), "character")) + asv_prime <- DNAString(paste(c(left_str, replacement, right_str), collapse = "")) - if (length(tr_ranges) > 0) { - for (i in 1:length(tr_ranges)) { - vseq <- as.vector(seq_1[tr_ranges[i]]) - if (all(c('N', '-') %in% vseq) || 'N' %in% vseq) { - mask_ranges <- append(mask_ranges, tr_ranges[i]) - } - } - } - - mask_ranges <- unique(sort(mask_ranges)) - reference_ranges <- mask_ranges - - pos <- c(DNA_ALPHABET[1:4], "N") - if (length(mask_ranges) > 0) { - for (i in 1:length(mask_ranges)) { - - vec_subseq <- as.vector(seq_1[reference_ranges[i]]) - ibase <- which(DNA_ALPHABET[1:4] %in% vec_subseq) - dna_base <- ifelse(length(ibase) == 0, "N", DNA_ALPHABET[1:4][ibase]) - - n_base <- sum(vec_subseq == dna_base) - n_gaps <- sum(vec_subseq == '-') - mask_ranges[i] <- mask_ranges[i] + sum(dna_base != 'N') - - replacement <- NULL - - # for homopolymers, replacement should be N * length of the mask - # for tandem repeats, replacement should be length of original N mask - trun <- end(mask_ranges[i]) - length(asv_prime) - trun <- ifelse(trun < 0, 0, trun) - if (dna_base %in% DNA_ALPHABET[1:4]) { - replacement <- paste(as(Rle('N', width(mask_ranges[i]) - n_gaps - trun), 'character'), collapse = "") - } else { - replacement <- paste(as(Rle('N', n_base - trun), 'character'), collapse = "") - } - - left_str <- ifelse(start(mask_ranges[i]) <= 1, "", as(subseq(asv_prime, 1, start(mask_ranges[i]) - 1), "character")) - right_str <- ifelse(end(mask_ranges[i]) >= length(asv_prime), "", as(subseq(asv_prime, end(mask_ranges[i]) + 1, length(asv_prime)), "character")) - asv_prime <- DNAString(paste(c(left_str, replacement, right_str), collapse = "")) - - mask_ranges <- shift(mask_ranges, -n_gaps) - } + mask_ranges <- shift(mask_ranges, -n_gaps) } - - # if there is nothing to mask, just return - if (length(mask_ranges) == 0) { - df_masked <- rbind(df_masked, - data.frame( - original = df_aln[seq1, ]$original, - refid = df_aln[seq1, ]$refid, - refseq = df_aln[seq1, ]$refseq, - hapseq = df_aln[seq1, ]$hapseq, - asv_prime = as.character(asv_prime) - ) - ) - - next - } - - df_masked <- rbind(df_masked, - data.frame( - original = df_aln[seq1, ]$original, - refid = df_aln[seq1, ]$refid, - refseq = df_aln[seq1, ]$refseq, - hapseq = df_aln[seq1, ]$hapseq, - asv_prime = as.character(asv_prime) - ) - ) } - } - - df_seqs <- inner_join(df_aln, df_masked, by = c("original", "refid", "refseq", "hapseq")) - - seqtab.nochim.df <- as.data.frame(t(seqtab.nochim)) - seqtab.nochim.df$original <- base::rownames(seqtab.nochim.df) - df_seqs <- inner_join(df_seqs, seqtab.nochim.df, by = "original") - df_final <- df_seqs %>% - pivot_longer( - cols = -c(original, hapseq, refseq, refid, score, indels, asv_prime), - names_to = "sample", - values_to = "counts" + data.frame( + sid = df_aln[seq1, ]$sid, + refid = df_aln[seq1, ]$refid, + asv_prime = as.character(asv_prime) ) + } - seqtab.nochim.df <- df_final %>% - group_by(sample, asv_prime) %>% - summarise(counts = sum(counts)) %>% - pivot_wider(names_from = asv_prime, values_from = counts) %>% - # filter(counts != 0) %>% - ungroup() - + # df_seqs <- inner_join(df_aln, df_masked, by = c("original", "refid", "refseq", "hapseq")) + + seqtab.nochim.df <- tibble::rownames_to_column(as.data.frame(t(seqtab.nochim)), "sequences") %>% + inner_join(df.sequences, by = c("sequences")) %>% + select(-c(sequences)) + + df_seqs <- df_aln %>% + ungroup() %>% + select(sid, refid) %>% + distinct() %>% + inner_join( + df_masked %>% + select(sid, refid, asv_prime) %>% + distinct() + , by = c("sid", "refid")) + + seqtab.nochim.df <- df_seqs %>% + inner_join(seqtab.nochim.df, by = c("sid")) %>% + group_by(sid, refid, asv_prime) %>% + summarise(across(everything(), sum)) %>% + ungroup() %>% + select(-c(sid,refid)) + + seqtab.nochim.df <- as.data.frame(seqtab.nochim.df) + rownames(seqtab.nochim.df) <- seqtab.nochim.df$asv_prime + seqtab.nochim.df <- seqtab.nochim.df %>% select(-c(asv_prime)) + + seqtab.nochim.df <- as.data.frame(t(seqtab.nochim.df)) + + seqtab.nochim.df <- tibble::rownames_to_column(seqtab.nochim.df, "sample") + seqtab.nochim.df <- seqtab.nochim.df %>% arrange(sample) seqtab.nochim.df[seqtab.nochim.df==0]=NA + } else { seqtab.nochim.df = as.data.frame(seqtab.nochim) seqtab.nochim.df$sample = rownames(seqtab.nochim) seqtab.nochim.df[seqtab.nochim.df==0]=NA } +print("Done masking sequences...") pat="-1A_|-1B_|-1_|-2_|-1AB_|-1B2_" seqtab.nochim.df = seqtab.nochim.df %>% @@ -488,4 +409,8 @@ allele.data = seqtab.nochim.df %>% saveRDS(allele.data,file="allele_data.RDS") write.table(allele.data,file="allele_data.txt",quote=F,sep="\t",col.names=T,row.names=F) - +# get memory footprint of environment +out <- as.data.frame(sort( sapply(ls(),function(x){object.size(get(x))}))) +colnames(out) <- "bytes" +out <- out %>% dplyr::mutate(MB = bytes / 1e6, GB = bytes / 1e9) +write.csv(out, "postproc_memory_profile.csv") diff --git a/envs/trf-env.yml b/envs/trf-env.yml index a1c1834..838f628 100644 --- a/envs/trf-env.yml +++ b/envs/trf-env.yml @@ -2,6 +2,5 @@ name: trf channels: - conda-forge - bioconda - - defaults dependencies: - - trf \ No newline at end of file + - trf=4.09.1=hec16e2b_2 \ No newline at end of file diff --git a/main.nf b/main.nf index f84baa7..4bdc318 100644 --- a/main.nf +++ b/main.nf @@ -23,13 +23,6 @@ params.scriptDIR = "$projectDir/R_code" // codontable = "$projectDir/resources/${params.target}/codontable.txt" params.resmarkers_amplicon = "$projectDir/resources/${params.target}/resistance_markers_amplicon_${params.target}.txt" params.codontable = "$projectDir/templates/codontable.txt" -params.pool = "false" -params.band_size = 16 // default -params.omega_a = 1e-120 -params.use_quals = "false" -params.homop_gap_penalty = 5 -params.maxEE = 2 -params.n_cores = 4 // Files @@ -294,7 +287,6 @@ process DADA2_ANALYSIS { --band-size ${params.band_size} \ --omega-a ${params.omega_a} \ --use-quals ${params.use_quals} \ - --homop-gap-penalty ${params.homop_gap_penalty} \ --maxEE ${params.maxEE} \ --concat-non-overlaps """ @@ -320,7 +312,11 @@ process DADA2_POSTPROC { path '*.{RDS,txt,csv}' script: + + if (parallel == true) """ + echo $parallel triggered true + seqtab_nochim_rds="\$(echo $rdatafile | tr ' ' '\n' | grep *.RDS)" Rscript ${params.scriptDIR}/postdada_rearrange.R \ @@ -331,6 +327,18 @@ process DADA2_POSTPROC { --n-cores ${params.n_cores} \ --parallel """ + else + """ + echo $parallel triggered false + seqtab_nochim_rds="\$(echo $rdatafile | tr ' ' '\n' | grep *.RDS)" + + Rscript ${params.scriptDIR}/postdada_rearrange.R \ + --dada2-output \${seqtab_nochim_rds} \ + --homopolymer-threshold ${homopolymer_threshold} \ + --refseq-fasta ${refseq_fasta} \ + --masked-fasta ${masked_fasta} \ + --n-cores ${params.n_cores} + """ } diff --git a/nextflow.config b/nextflow.config index 783fccd..d875c7c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,13 +22,21 @@ params { genome = null target = null parallel = true - concat_non_overlaps = true + n_cores = 4 // trf add_mask = true trf_min_score = 25 trf_max_period = 3 + // dada + pool = "false" // options: false, pseudo, true + band_size = 16 // default + omega_a = 1e-120 + use_quals = false + maxEE = 2 + concat_non_overlaps = true + // config options custom_config_base = "conf" @@ -66,7 +74,7 @@ profiles { docker.enabled = true docker.userEmulation = true apptainer.enabled = false - process.container = "eppicenter/mad4hatter:latest" + process.container = "eppicenter/test:latest" } apptainer { conda.enabled = false @@ -88,12 +96,12 @@ profiles { } manifest { - name = 'aarandad/ampseq_workflow' - homePage = 'https://github.com/aarandad/ampseq_workflow' + name = 'eppicenter/MAD4HATTER' + homePage = 'https://github.com/eppicenter/mad4hatter' description = 'An open-source analysis pipeline to detect multiplexed amplicons' mainScript = 'main.nf' nextflowVersion = '!>=21.10.6' - version = '1.1.0' + version = '0.0.9' } env {