Commit
append number of mapped amplicons to the sample coverage file (#69)

sample coverage work (#69)
bgpalmer committed Mar 9, 2023
1 parent b13db05 commit 6a68ce2
Showing 2 changed files with 71 additions and 43 deletions.
95 changes: 61 additions & 34 deletions R_code/postdada_rearrange.R
@@ -8,7 +8,8 @@ parser$add_argument('--masked-fasta', type="character")
parser$add_argument('--dada2-output', type="character", required = TRUE)
parser$add_argument('--alignment-threshold', type="integer", default = 60)
parser$add_argument('--parallel', action='store_true')
parser$add_argument('--n-cores', type = 'integer', default = -1)
parser$add_argument('--n-cores', type = 'integer', default = -1, help = "Number of cores to use. Ignored unless --parallel is set.")
parser$add_argument('--sample-coverage', type="character", help = "Sample coverage file from QC; P. falciparum-specific coverage statistics are appended to it.")
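# Note: when --sample-coverage is given, the file is read and then rewritten
# in place with P. falciparum-specific totals appended (see QC Postprocessing
# below).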

args <- parser$parse_args()
print(args)
@@ -22,27 +23,29 @@ library(muscle)
library(BSgenome)
library(tidyr)
library(doMC)

library(tibble)

## Postprocessing QC

### Print allele table before masking (for debugging)

seqtab.nochim <- readRDS(args$dada2_output)
seqtab.nochim.df = as.data.frame(seqtab.nochim)
seqtab.nochim.df$sample = rownames(seqtab.nochim)
seqtab.nochim.df[seqtab.nochim.df==0]=NA
pat="-1A_|-1B_|-1_|-2_|-1AB_|-1B2_"
seqtab.nochim.df = seqtab.nochim.df %>%
pivot_longer(cols = seq(1,ncol(seqtab.nochim)),names_to = "asv",values_to = "reads",values_drop_na=TRUE) %>%
mutate(locus = paste0(sapply(strsplit(sample,"_"),"[",1),"_",sapply(strsplit(sample,"_"),"[",2),"_",sapply(strsplit(sample,"_"),"[",3)))%>%
mutate(sampleID = sapply(strsplit(sapply(strsplit(sample,pat),"[",2),"_trimmed"),"[",1)) %>%
select(sampleID,locus,asv,reads)
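# Illustrative sketch of the parsing above, using a hypothetical sample string
# (real names come from rownames(seqtab.nochim)):
#   s <- "Pf3D7_05_v3-1A_sample001_trimmed"
#   locus    = first three "_"-separated fields rejoined -> "Pf3D7_05_v3-1A"
#   sampleID = text between the tag matched by `pat` ("-1A_" here) and
#              "_trimmed" -> "sample001"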

temp = seqtab.nochim.df %>% select(locus,asv) %>% distinct()
loci =unique(temp$locus)
k=1
allele.sequences = data.frame(locus = seq(1,nrow(temp)),allele = seq(1,nrow(temp)),sequence = seq(1,nrow(temp)))
for(i in seq(1,length(loci))){
temp2 = temp %>% filter(locus==loci[i])
for(j in seq(1,nrow(temp2))){
allele.sequences$locus[k+j-1] = loci[i]
allele.sequences$allele[k+j-1] = paste0(loci[i],".",j)
Expand All @@ -51,12 +54,12 @@ for(i in seq(1,length(loci))){
k=k+nrow(temp2)
}
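# The loop above numbers ASVs within each locus: a (hypothetical) locus
# "Pf3D7_05_v3-1A" with three ASVs yields alleles "Pf3D7_05_v3-1A.1"
# through "Pf3D7_05_v3-1A.3".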

allele.data = seqtab.nochim.df %>%
left_join(allele.sequences %>% select(-locus),by=c("asv"="sequence")) %>%
group_by(sampleID,locus,allele) %>%
mutate(norm.reads.allele = reads/sum(reads))%>%
group_by(sampleID,locus) %>%
mutate(norm.reads.locus = reads/sum(reads))%>%
mutate(n.alleles = n())
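# norm.reads.locus is each allele's share of the reads for its sample/locus;
# n.alleles is the number of alleles detected per sample/locus.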

saveRDS(allele.data,file="pre_processed_allele_table.RDS")
@@ -170,12 +173,6 @@ if (length(non_overlaps_idx) > 0) {
combined <- c(combined, paste(c(df_non_overlap[idx, ]$processed_left_sequences, df_non_overlap[idx, ]$processed_right_sequences), collapse = ""))
}
df_non_overlap$combined <- combined
# df_non_overlap,
# combined,
# c(
# processed_left_sequences,
# processed_right_sequences
# ), sep="+")

saveRDS(df_non_overlap,file="non_overlapping_seqs.RDS")
write.table(df_non_overlap,file="non_overlapping_seqs.txt",quote=F,sep="\t",col.names=T,row.names=F)
@@ -342,22 +339,33 @@ if (!is.null(args$homopolymer_threshold) && args$homopolymer_threshold > 0) {
seqtab.nochim.df$original <- base::rownames(seqtab.nochim.df)
df_seqs <- inner_join(df_seqs, seqtab.nochim.df, by = "original")

seqtab.nochim.df <- df_seqs %>%
select(-c(original, hapseq, refseq, refid, score, indels)) %>%
distinct() %>%
pivot_longer(
cols = -c(asv_prime),
names_to = "sample",
values_to = "counts"
) %>%
group_by(sample, asv_prime) %>%
summarise(counts = sum(counts)) %>%
pivot_wider(names_from = asv_prime, values_from = counts) %>%
# filter(counts != 0) %>%
ungroup()
# seqtab.nochim.df <- df_seqs %>%
# select(-c(original, hapseq, refseq, refid, score, indels)) %>%
# distinct() %>%
# pivot_longer(
# cols = -c(asv_prime),
# names_to = "sample",
# values_to = "counts"
# ) %>%
# group_by(sample, asv_prime) %>%
# summarise(counts = sum(counts)) %>%
# pivot_wider(names_from = asv_prime, values_from = counts) %>%
# # filter(counts != 0) %>%
# ungroup()


seqtab.nochim.df <- df_seqs %>%
group_by(refid, asv_prime) %>%
summarise(across(-c(original, hapseq, refseq, score, indels), sum)) %>%
ungroup() %>%
select(-c(refid))
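# Toy illustration (sample column names hypothetical): ASVs that collapse to
# the same post-masking sequence (asv_prime) within a reference amplicon
# (refid) have their per-sample counts summed:
#   original  asv_prime  sampleA  sampleB
#   ASV1      SEQ_X           10        0
#   ASV2      SEQ_X            5        7
#   ==> SEQ_X: sampleA = 15, sampleB = 7
# The table is then transposed below so that samples are rows again, matching
# the orientation of the else branch.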

seqtab.nochim.df <- column_to_rownames(seqtab.nochim.df, var = "asv_prime")
seqtab.nochim.df <- as.data.frame(t(seqtab.nochim.df))
seqtab.nochim.df$sample = rownames(seqtab.nochim.df)
seqtab.nochim.df[seqtab.nochim.df==0]=NA


} else {
seqtab.nochim.df = as.data.frame(seqtab.nochim)
seqtab.nochim.df$sample = rownames(seqtab.nochim)
@@ -398,11 +406,30 @@ allele.data = seqtab.nochim.df %>%
saveRDS(allele.data,file="allele_data.RDS")
write.table(allele.data,file="allele_data.txt",quote=F,sep="\t",col.names=T,row.names=F)

## QC Postprocessing

if (!is.null(args$sample_coverage) && file.exists(args$sample_coverage)) {
sample.coverage <- read.table(args$sample_coverage, header = FALSE, sep = "\t") %>%
pivot_wider(names_from = V2, values_from = V3)

qc.postproc <- allele.data %>%
left_join(sample.coverage, by = c("sampleID" = "V1")) %>%
select(sampleID, Input, `No Dimers`, Amplicons, reads) %>%
group_by(sampleID) %>%
mutate(Amplicons.Pf = sum(reads)) %>%
select(-c(reads)) %>%
distinct() %>%
pivot_longer(cols = c(Input, `No Dimers`, Amplicons, Amplicons.Pf))

write.table(qc.postproc, quote=F,sep='\t',col.names = F, row.names = F, file = args$sample_coverage)
}
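# Sketch of the assumed sample_coverage.txt layout (tab-separated, no header;
# the stat names Input, "No Dimers" and Amplicons are required by the select()
# above; sample names and counts are hypothetical):
#   sample1   Input         100000
#   sample1   No Dimers      95000
#   sample1   Amplicons      90000
# The block rewrites the same file in long form, appending a per-sample total
# of P. falciparum DADA2 reads:
#   sample1   Amplicons.Pf   88000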

# get memory footprint of environment
# out <- as.data.frame(sort( sapply(ls(),function(x){object.size(get(x))})))
# colnames(out) <- "bytes"
# out <- out %>% dplyr::mutate(MB = bytes / 1e6, GB = bytes / 1e9)
# write.csv(out, "postproc_memory_profile.csv")
out <- as.data.frame(sort( sapply(ls(),function(x){object.size(get(x))})))
colnames(out) <- "bytes"
out <- out %>% dplyr::mutate(MB = bytes / 1e6, GB = bytes / 1e9)
write.csv(out, "postproc_memory_profile.csv")

# saveRDS(df_seqs, file = "df_seqs.RDS")
# saveRDS(df_final, file = "df_final.RDS")
19 changes: 10 additions & 9 deletions main.nf
@@ -41,13 +41,12 @@ workflow {
// All workflows produce a QC report (QC requires cutadapt output for length statistics)
CUTADAPT(read_pairs_ch, params.amplicon_info, params.cutadapt_minlen, qualfilter)

// QUALITY_CHECK(read_pairs_ch, CUTADAPT.out[1].collect(), params.amplicon_info)
QUALITY_CHECK(read_pairs_ch, CUTADAPT.out[1].collect(), params.amplicon_info)
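// QUALITY_CHECK emits the per-sample coverage files that DADA2_POSTPROC
// consumes below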

if (params.QC_only == false) {

DADA2_ANALYSIS(CUTADAPT.out[0].collect(), params.amplicon_info)


// Create reference sequences from genome
if (params.refseq_fasta == null) {

@@ -59,7 +58,7 @@ workflow {
MASK_SEQUENCES(CREATE_REFERENCE_SEQUENCES.out[0], 0, 0)
}

DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, CREATE_REFERENCE_SEQUENCES.out[0], MASK_SEQUENCES.out[0], params.parallel)
DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, CREATE_REFERENCE_SEQUENCES.out[0], MASK_SEQUENCES.out[0], params.parallel, QUALITY_CHECK.out[0].collect())

RESISTANCE_MARKERS(DADA2_POSTPROC.out[0], CREATE_REFERENCE_SEQUENCES.out[0], params.codontable, params.resmarkers_amplicon)

@@ -71,13 +70,13 @@ workflow {
MASK_SEQUENCES(params.refseq_fasta, 0, 0)
}

DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, params.refseq_fasta, MASK_SEQUENCES.out[0], params.parallel)
DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, params.refseq_fasta, MASK_SEQUENCES.out[0], params.parallel, QUALITY_CHECK.out[0].collect())

RESISTANCE_MARKERS(DADA2_POSTPROC.out[0], params.refseq_fasta, params.codontable, params.resmarkers_amplicon)

} else {

DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, params.refseq_fasta, params.masked_fasta, params.parallel)
DADA2_POSTPROC(DADA2_ANALYSIS.out[0], params.homopolymer_threshold, params.refseq_fasta, params.masked_fasta, params.parallel, QUALITY_CHECK.out[0].collect())

RESISTANCE_MARKERS(DADA2_POSTPROC.out[0], params.refseq_fasta, params.codontable, params.resmarkers_amplicon)

@@ -307,6 +306,7 @@ process DADA2_POSTPROC {
path refseq_fasta
path masked_fasta
val parallel
path coverage_files
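// staged QC outputs; expected to include sample_coverage.txt, which the
// script blocks below select by name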

output:
path '*.{RDS,txt,csv}'
@@ -315,8 +315,7 @@ process DADA2_POSTPROC {

if (parallel == true)
"""
echo $parallel triggered true
sample_coverage="\$(echo $coverage_files | tr ' ' '\n' | grep sample_coverage.txt)"
seqtab_nochim_rds="\$(echo $rdatafile | tr ' ' '\n' | grep '[.]RDS')"
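# coverage_files and rdatafile expand to space-separated lists of staged
# files; the tr + grep pipelines above pick single file names out of them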
Rscript ${params.scriptDIR}/postdada_rearrange.R \
@@ -325,19 +324,21 @@
--refseq-fasta ${refseq_fasta} \
--masked-fasta ${masked_fasta} \
--n-cores ${params.n_cores} \
--sample-coverage \${sample_coverage} \
--parallel
"""
else
"""
echo $parallel triggered false
sample_coverage="\$(echo $coverage_files | tr ' ' '\n' | grep sample_coverage.txt)"
seqtab_nochim_rds="\$(echo $rdatafile | tr ' ' '\n' | grep '[.]RDS')"
Rscript ${params.scriptDIR}/postdada_rearrange.R \
--dada2-output \${seqtab_nochim_rds} \
--homopolymer-threshold ${homopolymer_threshold} \
--refseq-fasta ${refseq_fasta} \
--masked-fasta ${masked_fasta} \
--n-cores ${params.n_cores}
--n-cores ${params.n_cores} \
--sample-coverage \${sample_coverage}
"""
}

