diff --git a/configs/conda.config b/configs/conda.config index 2274885..9e0c3df 100644 --- a/configs/conda.config +++ b/configs/conda.config @@ -1,9 +1,11 @@ process { withLabel: hisat2 { conda = "$baseDir/envs/hisat2.yaml" } + withLabel: minimap2 { conda = "$baseDir/envs/minimap2.yaml" } withLabel: python3 { conda = "$baseDir/envs/python3.yaml" } withLabel: deseq2 { conda = "$baseDir/envs/deseq2.yaml" } withLabel: fastp { conda = "$baseDir/envs/fastp.yaml" } withLabel: fastqc { conda = "$baseDir/envs/fastqc.yaml" } + withLabel: nanoplot { conda = "$baseDir/envs/nanoplot.yaml" } withLabel: subread { conda = "$baseDir/envs/subread.yaml" } withLabel: multiqc { conda = "$baseDir/envs/multiqc.yaml" } withLabel: sortmerna { conda = "$baseDir/envs/sortmerna.yaml" } diff --git a/configs/container.config b/configs/container.config index 3f73773..35fee78 100644 --- a/configs/container.config +++ b/configs/container.config @@ -1,11 +1,13 @@ process { withLabel: hisat2 { container = 'nanozoo/hisat2:2.1.0--66dae66' } + withLabel: minimap2 { container = 'nanozoo/minimap2:2.18--618fb68' } withLabel: python3 { container = 'nanozoo/python_rnaseq:3.8--7a7808c' } withLabel: deseq2 { container = 'nanozoo/deseq2:1.28.0--0df1612' } withLabel: fastp { container = 'nanozoo/fastp:0.20.0--78a7c63' } withLabel: fastqc { container = 'nanozoo/fastqc:0.11.8--fbfa1d7' } withLabel: subread { container = 'nanozoo/subread:2.0.1--713a8e7' } withLabel: multiqc { container = 'nanozoo/multiqc:1.9--aba729b' } + withLabel: nanoplot { container = 'nanozoo/nanoplot:1.38.0--11e5b96' } withLabel: sortmerna { container = 'nanozoo/sortmerna:2.1b--ceea1a1' } withLabel: trinity { container = 'nanozoo/trinity:2.11.0--3354a71' } withLabel: stringtie { container = 'nanozoo/stringtie:2.1.2--93e41f1' } diff --git a/configs/local.config b/configs/local.config index 8e8790a..6d04308 100644 --- a/configs/local.config +++ b/configs/local.config @@ -1,10 +1,12 @@ process { withLabel: hisat2 { cpus = params.cores } + withLabel: minimap2 { cpus = params.cores } withLabel: deseq2 { cpus = params.cores } withLabel: fastp { cpus = params.cores } withLabel: fastqc { cpus = 1 } withLabel: subread { cpus = params.cores } withLabel: multiqc { cpus = params.cores } + withLabel: nanoplot { cpus = 4 } withLabel: sortmerna { cpus = params.cores } withLabel: trinity { cpus = params.cores; memory = params.memory } withLabel: stringtie { cpus = params.cores} diff --git a/envs/minimap2.yaml b/envs/minimap2.yaml new file mode 100644 index 0000000..9a6095a --- /dev/null +++ b/envs/minimap2.yaml @@ -0,0 +1,6 @@ +name: minimap2 +channels: + - bioconda +dependencies: + - minimap2=2.18 + - samtools=1.11 \ No newline at end of file diff --git a/envs/nanoplot.yaml b/envs/nanoplot.yaml new file mode 100644 index 0000000..ad4f85c --- /dev/null +++ b/envs/nanoplot.yaml @@ -0,0 +1,7 @@ +name: nanoplot +channels: + - bioconda + - conda-forge +dependencies: + - nanoplot=1.38.0 + - seaborn==0.10.1 \ No newline at end of file diff --git a/main.nf b/main.nf index 95f5209..48b8efd 100755 --- a/main.nf +++ b/main.nf @@ -11,7 +11,7 @@ nextflow.enable.dsl=2 // Parameters sanity checking -Set valid_params = ['max_cores', 'cores', 'memory', 'profile', 'help', 'reads', 'genome', 'annotation', 'deg', 'autodownload', 'pathway', 'species', 'include_species', 'strand', 'mode', 'tpm', 'fastp_additional_params', 'hisat2_additional_params', 'featurecounts_additional_params', 'feature_id_type', 'busco_db', 'dammit_uniref90', 'skip_sortmerna', 'assembly', 'output', 'fastp_dir', 'sortmerna_dir', 'hisat2_dir', 'featurecounts_dir', 'tpm_filter_dir', 'annotation_dir', 'deseq2_dir', 'assembly_dir', 'rnaseq_annotation_dir', 'uniref90_dir', 'multiqc_dir', 'nf_runinfo_dir', 'permanentCacheDir', 'condaCacheDir', 'singularityCacheDir', 'softlink_results', 'cloudProcess', 'permanent-cache-dir', 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process'] // don't ask me why there is 'permanent-cache-dir', 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process' +Set valid_params = ['max_cores', 'cores', 'memory', 'profile', 'help', 'reads', 'genome', 'nanopore', 'minimap2_additional_params', 'minimap2_dir', 'annotation', 'deg', 'autodownload', 'pathway', 'species', 'include_species', 'strand', 'mode', 'tpm', 'fastp_additional_params', 'hisat2_additional_params', 'featurecounts_additional_params', 'feature_id_type', 'busco_db', 'dammit_uniref90', 'skip_sortmerna', 'assembly', 'output', 'fastp_dir', 'sortmerna_dir', 'hisat2_dir', 'featurecounts_dir', 'tpm_filter_dir', 'annotation_dir', 'deseq2_dir', 'assembly_dir', 'rnaseq_annotation_dir', 'uniref90_dir', 'readqc_dir', 'multiqc_dir', 'nf_runinfo_dir', 'permanentCacheDir', 'condaCacheDir', 'singularityCacheDir', 'softlink_results', 'cloudProcess', 'permanent-cache-dir', 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process'] // don't ask me why there is 'permanent-cache-dir', 'conda-cache-dir', 'singularity-cache-dir', 'cloud-process' def parameter_diff = params.keySet() - valid_params if (parameter_diff.size() != 0){ exit 1, "ERROR: Parameter(s) $parameter_diff is/are not valid in the pipeline!\n" @@ -80,6 +80,11 @@ if (params.assembly) { } } +if (params.nanopore) { + println "\u001B[32mPerform processing of reads in Nanopore mode instead default short-read mode. After mapping, the same steps are used as for Illumina." +} + + Set species = ['hsa', 'eco', 'mmu', 'mau'] Set autodownload = ['hsa', 'eco', 'mmu', 'mau'] Set pathway = ['hsa', 'mmu', 'mau'] @@ -115,6 +120,7 @@ log.info """\ Strandness: $params.strand TPM threshold: $params.tpm Comparisons: $comparison + Nanopore mode: $params.nanopore """ .stripIndent() @@ -138,10 +144,10 @@ if (params.reads) { return [ sample, read, condition, source ] } .tap { annotated_reads } - .tap { illumina_input_ch } + .tap { read_input_ch } .map { sample, read, condition, source -> return [ sample, [ read ] ] } - .set { illumina_input_ch } + .set { read_input_ch } } else { Channel @@ -156,10 +162,10 @@ if (params.reads) { return [ sample, read1, read2, condition, source ] } .tap { annotated_reads } - .tap { illumina_input_ch } + .tap { read_input_ch } .map { sample, read1, read2, condition, source -> return [ sample, [ read1, read2 ] ] } - .set { illumina_input_ch } + .set { read_input_ch } } } @@ -324,6 +330,7 @@ include {referenceGet; concat_genome} from './modules/referenceGet' include {annotationGet; concat_annotation} from './modules/annotationGet' include {sortmernaGet} from './modules/sortmernaGet' include {hisat2index} from './modules/hisat2' +include {minimap2index} from './modules/minimap2' include {buscoGetDB} from './modules/buscoGetDB' include {dammitGetDB} from './modules/dammitGetDB' @@ -331,10 +338,12 @@ include {dammitGetDB} from './modules/dammitGetDB' include {fastp} from './modules/fastp' include {sortmerna} from './modules/sortmerna' include {hisat2; index_bam} from './modules/hisat2' +include {minimap2; index_bam_minimap2} from './modules/minimap2' include {featurecounts} from './modules/featurecounts' include {tpm_filter} from './modules/tpm_filter' include {deseq2} from './modules/deseq2' include {fastqc as fastqcPre; fastqc as fastqcPost} from './modules/fastqc' +include {nanoplot as nanoplot} from './modules/nanoplot' include {multiqc; multiqc_sample_names} from './modules/multiqc' // assembly & annotation @@ -474,20 +483,20 @@ workflow download_dammit { **************************/ /*************************************** -Preprocess RNA-Seq reads: qc, trimming, adapters, rRNA-removal, mapping +Preprocess Illumina RNA-Seq reads: qc, trimming, adapters, rRNA-removal, mapping */ -workflow preprocess { +workflow preprocess_illumina { take: - illumina_input_ch + read_input_ch reference sortmerna_db main: // initial QC of raw reads - fastqcPre(illumina_input_ch) + fastqcPre(read_input_ch) // trim with fastp - fastp(illumina_input_ch) + fastp(read_input_ch) // QC after fastp fastqcPost(fastp.out.sample_trimmed) @@ -513,13 +522,54 @@ workflow preprocess { sample_bam_ch = hisat2.out.sample_bam fastp_json_report = fastp.out.json_report sortmerna_log - hisat2_log = hisat2.out.log - fastqcPre = fastqcPre.out.zip - fastqcPost = fastqcPost.out.zip + mapping_log = hisat2.out.log + readqcPre = fastqcPre.out.zip + readqcPost = fastqcPost.out.zip + cleaned_reads_ch = sortmerna_no_rna_fastq +} + +/*************************************** +Preprocess Illumina RNA-Seq reads: qc, trimming, adapters, rRNA-removal, mapping +*/ +workflow preprocess_nanopore { + take: + read_input_ch + reference + sortmerna_db + + main: + // initial QC of raw reads + nanoplot(read_input_ch) + + if ( params.skip_sortmerna ) { + sortmerna_no_rna_fastq = read_input_ch + sortmerna_log = Channel.empty() + } else { + // remove rRNA with SortmeRNA + sortmerna(read_input_ch, extract_tar_bz2(sortmerna_db)) + sortmerna_no_rna_fastq = sortmerna.out.no_rna_fastq + sortmerna_log = sortmerna.out.log + } + + // Minimap2 index + minimap2index(reference) + // map with Minimap2 + minimap2(sortmerna_no_rna_fastq, minimap2index.out, params.minimap2_additional_params) + // index BAM files + index_bam_minimap2(minimap2.out.sample_bam) + + emit: + sample_bam_ch = minimap2.out.sample_bam + fastp_json_report = Channel.empty() + sortmerna_log + mapping_log = minimap2.out.log + readqcPre = nanoplot.out.zip + readqcPost = Channel.empty() cleaned_reads_ch = sortmerna_no_rna_fastq } + /****************************************** Differential gene expression analysis using a genome reference */ @@ -528,9 +578,9 @@ workflow expression_reference_based { sample_bam_ch fastp_json_report sortmerna_log - hisat2_log - fastqcPre - fastqcPost + mapping_log + readqcPre + readqcPost annotation deg_comparisons_input_ch deseq2_script @@ -580,21 +630,21 @@ workflow expression_reference_based { .map { it.join(",") } // run DEseq2 - deseq2(regionReport_config, tpm_filter.out.filtered_counts, annotated_sample.condition.collect(), - annotated_sample.col_label.collect(), deseq2_comparisons, format_annotation.out, format_annotation_gene_rows.out, - annotated_sample.source.collect(), species_pathway_ch, deseq2_script, deseq2_id_type_ch, deseq2_script_refactor_reportingtools_table, - deseq2_script_improve_deseq_table) + // deseq2(regionReport_config, tpm_filter.out.filtered_counts, annotated_sample.condition.collect(), + // annotated_sample.col_label.collect(), deseq2_comparisons, format_annotation.out, format_annotation_gene_rows.out, + // annotated_sample.source.collect(), species_pathway_ch, deseq2_script, deseq2_id_type_ch, deseq2_script_refactor_reportingtools_table, + // deseq2_script_improve_deseq_table) // run MultiQC multiqc_sample_names(annotated_reads.map{ row -> row[0..-3]}.collect()) multiqc(multiqc_config, multiqc_sample_names.out, - fastp_json_report.collect(), + fastp_json_report.collect().ifEmpty([]), sortmerna_log.collect().ifEmpty([]), - hisat2_log.collect(), + mapping_log.collect(), featurecounts.out.log.collect(), - fastqcPre.collect(), - fastqcPost.collect(), + readqcPre.collect().ifEmpty([]), + readqcPost.collect().ifEmpty([]), tpm_filter.out.stats, params.tpm ) @@ -687,8 +737,12 @@ workflow { sortmerna_db = Channel.empty() } - // preprocess RNA-Seq reads - preprocess(illumina_input_ch, reference, sortmerna_db) + // preprocess RNA-Seq reads (Illumina or Nanopore) + if (!params.nanopore) { + preprocess_illumina(read_input_ch, reference, sortmerna_db) + } else { + preprocess_nanopore(read_input_ch, reference, sortmerna_db) + } // perform assembly & annotation if (params.assembly) { @@ -702,18 +756,33 @@ workflow { } else { // perform expression analysis // start reference-based differential gene expression analysis - expression_reference_based(preprocess.out.sample_bam_ch, - preprocess.out.fastp_json_report, - preprocess.out.sortmerna_log, - preprocess.out.hisat2_log, - preprocess.out.fastqcPre, - preprocess.out.fastqcPost, + if (!params.nanopore) { + expression_reference_based(preprocess_illumina.out.sample_bam_ch, + preprocess_illumina.out.fastp_json_report, + preprocess_illumina.out.sortmerna_log, + preprocess_illumina.out.mapping_log, + preprocess_illumina.out.readqcPre, + preprocess_illumina.out.readqcPost, annotation, deg_comparisons_input_ch, deseq2_script, deseq2_script_refactor_reportingtools_table, deseq2_script_improve_deseq_table, multiqc_config) + } else { + expression_reference_based(preprocess_nanopore.out.sample_bam_ch, + preprocess_nanopore.out.fastp_json_report, + preprocess_nanopore.out.sortmerna_log, + preprocess_nanopore.out.mapping_log, + preprocess_nanopore.out.readqcPre, + preprocess_nanopore.out.readqcPost, + annotation, + deg_comparisons_input_ch, + deseq2_script, + deseq2_script_refactor_reportingtools_table, + deseq2_script_improve_deseq_table, + multiqc_config) + } } } @@ -758,10 +827,12 @@ def helpMSG() { ${c_yellow}Preprocessing options:${c_reset} --mode Either 'single' (single-end) or 'paired' (paired-end) sequencing [default: $params.mode] - --fastp_additional_params additional parameters for fastp [default: $params.fastp_additional_params] + --fastp_additional_params Additional parameters for fastp [default: $params.fastp_additional_params] --skip_sortmerna Skip rRNA removal via SortMeRNA [default: $params.skip_sortmerna] - --hisat2_additional_params additional parameters for HISAT2 [default: $params.hisat2_additional_params] - --featurecounts_additional_params additional parameters for FeatureCounts [default: $params.featurecounts_additional_params] + --hisat2_additional_params Additional parameters for HISAT2 [default: $params.hisat2_additional_params] + --minimap2_additional_params Additional parameters for minimap2 (Nanopore input) [default: $params.minimap2_additional_params] + --featurecounts_additional_params Additional parameters for FeatureCounts [default: $params.featurecounts_additional_params] + --nanopore Activate Nanopore long-read mode (default is Illumina data) [default: $params.nanopore] ${c_yellow}DEG analysis options:${c_reset} --strand 0 (unstranded), 1 (stranded) and 2 (reversely stranded) [default: $params.strand] diff --git a/modules/featurecounts.nf b/modules/featurecounts.nf index 4ef63db..50eebd1 100644 --- a/modules/featurecounts.nf +++ b/modules/featurecounts.nf @@ -19,9 +19,15 @@ process featurecounts { script: if (params.mode == 'single') { - """ - featureCounts -T ${task.cpus} -s ${params.strand} -a ${annotation} -o ${name}.counts.tsv ${additionalParams} ${bam} - """ + if (params.nanopore == true) { + """ + featureCounts -L -T ${task.cpus} -s ${params.strand} -a ${annotation} -o ${name}.counts.tsv ${additionalParams} ${bam} + """ + } else { + """ + featureCounts -T ${task.cpus} -s ${params.strand} -a ${annotation} -o ${name}.counts.tsv ${additionalParams} ${bam} + """ + } } else { """ diff --git a/modules/minimap2.nf b/modules/minimap2.nf new file mode 100644 index 0000000..0f2f3bf --- /dev/null +++ b/modules/minimap2.nf @@ -0,0 +1,61 @@ +/************************************************************************ +* MINIMAP2 INDEX +************************************************************************/ +process minimap2index { + label 'minimap2' + input: + path(reference) + + output: + tuple path(reference), path("${reference.baseName}.mmi") + + script: + """ + minimap2 -x map-ont -d ${reference.baseName}.mmi ${reference} + """ +} + + +/************************************************************************ +* MINIMAP2 +************************************************************************/ +process minimap2 { + label 'minimap2' + + if ( params.softlink_results ) { publishDir "${params.output}/${params.minimap2_dir}", pattern: "*.sorted.bam" } + else { publishDir "${params.output}/${params.minimap2_dir}", mode: 'copy', pattern: "*.sorted.bam" } + + input: + tuple val(sample_name), path(reads) + tuple path(reference), path(index) + val(additionalParams) + + output: + tuple val(sample_name), path("${sample_name}.sorted.bam"), emit: sample_bam + path "${sample_name}_summary.log", emit: log + + script: + """ + minimap2 -a -t ${task.cpus} ${index} ${reads[0]} 2> ${sample_name}_summary.log | samtools view -bS | samtools sort -o ${sample_name}.sorted.bam -T tmp --threads ${task.cpus} + """ +} + + +process index_bam_minimap2 { + label 'minimap2' + label 'smallTask' + + if ( params.softlink_results ) { publishDir "${params.output}/${params.minimap2_dir}", pattern: "*.bai" } + else { publishDir "${params.output}/${params.minimap2_dir}", mode: 'copy', pattern: "*.bai" } + + input: + tuple val(sample_name), path(bam_file) + + output: + path("${bam_file}.bai") + + script: + """ + samtools index ${bam_file} + """ +} diff --git a/modules/nanoplot.nf b/modules/nanoplot.nf new file mode 100644 index 0000000..5031428 --- /dev/null +++ b/modules/nanoplot.nf @@ -0,0 +1,33 @@ +/************************************************************************ +* Nanoplot +************************************************************************/ +process nanoplot { + label 'nanoplot' + publishDir "${params.output}/${params.readqc_dir}/${name}/", mode: 'copy', pattern: "${name}_read_quality_report.html" + publishDir "${params.output}/${params.readqc_dir}/${name}/", mode: 'copy', pattern: "${name}_read_quality.txt" + publishDir "${params.output}/${params.readqc_dir}/${name}/figures", mode: 'copy', pattern: "*.png" + publishDir "${params.output}/${params.readqc_dir}/${name}/vector_figures", mode: 'copy', pattern: "*.pdf" + // addet ignore here - to avoid pipeline breaking + errorStrategy 'ignore' + input: + tuple val(name), path(reads) + output: + path("${name}_nanoplot.zip", emit: zip) optional true + tuple val(name), path("*.html"), path("*.pdf") optional true + tuple val(name), path("${name}_read_quality.txt"), path("*.png") optional true + script: + """ + NanoPlot -t ${task.cpus} --fastq ${reads} --title '${name}' --color darkslategrey --N50 --plots hex --loglength -f png --store + NanoPlot -t ${task.cpus} --pickle NanoPlot-data.pickle --title '${name}' --color darkslategrey --N50 --plots hex --loglength -f pdf + mv NanoPlot-report.html ${name}_read_quality_report.html + mv NanoStats.txt ${name}_read_quality.txt + zip ${name}_nanoplot.zip *.html *.pdf *.png ${name}_read_quality.txt + """ +} + +/* Comments: +We run nanoplot 2 times to get png and pdf files. +The second time its done via the pickle file of the previous run, to save computing time +*/ + + diff --git a/nextflow.config b/nextflow.config index 5503fe6..ed43850 100755 --- a/nextflow.config +++ b/nextflow.config @@ -24,6 +24,7 @@ params { deg = '' autodownload = '' pathway = '' + nanopore = false species = '' // deprecated reminder include_species = false // deprecated reminder @@ -34,6 +35,7 @@ params { tpm = 1 fastp_additional_params = '-5 -3 -W 4 -M 20 -l 15 -x -n 5 -z 6' hisat2_additional_params = '' + minimap2_additional_params = '' featurecounts_additional_params = '-t exon -g gene_id' // default values feature_id_type = 'ensembl_gene_id' @@ -48,6 +50,7 @@ params { fastp_dir = '01-Trimming/fastp' sortmerna_dir = '02-rRNARemoval/SortMeRNA' hisat2_dir = '03-Mapping/HISAT2' + minimap2_dir = '03-Mapping/MINIMAP2' featurecounts_dir = '04-Counting/featureCounts' tpm_filter_dir = '05-CountingFilter/TPM' annotation_dir = '06-Annotation' @@ -57,6 +60,7 @@ params { // if the UniRef90 option for dammit is activated via --dammit_uniref90, this parameter will be set to 'uniref90' for the dammit output uniref90_dir = '' multiqc_dir = 'Summary' + readqc_dir = 'Summary/NanoPlot' nf_runinfo_dir = 'Logs' // location for autodownload data like databases