Merge pull request #64 from fischer-hub/feature_skip_prokka

Feature skip prokka
hoelzer-lab · Oct 16, 2024 · 812b1f8 · 812b1f8
2 parents db1639f + 0951182
commit 812b1f8
Show file tree

Hide file tree

Showing 7 changed files with 86 additions and 51 deletions.
diff --git a/bin/mmseq2tsv.py b/bin/mmseq2tsv.py
@@ -89,7 +89,10 @@ def chunks(data, size):
         else:
             blastTable[key].append((queryGene, targetGene, seqSim, orientation))
 
-chunksize = int(len(blastTable) / cores)
+if not blastTable:
+    sys.exit('blastTable dictionary is empty, please check mmseqs2 output TSV is not empty and there is at least one queryStrain != targetStrain.')
+
+chunksize = int(len(blastTable) / cores) if int(len(blastTable) / cores) >= 1 else 1
 
 for idx, item in enumerate(chunks(blastTable, chunksize)):
     with open(f"mmseqs_compressed_chunk{idx}.pkl", 'wb') as f:

diff --git a/modules/prokka.nf b/modules/prokka.nf
@@ -9,7 +9,7 @@ process prokka {
 
   output:
     file("${name}/${name}.gff")
-    tuple val(name), file("${name}/${name}.faa")
+    file("${name}/${name}.faa")
     path("${name}", type: 'dir')
 
   script:

diff --git a/modules/rename.nf b/modules/rename.nf
@@ -3,6 +3,7 @@
 process rename {
   label 'basics'
   publishDir "${params.output}/00-rename", mode: 'copy', pattern: "*_RENAMED.fasta" 
+  tag "$name"
 
   input: 
     tuple val(name), file(fasta)

diff --git a/modules/roary.nf b/modules/roary.nf
@@ -2,7 +2,7 @@
 
 process roary {
   label 'roary'
-  publishDir "${params.output}/02-roary", mode: 'copy', pattern: "${ident}" 
+  publishDir "${params.output}/02-roary", mode: 'copy', pattern: "${ident}"
 
   input: 
     tuple val(ident), file(gff)
@@ -12,7 +12,7 @@ process roary {
 
   script:
     """
-    roary -e --mafft -p ${task.cpus} -v -i ${ident} -r *.gff -f "${ident}" &> ribap_roary_"${ident}".log
+    roary -e --mafft -p ${task.cpus} -v -i ${ident} -r *.gff* -f "${ident}" &> ribap_roary_"${ident}".log
     """
 }
 
diff --git a/modules/strain_ids.nf b/modules/strain_ids.nf
@@ -12,7 +12,7 @@ process strain_ids {
 
   script:
     """
-    for ANNO in *.gff; do
+    for ANNO in *.gff*; do
       BN=\$(basename "\$ANNO" .gff)
       echo \$(grep -vm1 '^#' \$ANNO | awk '{print \$9}' | cut -d'=' -f2 | cut -d'_' -f1),"\$BN"
     done > strain_ids.txt

diff --git a/nextflow.config b/nextflow.config
@@ -26,6 +26,8 @@ params {
   sets = false
   tmlim = 240 // ILP solve time limit in seconds
   chunks = 8 // how many ILP chunks for parallel computing
+  protein_fasta_file = ''
+  annotation_file = ''
 
   // folder structure
   output = 'results'

diff --git a/ribap.nf b/ribap.nf
@@ -44,7 +44,7 @@ if ( workflow.profile.contains('conda') ) {
     println "  $params.condaCacheDir\u001B[0m"
 }
 if (params.profile) { exit 1, "--profile is WRONG use -profile" }
-if (params.fasta == '' ) { exit 1, "input missing, use [--fasta]"}
+if (params.fasta == '' && (!params.annotation_file || !params.protein_fasta_file)) { exit 1, "input missing, use [--fasta] or provide [--annotation_file, --protein_fasta_file]"}
 
 if ( !workflow.revision ) { 
     println ""
@@ -74,6 +74,17 @@ if ( params.keepILPs ) {
     println "\033[0;33mINFORMATION: ILPs and their intermediate results are deleted to save disk space (use --keepILPs to keep them).\033[0m"
 }
 
+if (params.annotation_file && !params.protein_fasta_file) {
+    println ""
+    exit 1, "Custom annotation file was found but no associated protein fasta file containing translated CDS was provided. Please provide a protein fasta file using --protein_fasta_file or run without --annotation_file, which will run Prokka annotation as part of the RIBAP workflow."
+} else if (!params.annotation_file && params.protein_fasta_file) {
+    println ""
+    exit 1, "Protein fasta file was found but no associated custom annotation file was provided. Please provide an annotation file in GFF format using --annotation_file or run without --protein_fasta_file, which will run Prokka annotation as part of the RIBAP workflow."
+} else if (params.annotation_file && params.protein_fasta_file && params.fasta) {
+    println ""
+    println "\033[0;33mINFORMATION: Custom annotation and protein fasta files were provided. Additionally an input fasta file was provided which will be ignored. Skipping Prokka annotation.\033[0m"
+}
+
 /************************** 
 * INPUT CHANNELS 
 **************************/
@@ -141,6 +152,9 @@ if (params.tree) {
   include { iqtree } from './modules/iqtree'
   }
 
+//if (params.annotation_file && params.protein_fasta_file) include { gff_validate } from './modules/gff_validate'
+
+
 
 /************************** 
 * WORKFLOW ENTRY POINT
@@ -150,31 +164,40 @@ if (params.tree) {
 
 workflow RIBAP {
 
-  renamed_fasta_ch = rename(fasta_input_ch)
 
-  if (params.reference && params.list) {
-    prokka_input_ch = renamed_fasta_ch.join(reference_input_ch, remainder: true).map { id, id_renamed, fasta, gbk -> [id_renamed, fasta, gbk]}
+  if (params.annotation_file && params.protein_fasta_file){
+
+      gff_ch = Channel.fromPath(params.annotation_file, checkIfExists: true)
+      faa_ch = Channel.fromPath(params.protein_fasta_file, checkIfExists: true)
+                      .collect()
   } else {
-    // this will either produce a channel w/ [sample_RENAMED, fasta_RENAMED, reference_gbk] OR [sample_RENAMED, fasta_RENAMED, null] 
-    prokka_input_ch = renamed_fasta_ch.combine(reference_input_ch).map { id, id_renamed, fasta, gbk -> [id_renamed, fasta, gbk]}
-  }
 
-  prokka(prokka_input_ch)
+    renamed_fasta_ch = rename(fasta_input_ch)
+
+    if (params.reference && params.list) {
+      prokka_input_ch = renamed_fasta_ch.join(reference_input_ch, remainder: true).map { id, id_renamed, fasta, gbk -> [id_renamed, fasta, gbk]}
+    } else {
+      // this will either produce a channel w/ [sample_RENAMED, fasta_RENAMED, reference_gbk] OR [sample_RENAMED, fasta_RENAMED, null] 
+      prokka_input_ch = renamed_fasta_ch.combine(reference_input_ch).map { id, id_renamed, fasta, gbk -> [id_renamed, fasta, gbk]}
+    }
 
-  gff_ch = prokka.out[0]
-  faa_ch = prokka.out[1].collect()
+    prokka(prokka_input_ch)
+    gff_ch = prokka.out[0]
+    faa_ch = prokka.out[1].collect()
+  }
 
-  strain_ids(prokka.out[0].collect())
+  strain_ids(gff_ch.collect())
 
   identity_ch = Channel.from(60, 70, 80, 90, 95)
   roary_run_ch = identity_ch.combine(gff_ch).groupTuple()
   roary(roary_run_ch)
 
   mmseqs2(faa_ch)
+
 
-    ilp_refinement(
-      mmseqs2tsv(mmseqs2.out[0], strain_ids.out).flatten()
-    )
+  ilp_refinement(
+    mmseqs2tsv(mmseqs2.out[0], strain_ids.out).flatten()
+  )
 
 
   // select only the 95 combined output file
@@ -194,7 +217,7 @@ workflow RIBAP {
 
   // // select only the 95 combined output file
   // identity_ch = Channel.from(95)
-  prepare_msa(identity_ch.join(combine_roary_ilp.out[0]), prokka.out[1].map { id, faa -> faa}.collect())
+  prepare_msa(identity_ch.join(combine_roary_ilp.out[0]), faa_ch)
 
   // 50 alignments will be processed one after the other
   nw_display(
@@ -252,41 +275,47 @@ def helpMSG() {
     ${c_dim}  ..change above input to csv:${c_reset} ${c_green}--list ${c_reset}            
 
     ${c_yellow}Params:${c_reset}
-    --tmlim             Time limit for ILP solve [default: $params.tmlim]
-    --chunks            Split ILPs into $params.chunks chunks for parallel computation [default: $params.chunks]
-    --gcode             Genetic code for Prokka annotation [default: $params.gcode]
-    --reference         A reference genbank (gbk, gb) file to guide functional annotation via Prokka.
-                        Attention: when directly provided without the --list parameter, all input genomes 
-                        will be functionally annotated using the same reference. To use different reference files
-                        or to exclude certain genomes from reference-based annotation use the --list option. [defaut: $params.reference]
-    --tree              build tree based on the core genome? 
-                        Sure thing, We will use RAxML for this. 
-                        Be aware, this will take a lot of time. [default: $params.tree]
-    --bootstrap         Bootstrap value for tree building (increases time!). Must be >=1000 for IQ-TREE ultra-fast bootstraps [default: $params.bootstrap] 
-    --core_perc         Define how many species are required so that a gene is considered a core gene for tree calculation.
-                        Per default, RIBAP will only consider genes that were found in all input genomes (100%).
-                        However, this can cause tree calculation to stop when there are no such core genes. 
-                        You can lower the threshold to include more homologous genes into the tree caclulation. 
-                        The total input genome number will be multiplied by this value and rounded down when the 
-                        results is <= x.5 (e.g., 28*0.9=25.2 --> 25) and up otherwise (e.g., 28*0.95=26.6 --> 27).
-                        All RIBAP groups that are composed of genes from different species equal or greater this number 
-                        will be considered in the core gene MSA and tree [default: $params.core_perc]
+    --tmlim               Time limit for ILP solve [default: $params.tmlim]
+    --chunks              Split ILPs into $params.chunks chunks for parallel computation [default: $params.chunks]
+    --gcode               Genetic code for Prokka annotation [default: $params.gcode]
+    --reference           A reference genbank (gbk, gb) file to guide functional annotation via Prokka.
+                          Attention: when directly provided without the --list parameter, all input genomes 
+                          will be functionally annotated using the same reference. To use different reference files
+                          or to exclude certain genomes from reference-based annotation use the --list option. [defaut: $params.reference]
+    --tree                build tree based on the core genome? 
+                          Sure thing, We will use RAxML for this. 
+                          Be aware, this will take a lot of time. [default: $params.tree]
+    --bootstrap           Bootstrap value for tree building (increases time!). Must be >=1000 for IQ-TREE ultra-fast bootstraps [default: $params.bootstrap] 
+    --core_perc           Define how many species are required so that a gene is considered a core gene for tree calculation.
+                          Per default, RIBAP will only consider genes that were found in all input genomes (100%).
+                          However, this can cause tree calculation to stop when there are no such core genes.
+                          You can lower the threshold to include more homologous genes into the tree caclulation. 
+                          The total input genome number will be multiplied by this value and rounded down when the 
+                          results is <= x.5 (e.g., 28*0.9=25.2 --> 25) and up otherwise (e.g., 28*0.95=26.6 --> 27).
+                          All RIBAP groups that are composed of genes from different species equal or greater this number 
+                          will be considered in the core gene MSA and tree [default: $params.core_perc]
+    --annotation_file     Custom annotation file(s) in GFF format. This skips the annotation with Prokka and uses your own annotation
+                          file instead. Note that using this flag requires the usage of --protein_fasta_file too. [default: $params.annotation_file] 
+    --protein_fasta_file  Fasta file containing the translated amino acid sequences associated with the CDSs from the custom annotation file. 
+                          Note that this flag requires the usage of the --annotation_file flag. This will skip the Prokka annotation of the 
+                          workflow and uses your own annotation instead. If --list is set this
+                          expects a CSV file of type 'samplename, path_to_protein_fasta_file'. [default: $params.protein_fasta_file]
 
     ${c_yellow}UpSet plot:${c_reset}
-    --sets              FASTA simpleNames for genomes that should be 
-                        used in the UpSet plotting. Needed format:
-                        "\\"Cav\\",\\"Cab\\",\\"Cga\\",\\"Ctr\\"" [default: $params.sets]
-                        ${c_dim}(sorry, this will be simplified someday)${c_reset}
-    --heigth            Height of the plot [default: $params.heigth]
-    --width             Width of the plot [default: $params.width]
+    --sets                FASTA simpleNames for genomes that should be 
+                          used in the UpSet plotting. Needed format:
+                          "\\"Cav\\",\\"Cab\\",\\"Cga\\",\\"Ctr\\"" [default: $params.sets]
+                          ${c_dim}(sorry, this will be simplified someday)${c_reset}
+    --heigth              Height of the plot [default: $params.heigth]
+    --width               Width of the plot [default: $params.width]
 
     ${c_yellow}Compute options:${c_reset}
-    --cores             max cores used per process for local use [default: $params.cores]
-    --max_cores         max cores used on the machine for local use [default: $params.max_cores]
-    --memory            max memory for local use [default: $params.memory]
-    --output            name of the result folder [default: $params.output]
-    --keepILPs          the ILPs can take a lot (!) of space. Use this flag to keep them in the work dir if necessary.
-                        Attention: You need to set this flag in order to -resume RIBAP w/o recalulating the ILPs. [default: $params.keepILPs]
+    --cores               max cores used per process for local use [default: $params.cores]
+    --max_cores           max cores used on the machine for local use [default: $params.max_cores]
+    --memory              max memory for local use [default: $params.memory]
+    --output              name of the result folder [default: $params.output]
+    --keepILPs            the ILPs can take a lot (!) of space. Use this flag to keep them in the work dir if necessary.
+                          Attention: You need to set this flag in order to -resume RIBAP w/o recalulating the ILPs. [default: $params.keepILPs]
 
     ${c_dim}Nextflow options:
     -with-report rep.html    cpu / ram usage (may cause errors)