Skip to content

Commit

Permalink
Merge pull request #64 from fischer-hub/feature_skip_prokka
Browse files Browse the repository at this point in the history
Feature skip prokka
  • Loading branch information
hoelzer authored Oct 16, 2024
2 parents db1639f + 0951182 commit 812b1f8
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 51 deletions.
5 changes: 4 additions & 1 deletion bin/mmseq2tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,10 @@ def chunks(data, size):
else:
blastTable[key].append((queryGene, targetGene, seqSim, orientation))

chunksize = int(len(blastTable) / cores)
if not blastTable:
sys.exit('blastTable dictionary is empty, please check mmseqs2 output TSV is not empty and there is at least one queryStrain != targetStrain.')

chunksize = int(len(blastTable) / cores) if int(len(blastTable) / cores) >= 1 else 1

for idx, item in enumerate(chunks(blastTable, chunksize)):
with open(f"mmseqs_compressed_chunk{idx}.pkl", 'wb') as f:
Expand Down
2 changes: 1 addition & 1 deletion modules/prokka.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ process prokka {

output:
file("${name}/${name}.gff")
tuple val(name), file("${name}/${name}.faa")
file("${name}/${name}.faa")
path("${name}", type: 'dir')

script:
Expand Down
1 change: 1 addition & 0 deletions modules/rename.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
process rename {
label 'basics'
publishDir "${params.output}/00-rename", mode: 'copy', pattern: "*_RENAMED.fasta"
tag "$name"

input:
tuple val(name), file(fasta)
Expand Down
4 changes: 2 additions & 2 deletions modules/roary.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

process roary {
label 'roary'
publishDir "${params.output}/02-roary", mode: 'copy', pattern: "${ident}"
publishDir "${params.output}/02-roary", mode: 'copy', pattern: "${ident}"

input:
tuple val(ident), file(gff)
Expand All @@ -12,7 +12,7 @@ process roary {

script:
"""
roary -e --mafft -p ${task.cpus} -v -i ${ident} -r *.gff -f "${ident}" &> ribap_roary_"${ident}".log
roary -e --mafft -p ${task.cpus} -v -i ${ident} -r *.gff* -f "${ident}" &> ribap_roary_"${ident}".log
"""
}

2 changes: 1 addition & 1 deletion modules/strain_ids.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ process strain_ids {

script:
"""
for ANNO in *.gff; do
for ANNO in *.gff*; do
BN=\$(basename "\$ANNO" .gff)
echo \$(grep -vm1 '^#' \$ANNO | awk '{print \$9}' | cut -d'=' -f2 | cut -d'_' -f1),"\$BN"
done > strain_ids.txt
Expand Down
2 changes: 2 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ params {
sets = false
tmlim = 240 // ILP solve time limit in seconds
chunks = 8 // how many ILP chunks for parallel computing
protein_fasta_file = ''
annotation_file = ''

// folder structure
output = 'results'
Expand Down
121 changes: 75 additions & 46 deletions ribap.nf
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ if ( workflow.profile.contains('conda') ) {
println " $params.condaCacheDir\u001B[0m"
}
if (params.profile) { exit 1, "--profile is WRONG use -profile" }
if (params.fasta == '' ) { exit 1, "input missing, use [--fasta]"}
if (params.fasta == '' && (!params.annotation_file || !params.protein_fasta_file)) { exit 1, "input missing, use [--fasta] or provide [--annotation_file, --protein_fasta_file]"}

if ( !workflow.revision ) {
println ""
Expand Down Expand Up @@ -74,6 +74,17 @@ if ( params.keepILPs ) {
println "\033[0;33mINFORMATION: ILPs and their intermediate results are deleted to save disk space (use --keepILPs to keep them).\033[0m"
}

if (params.annotation_file && !params.protein_fasta_file) {
println ""
exit 1, "Custom annotation file was found but no associated protein fasta file containing translated CDS was provided. Please provide a protein fasta file using --protein_fasta_file or run without --annotation_file, which will run Prokka annotation as part of the RIBAP workflow."
} else if (!params.annotation_file && params.protein_fasta_file) {
println ""
exit 1, "Protein fasta file was found but no associated custom annotation file was provided. Please provide an annotation file in GFF format using --annotation_file or run without --protein_fasta_file, which will run Prokka annotation as part of the RIBAP workflow."
} else if (params.annotation_file && params.protein_fasta_file && params.fasta) {
println ""
println "\033[0;33mINFORMATION: Custom annotation and protein fasta files were provided. Additionally an input fasta file was provided which will be ignored. Skipping Prokka annotation.\033[0m"
}

/**************************
* INPUT CHANNELS
**************************/
Expand Down Expand Up @@ -141,6 +152,9 @@ if (params.tree) {
include { iqtree } from './modules/iqtree'
}

//if (params.annotation_file && params.protein_fasta_file) include { gff_validate } from './modules/gff_validate'



/**************************
* WORKFLOW ENTRY POINT
Expand All @@ -150,31 +164,40 @@ if (params.tree) {

workflow RIBAP {

renamed_fasta_ch = rename(fasta_input_ch)

if (params.reference && params.list) {
prokka_input_ch = renamed_fasta_ch.join(reference_input_ch, remainder: true).map { id, id_renamed, fasta, gbk -> [id_renamed, fasta, gbk]}
if (params.annotation_file && params.protein_fasta_file){

gff_ch = Channel.fromPath(params.annotation_file, checkIfExists: true)
faa_ch = Channel.fromPath(params.protein_fasta_file, checkIfExists: true)
.collect()
} else {
// this will either produce a channel w/ [sample_RENAMED, fasta_RENAMED, reference_gbk] OR [sample_RENAMED, fasta_RENAMED, null]
prokka_input_ch = renamed_fasta_ch.combine(reference_input_ch).map { id, id_renamed, fasta, gbk -> [id_renamed, fasta, gbk]}
}

prokka(prokka_input_ch)
renamed_fasta_ch = rename(fasta_input_ch)

if (params.reference && params.list) {
prokka_input_ch = renamed_fasta_ch.join(reference_input_ch, remainder: true).map { id, id_renamed, fasta, gbk -> [id_renamed, fasta, gbk]}
} else {
// this will either produce a channel w/ [sample_RENAMED, fasta_RENAMED, reference_gbk] OR [sample_RENAMED, fasta_RENAMED, null]
prokka_input_ch = renamed_fasta_ch.combine(reference_input_ch).map { id, id_renamed, fasta, gbk -> [id_renamed, fasta, gbk]}
}

gff_ch = prokka.out[0]
faa_ch = prokka.out[1].collect()
prokka(prokka_input_ch)
gff_ch = prokka.out[0]
faa_ch = prokka.out[1].collect()
}

strain_ids(prokka.out[0].collect())
strain_ids(gff_ch.collect())

identity_ch = Channel.from(60, 70, 80, 90, 95)
roary_run_ch = identity_ch.combine(gff_ch).groupTuple()
roary(roary_run_ch)

mmseqs2(faa_ch)


ilp_refinement(
mmseqs2tsv(mmseqs2.out[0], strain_ids.out).flatten()
)
ilp_refinement(
mmseqs2tsv(mmseqs2.out[0], strain_ids.out).flatten()
)


// select only the 95 combined output file
Expand All @@ -194,7 +217,7 @@ workflow RIBAP {

// // select only the 95 combined output file
// identity_ch = Channel.from(95)
prepare_msa(identity_ch.join(combine_roary_ilp.out[0]), prokka.out[1].map { id, faa -> faa}.collect())
prepare_msa(identity_ch.join(combine_roary_ilp.out[0]), faa_ch)

// 50 alignments will be processed one after the other
nw_display(
Expand Down Expand Up @@ -252,41 +275,47 @@ def helpMSG() {
${c_dim} ..change above input to csv:${c_reset} ${c_green}--list ${c_reset}
${c_yellow}Params:${c_reset}
--tmlim Time limit for ILP solve [default: $params.tmlim]
--chunks Split ILPs into $params.chunks chunks for parallel computation [default: $params.chunks]
--gcode Genetic code for Prokka annotation [default: $params.gcode]
--reference A reference genbank (gbk, gb) file to guide functional annotation via Prokka.
Attention: when directly provided without the --list parameter, all input genomes
will be functionally annotated using the same reference. To use different reference files
or to exclude certain genomes from reference-based annotation use the --list option. [defaut: $params.reference]
--tree build tree based on the core genome?
Sure thing, We will use RAxML for this.
Be aware, this will take a lot of time. [default: $params.tree]
--bootstrap Bootstrap value for tree building (increases time!). Must be >=1000 for IQ-TREE ultra-fast bootstraps [default: $params.bootstrap]
--core_perc Define how many species are required so that a gene is considered a core gene for tree calculation.
Per default, RIBAP will only consider genes that were found in all input genomes (100%).
However, this can cause tree calculation to stop when there are no such core genes.
You can lower the threshold to include more homologous genes into the tree caclulation.
The total input genome number will be multiplied by this value and rounded down when the
results is <= x.5 (e.g., 28*0.9=25.2 --> 25) and up otherwise (e.g., 28*0.95=26.6 --> 27).
All RIBAP groups that are composed of genes from different species equal or greater this number
will be considered in the core gene MSA and tree [default: $params.core_perc]
--tmlim Time limit for ILP solve [default: $params.tmlim]
--chunks Split ILPs into $params.chunks chunks for parallel computation [default: $params.chunks]
--gcode Genetic code for Prokka annotation [default: $params.gcode]
--reference A reference genbank (gbk, gb) file to guide functional annotation via Prokka.
Attention: when directly provided without the --list parameter, all input genomes
will be functionally annotated using the same reference. To use different reference files
or to exclude certain genomes from reference-based annotation use the --list option. [defaut: $params.reference]
--tree build tree based on the core genome?
Sure thing, We will use RAxML for this.
Be aware, this will take a lot of time. [default: $params.tree]
--bootstrap Bootstrap value for tree building (increases time!). Must be >=1000 for IQ-TREE ultra-fast bootstraps [default: $params.bootstrap]
--core_perc Define how many species are required so that a gene is considered a core gene for tree calculation.
Per default, RIBAP will only consider genes that were found in all input genomes (100%).
However, this can cause tree calculation to stop when there are no such core genes.
You can lower the threshold to include more homologous genes into the tree caclulation.
The total input genome number will be multiplied by this value and rounded down when the
results is <= x.5 (e.g., 28*0.9=25.2 --> 25) and up otherwise (e.g., 28*0.95=26.6 --> 27).
All RIBAP groups that are composed of genes from different species equal or greater this number
will be considered in the core gene MSA and tree [default: $params.core_perc]
--annotation_file Custom annotation file(s) in GFF format. This skips the annotation with Prokka and uses your own annotation
file instead. Note that using this flag requires the usage of --protein_fasta_file too. [default: $params.annotation_file]
--protein_fasta_file Fasta file containing the translated amino acid sequences associated with the CDSs from the custom annotation file.
Note that this flag requires the usage of the --annotation_file flag. This will skip the Prokka annotation of the
workflow and uses your own annotation instead. If --list is set this
expects a CSV file of type 'samplename, path_to_protein_fasta_file'. [default: $params.protein_fasta_file]
${c_yellow}UpSet plot:${c_reset}
--sets FASTA simpleNames for genomes that should be
used in the UpSet plotting. Needed format:
"\\"Cav\\",\\"Cab\\",\\"Cga\\",\\"Ctr\\"" [default: $params.sets]
${c_dim}(sorry, this will be simplified someday)${c_reset}
--heigth Height of the plot [default: $params.heigth]
--width Width of the plot [default: $params.width]
--sets FASTA simpleNames for genomes that should be
used in the UpSet plotting. Needed format:
"\\"Cav\\",\\"Cab\\",\\"Cga\\",\\"Ctr\\"" [default: $params.sets]
${c_dim}(sorry, this will be simplified someday)${c_reset}
--heigth Height of the plot [default: $params.heigth]
--width Width of the plot [default: $params.width]
${c_yellow}Compute options:${c_reset}
--cores max cores used per process for local use [default: $params.cores]
--max_cores max cores used on the machine for local use [default: $params.max_cores]
--memory max memory for local use [default: $params.memory]
--output name of the result folder [default: $params.output]
--keepILPs the ILPs can take a lot (!) of space. Use this flag to keep them in the work dir if necessary.
Attention: You need to set this flag in order to -resume RIBAP w/o recalulating the ILPs. [default: $params.keepILPs]
--cores max cores used per process for local use [default: $params.cores]
--max_cores max cores used on the machine for local use [default: $params.max_cores]
--memory max memory for local use [default: $params.memory]
--output name of the result folder [default: $params.output]
--keepILPs the ILPs can take a lot (!) of space. Use this flag to keep them in the work dir if necessary.
Attention: You need to set this flag in order to -resume RIBAP w/o recalulating the ILPs. [default: $params.keepILPs]
${c_dim}Nextflow options:
-with-report rep.html cpu / ram usage (may cause errors)
Expand Down

0 comments on commit 812b1f8

Please sign in to comment.