diff --git a/tasks/utilities/data_export/task_broad_terra_tools.wdl b/tasks/utilities/data_export/task_broad_terra_tools.wdl index 898f1e7b3..3a3fba0fd 100644 --- a/tasks/utilities/data_export/task_broad_terra_tools.wdl +++ b/tasks/utilities/data_export/task_broad_terra_tools.wdl @@ -76,9 +76,6 @@ task export_taxon_tables { Float? nanoplot_r1_mean_q_clean Float? nanoplot_r1_median_q_clean Float? nanoplot_r1_est_coverage_clean - String? kmc_est_genome_length - File? kmc_kmer_stats - String? kmc_version String? rasusa_version File? tiptoft_plasmid_replicon_fastq String? tiptoft_plasmid_replicon_genes @@ -490,9 +487,6 @@ task export_taxon_tables { "nanoplot_r1_mean_q_clean": "~{nanoplot_r1_mean_q_clean}", "nanoplot_r1_median_q_clean": "~{nanoplot_r1_median_q_clean}", "nanoplot_r1_est_coverage_clean": "~{nanoplot_r1_est_coverage_clean}", - "kmc_est_genome_length": "~{kmc_est_genome_length}", - "kmc_kmer_stats": "~{kmc_kmer_stats}", - "kmc_version": "~{kmc_version}", "rasusa_version": "~{rasusa_version}", "tiptoft_plasmid_replicon_fastq": "~{tiptoft_plasmid_replicon_fastq}", "tiptoft_plasmid_replicon_genes": "~{tiptoft_plasmid_replicon_genes}", diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index 0e15f1b4f..c4349356d 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -631,7 +631,7 @@ - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 14565031f96d01ee6480bb0f9d19551d + md5sum: 4d69a6539b68503af9f3f1c2787ff920 - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl md5sum: 6d9dd969e2144ca23f2a0e101e6b6966 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl diff --git 
a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index 56c5b4e44..0e26098a8 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -594,7 +594,7 @@ - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl - md5sum: 14565031f96d01ee6480bb0f9d19551d + md5sum: 4d69a6539b68503af9f3f1c2787ff920 - path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl md5sum: 5aa25e4fad466f92c96a7c138aca0d20 - path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl diff --git a/workflows/theiacov/wf_theiacov_ont.wdl b/workflows/theiacov/wf_theiacov_ont.wdl index bb842cec3..7c961cbe2 100644 --- a/workflows/theiacov/wf_theiacov_ont.wdl +++ b/workflows/theiacov/wf_theiacov_ont.wdl @@ -12,7 +12,6 @@ import "../../tasks/species_typing/betacoronavirus/task_pangolin.wdl" as pangoli import "../../tasks/species_typing/lentivirus/task_quasitools.wdl" as quasitools import "../../tasks/task_versioning.wdl" as versioning import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_task -import "../../workflows/utilities/wf_influenza_antiviral_substitutions.wdl" as flu_antiviral import "../utilities/wf_flu_track.wdl" as run_flu_track import "../utilities/wf_organism_parameters.wdl" as set_organism_defaults import "../utilities/wf_read_QC_trim_ont.wdl" as read_qc_trim_workflow @@ -28,7 +27,7 @@ workflow theiacov_ont { # sequencing values String seq_method = "OXFORD_NANOPORE" File? 
primer_bed - # assembly parameters + # assembly parameters - sars-cov-2 specific Int normalise = 200 Int max_length = 700 Int min_length = 400 @@ -97,7 +96,7 @@ workflow theiacov_ont { input: read1 = read1, samplename = samplename, - genome_length = genome_length, + genome_length = organism_parameters.genome_length, min_length = min_length, max_length = max_length, run_prefix = run_prefix, diff --git a/workflows/theiaprok/wf_theiaprok_ont.wdl b/workflows/theiaprok/wf_theiaprok_ont.wdl index 569ab5a60..fecfb744e 100644 --- a/workflows/theiaprok/wf_theiaprok_ont.wdl +++ b/workflows/theiaprok/wf_theiaprok_ont.wdl @@ -273,9 +273,6 @@ workflow theiaprok_ont { nanoplot_r1_mean_q_clean = nanoplot_clean.mean_q, nanoplot_r1_median_q_clean = nanoplot_clean.median_q, nanoplot_r1_est_coverage_clean = nanoplot_clean.est_coverage, - kmc_est_genome_length = read_qc_trim.est_genome_length, - kmc_kmer_stats = read_qc_trim.kmc_kmer_stats, - kmc_version = read_qc_trim.kmc_version, rasusa_version = read_qc_trim.rasusa_version, tiptoft_plasmid_replicon_fastq = read_qc_trim.tiptoft_plasmid_replicon_fastq, tiptoft_plasmid_replicon_genes = read_qc_trim.tiptoft_plasmid_replicon_genes, @@ -588,10 +585,6 @@ workflow theiaprok_ont { String? kraken2_report = read_qc_trim.kraken_report String? kraken2_database = read_qc_trim.kraken_database String? kraken_docker = read_qc_trim.kraken_docker - # Read QC - kmc outputs - Int? kmc_est_genome_length = read_qc_trim.est_genome_length - File? kmc_kmer_stats = read_qc_trim.kmc_kmer_stats - String? kmc_version = read_qc_trim.kmc_version # Read QC - rasusa outputs String? 
rasusa_version = read_qc_trim.rasusa_version # Read QC - tiptoft outputs diff --git a/workflows/utilities/wf_read_QC_trim_ont.wdl b/workflows/utilities/wf_read_QC_trim_ont.wdl index 33469e9bb..c03141251 100644 --- a/workflows/utilities/wf_read_QC_trim_ont.wdl +++ b/workflows/utilities/wf_read_QC_trim_ont.wdl @@ -5,17 +5,20 @@ import "../../tasks/quality_control/read_filtering/task_artic_guppyplex.wdl" as import "../../tasks/quality_control/read_filtering/task_nanoq.wdl" as nanoq_task import "../../tasks/quality_control/read_filtering/task_ncbi_scrub.wdl" as ncbi_scrub import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken2 -import "../../tasks/utilities/task_kmc.wdl" as kmc_task import "../../tasks/utilities/task_rasusa.wdl" as rasusa_task workflow read_QC_trim_ont { meta { - description: "Runs basic QC on Oxford Nanopore (ONT) reads with (1) fastq_scan, (2) nanoplot, (3) kmc, (4) rasusa downsampling, (5) tiptoft plasmid detection, and (6) nanoq filtering" + description: "Runs basic QC on Oxford Nanopore (ONT) reads with (1) fastq_scan, (2) nanoplot, (3) rasusa downsampling, (4) tiptoft plasmid detection, and (5) nanoq filtering" } input { String samplename File read1 - Int? genome_length + + # kmc has been observed to be unreliable for genome length estimation, so we are now using a fixed value + # setting this to be 5Mb which is around .7Mb greater than the mean genome length of bacteria (based on https://github.com/CDCgov/phoenix/blob/717d19c19338373fc0f89eba30757fe5cfb3e18a/assets/databases/NCBI_Assembly_stats_20240124.txt) + # this default will not be used for TheiaCoV as that workflow series passes in the expected length based on the organism tag + Int genome_length = 5000000 String? workflow_series @@ -96,14 +99,6 @@ workflow read_QC_trim_ont { } if ((call_kraken) && ! 
defined(kraken_db)) { String kraken_db_warning = "Kraken database not defined" } - # kmc for genome size estimation - call kmc_task.kmc { - input: - read1 = read1, - samplename = samplename - } - - Int kmc_est_genome_length = if kmc.est_genome_length > 10000000 then 10000000 else kmc.est_genome_length # rasusa for random downsampling call rasusa_task.rasusa { @@ -111,7 +106,7 @@ workflow read_QC_trim_ont { read1 = read1, samplename = samplename, coverage = downsampling_coverage, - genome_length = select_first([genome_length, kmc_est_genome_length]) + genome_length = genome_length } # tiptoft for plasmid detection call tiptoft_task.tiptoft { @@ -145,12 +140,9 @@ workflow read_QC_trim_ont { File? kraken_report_dehosted = kraken2_recalculate_abundances_dehosted.kraken_report String kraken_database = select_first([kraken2_raw.database, kraken2_se.kraken2_database, kraken_db_warning, ""]) - # theiaprok outputs - # kmc outputs - Int? est_genome_length = kmc_est_genome_length - File? kmc_kmer_stats = kmc.kmer_stats - String? kmc_version = kmc.kmc_version - + # estimated genome length -- by default for TheiaProk this is 5Mb + Int est_genome_length = genome_length + # nanoq outputs File read1_clean = select_first([nanoq.filtered_read1, read_filtering.read1_clean]) String? nanoq_version = nanoq.version