diff --git a/BALSAMIC/constants/variant_filters.py b/BALSAMIC/constants/variant_filters.py index e3ddab449..df8fe2d40 100644 --- a/BALSAMIC/constants/variant_filters.py +++ b/BALSAMIC/constants/variant_filters.py @@ -91,6 +91,19 @@ "description": "General purpose filters used for filtering tnscope and tnhaplotyper", } +# Manta bcftools filters +MANTA_FILTER_SETTINGS = { + "low_pr_sr_count": { + "tag_value": 4, + "filter_name": "low_pr_sr_count", + "field": "FORMAT", + }, + "varcaller_name": "Manta", + "filter_type": "general", + "analysis_type": "tumor_only,tumor_normal", + "description": "Bcftools filters to set frequency and minimum read support for SV calls", +} + # Configuration for SVDB settings: SVDB_FILTER_SETTINGS = { "swegen_sv_freq": { diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index 2c7f0b04a..396c92e73 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -148,6 +148,10 @@ ] ), }, + "manta": { + "wgs_settings": "", + "tga_settings": "--exome", + }, "vardict": { "allelic_frequency": "0.001", "max_pval": "0.9", diff --git a/BALSAMIC/models/params.py b/BALSAMIC/models/params.py index e79d5d0ad..f7bbbe99b 100644 --- a/BALSAMIC/models/params.py +++ b/BALSAMIC/models/params.py @@ -2,6 +2,7 @@ from typing import Optional from pydantic import BaseModel, ConfigDict +from BALSAMIC.constants.analysis import SequencingType class ParamsCommon(BaseModel): @@ -24,6 +25,18 @@ class ParamsCommon(BaseModel): picard_RG_tumor: str +class ParamsManta(BaseModel): + """This class defines the params settings used as constants in Manta rule. + + Attributes: + wgs_settings: str(required). parameters for Manta analysis for WGS + tga_settings: str(required). parameters for Manta analysis for TGA + """ + + wgs_settings: str + tga_settings: str + + class ParamsVardict(BaseModel): """This class defines the params settings used as constants in vardict rule. @@ -149,15 +162,20 @@ class BalsamicWorkflowConfig(BaseModel): Attributes: common: global params defined across all rules in balsamic workflow + manta: params used in the manta rules umicommon: global params defined across specific rules in UMI workflow vep: global params defined in the rule vep vardict: params defined in the rule vardict umiextract : params defined in the rule sentieon_umiextract umiconsensuscall: params defined in the rule sentieon_consensuscall tnscope_umi: params defined in the rule sentieon_tnscope_umi + + Functions: + - get_manta_settings: Return setting for manta rule """ common: ParamsCommon + manta: ParamsManta vardict: ParamsVardict vep: ParamsVEP umicommon: UMIParamsCommon @@ -165,6 +183,12 @@ class BalsamicWorkflowConfig(BaseModel): umiconsensuscall: UMIParamsConsensuscall tnscope_umi: UMIParamsTNscope + def get_manta_settings(self, sequencing_type) -> str: + """Return correct setting for manta rules depending on sequencing type.""" + if sequencing_type == SequencingType.WGS: + return self.manta.wgs_settings + return self.manta.tga_settings + class VCFAttributes(BaseModel): """General purpose filter to manage various VCF attributes @@ -205,6 +229,7 @@ class VarCallerFilter(BaseModel): swegen_sv_freq: VCFAttributes (optional); maximum swegen sv allele frequency loqusdb_clinical_snv_freq: VCFAttributes (optional); maximum loqusdb clinical snv allele frequency loqusdb_clinical_sv_freq: VCFAttributes (optional); maximum loqusdb clinical sv allele frequency + low_pr_sr_count: VCFAttributes (optional); minumum Manta variant read support varcaller_name: str (required); variant caller name filter_type: str (required); filter name for variant caller analysis_type: str (required); analysis type e.g. tumor_normal or tumor_only @@ -224,6 +249,7 @@ class VarCallerFilter(BaseModel): swegen_sv_freq: Optional[VCFAttributes] = None loqusdb_clinical_snv_freq: Optional[VCFAttributes] = None loqusdb_clinical_sv_freq: Optional[VCFAttributes] = None + low_pr_sr_count: Optional[VCFAttributes] = None varcaller_name: str filter_type: str analysis_type: str diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule index 32d0ae2fb..6215b5c16 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_sv_filter.rule @@ -23,9 +23,8 @@ rule bcftools_filter_sv_research: "Filtering merged research structural and copy number variants using bcftools for {params.case_name}" shell: """ -bcftools view --threads {threads} -f .,PASS,MaxDepth {input.vcf_sv_research} |\ -bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f .,PASS,MaxDepth -O z -o {output.vcf_pass_svdb}; +bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' {input.vcf_sv_research} |\ +bcftools view --threads {threads} -f .,PASS -O z -o {output.vcf_pass_svdb}; tabix -p vcf -f {output.vcf_pass_svdb}; @@ -56,10 +55,9 @@ rule bcftools_filter_sv_clinical: shell: """ bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_sv_clinical} |\ -bcftools view --threads {threads} -f .,PASS,MaxDepth |\ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f .,PASS,MaxDepth -O z -o {output.vcf_pass_svdb}; +bcftools view --threads {threads} -f .,PASS -O z -o {output.vcf_pass_svdb}; tabix -p vcf -f {output.vcf_pass_svdb}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_quality_filter.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_quality_filter.rule index d4cdcd5e1..586d44a71 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_quality_filter.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_quality_filter.rule @@ -1,3 +1,4 @@ + rule bcftools_quality_filter_svdb: input: vcf_svdb = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.vcf.gz", @@ -15,7 +16,7 @@ rule bcftools_quality_filter_svdb: "Filtering merged research structural and copy number variants using bcftools for {params.case_name}" shell: """ -bcftools view --threads {threads} -f .,PASS,MaxDepth -o {output.vcf_pass_svdb_research} -O z {input.vcf_svdb}; +bcftools view --threads {threads} -f .,PASS -o {output.vcf_pass_svdb_research} -O z {input.vcf_svdb}; tabix -p vcf -f {output.vcf_pass_svdb_research}; - """ \ No newline at end of file + """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule index ff79d5805..cab852304 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal.rule @@ -16,11 +16,14 @@ rule manta_tumor_normal: Path(singularity_image, config["bioinfo_tools"].get("manta") + ".sif").as_posix() params: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + settings = params.get_manta_settings(sequencing_type=sequencing_type), runmode = "local", tumor = config_model.get_sample_name_by_type(SampleType.TUMOR), normal = config_model.get_sample_name_by_type(SampleType.NORMAL), case_name = case_id, - manta_install_path = "/opt/conda/share/manta-1.6.0-2" + manta_install_path = "/opt/conda/share/manta-1.6.0-2", + low_pr_sr_count_value = MANTA_FILTERS.low_pr_sr_count.tag_value, + low_pr_sr_count_filter_name = MANTA_FILTERS.low_pr_sr_count.filter_name, threads: get_threads(cluster_config, "manta_tumor_normal") message: @@ -31,6 +34,7 @@ rule manta_tumor_normal: samtools_path=$(readlink -f $(which samtools)) configManta.py \ +{params.settings} \ --normalBam={input.bamN} \ --tumorBam={input.bamT} \ --referenceFasta={input.fa} \ @@ -43,7 +47,9 @@ python {params.tmpdir}/runWorkflow.py -m {params.runmode} -j {threads}; {input.fa} \ {params.tmpdir}/results/variants/somaticSV.vcf.gz > {params.tmpdir}/results/variants/somaticSV_converted.vcf; -bgzip -l 9 -c {params.tmpdir}/results/variants/somaticSV_converted.vcf > {output.final}; +bgzip -l 9 {params.tmpdir}/results/variants/somaticSV_converted.vcf ; + +bcftools filter --threads {threads} --exclude 'SUM(FORMAT/PR[0:1]+FORMAT/SR[0:1]) < {params.low_pr_sr_count_value}' --soft-filter '{params.low_pr_sr_count_filter_name}' --mode '+' -o {output.final} -O z {params.tmpdir}/results/variants/somaticSV_converted.vcf.gz tabix -p vcf -f {output.final}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule index cb5b860f7..fa2bc7c9b 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only.rule @@ -15,10 +15,12 @@ rule manta_tumor_only: Path(singularity_image, config["bioinfo_tools"].get("manta") + ".sif").as_posix() params: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + settings = params.get_manta_settings(sequencing_type=sequencing_type), runmode = "local", tumor = config_model.get_sample_name_by_type(SampleType.TUMOR), case_name = config["analysis"]["case_id"], - manta_install_path= "/opt/conda/share/manta-1.6.0-2" + manta_install_path= "/opt/conda/share/manta-1.6.0-2", + low_pr_sr_count = [MANTA_FILTERS.low_pr_sr_count.tag_value,MANTA_FILTERS.low_pr_sr_count.filter_name], threads: get_threads(cluster_config, "manta_tumor_only") message: @@ -29,6 +31,7 @@ rule manta_tumor_only: samtools_path=$(readlink -f $(which samtools)) configManta.py \ +{params.settings} \ --tumorBam={input.bamT} \ --referenceFasta={input.fa} \ --runDir={params.tmpdir}; @@ -40,7 +43,9 @@ python {params.tmpdir}/runWorkflow.py -m {params.runmode} -j {threads}; {input.fa} \ {params.tmpdir}/results/variants/tumorSV.vcf.gz > {params.tmpdir}/results/variants/tumorSV_converted.vcf; -bgzip -l 9 -c {params.tmpdir}/results/variants/tumorSV_converted.vcf > {output.final}; +bgzip -l 9 {params.tmpdir}/results/variants/tumorSV_converted.vcf; + +bcftools filter --threads {threads} --exclude 'SUM(FORMAT/PR[0:1]+FORMAT/SR[0:1]) < {params.low_pr_sr_count[0]}' --soft-filter '{params.low_pr_sr_count[1]}' --mode '+' -o {output.final} -O z {params.tmpdir}/results/variants/tumorSV_converted.vcf.gz tabix -p vcf -f {output.final}; diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 25a1d6aee..57c414ca1 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -17,6 +17,7 @@ from BALSAMIC.constants.variant_filters import ( SENTIEON_VARCALL_SETTINGS, SVDB_FILTER_SETTINGS, VARDICT_SETTINGS, + MANTA_FILTER_SETTINGS, ) from BALSAMIC.constants.workflow_params import VARCALL_PARAMS, WORKFLOW_PARAMS, SLEEP_BEFORE_START from BALSAMIC.models.config import ConfigModel @@ -112,6 +113,19 @@ else: status_to_sample_id = "TUMOR" + "\\\\t" + tumor_sample +# Varcaller filter settings +COMMON_FILTERS = VarCallerFilter.model_validate(COMMON_SETTINGS) +VARDICT = VarCallerFilter.model_validate(VARDICT_SETTINGS) +SENTIEON_CALLER = VarCallerFilter.model_validate(SENTIEON_VARCALL_SETTINGS) +SVDB_FILTERS = VarCallerFilter.model_validate(SVDB_FILTER_SETTINGS) +MANTA_FILTERS = VarCallerFilter.model_validate(MANTA_FILTER_SETTINGS) + +# Fastp parameters +fastp_parameters: Dict = get_fastp_parameters(config_model) + +# parse parameters as constants to workflows +params = BalsamicWorkflowConfig.model_validate(WORKFLOW_PARAMS) + # vcfanno annotations research_annotations.append( { 'annotation': [{ @@ -201,19 +215,6 @@ if "swegen_sv_frequency" in config["reference"]: swegen_sv: str = get_swegen_sv(config) - -# Varcaller filter settings -COMMON_FILTERS = VarCallerFilter.model_validate(COMMON_SETTINGS) -VARDICT = VarCallerFilter.model_validate(VARDICT_SETTINGS) -SENTIEON_CALLER = VarCallerFilter.model_validate(SENTIEON_VARCALL_SETTINGS) -SVDB_FILTERS = VarCallerFilter.model_validate(SVDB_FILTER_SETTINGS) - -# Fastp parameters -fastp_parameters: Dict = get_fastp_parameters(config_model) - -# parse parameters as constants to workflows -params = BalsamicWorkflowConfig.model_validate(WORKFLOW_PARAMS) - # Capture kit name if config["analysis"]["sequencing_type"] != "wgs": capture_kit = os.path.split(config["panel"]["capture_kit"])[1] diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0accbe095..2364950be 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,15 @@ +[X.X.X] +------- + +Added: +^^^^^^ +* bcftools filters for PR:SR evidence in Manta calls +* "--exome" argument to Manta runs in TGA cases + +Removed: +^^^^^^^^ +* Extra bcftools filters that allows MaxDepth filtered variants in the final SV VCF + [13.0.1] ------- diff --git a/docs/balsamic_sv_cnv.rst b/docs/balsamic_sv_cnv.rst index 2a14b35db..c32a8b2e8 100644 --- a/docs/balsamic_sv_cnv.rst +++ b/docs/balsamic_sv_cnv.rst @@ -58,6 +58,7 @@ The copy number variants, identified using ascatNgs and `dellycnv`, are converte Tumor and normal calls in `TIDDIT` are merged using `SVDB` with `--bnd_distance 500` and `--overlap = 0.80`. Using a custom made script "filter_SVs.py", soft-filters are added to the calls based on the presence of the variant in the normal, with the goal of retaining only somatic variants as PASS. +Manta calls are filtered using bcftools to only keep variants that have evidence from 3 or more reads. .. list-table:: SV filters :widths: 25 25 40 @@ -78,6 +79,9 @@ Using a custom made script "filter_SVs.py", soft-filters are added to the calls * - TIDDIT - in_normal - ctg_n == True and AF_N_MAX == 0 and AF_T_MAX <= 0.25 + * - Manta + - low_pr_sr_count + - SUM(FORMAT/PR[0:1]+FORMAT/SR[0:1]) < 4.0 Further information regarding the TIDDIT tumor normal filtration: As translocation variants are represented by 2 BNDs in the VCF which allows for mixed assignment of soft-filters, a requirement for assigning soft-filters to translocations is that neither BND is PASS. @@ -138,7 +142,7 @@ The following filter applies for both tumor-normal and tumor-only samples in add Frq <= 0.02 (or) Frq == "." -The variants scored as `PASS` or `MaxDepth` are included in the final vcf file (`SNV.somatic..svdb..filtered.pass.vcf.gz`). +The variants scored as `PASS` are included in the final vcf file (`SNV.somatic..svdb..filtered.pass.vcf.gz`). The following command can be used to fetch the variants identified by a specific caller from merged structural and copy number variants. diff --git a/tests/models/test_params_models.py b/tests/models/test_params_models.py index 50dd10b52..94244602e 100644 --- a/tests/models/test_params_models.py +++ b/tests/models/test_params_models.py @@ -4,8 +4,12 @@ import pytest from pydantic import ValidationError +from BALSAMIC.constants.analysis import SequencingType +from BALSAMIC.constants.workflow_params import WORKFLOW_PARAMS +from BALSAMIC.models.params import BalsamicWorkflowConfig from BALSAMIC.models.config import VarcallerAttribute from BALSAMIC.models.params import ( + ParamsManta, ParamsVardict, ParamsVEP, QCModel, @@ -18,6 +22,45 @@ ) +def test_params_manta(): + """Test Manta settings model for correct validation.""" + + # GIVEN Manta params + test_manta_params = {"wgs_settings": "", "tga_settings": "--exome"} + + # WHEN building the model + test_manta_built = ParamsManta(**test_manta_params) + + # THEN string values should be correctly populated into the model + assert test_manta_built.tga_settings == "--exome" + assert test_manta_built.wgs_settings == "" + + +def test_get_manta_settings_tga(): + """Test get Manta settings based on sequencing type TGA.""" + + # GIVEN workflow params + params = BalsamicWorkflowConfig.model_validate(WORKFLOW_PARAMS) + + # WHEN getting manta settings for TGA + manta_settings = params.get_manta_settings(SequencingType.TARGETED) + + # THEN manta setting should specify exome argument + assert manta_settings == "--exome" + + +def test_get_manta_settings_wgs(): + """Test get Manta settings based on sequencing type WGS.""" + # GIVEN workflow params + params = BalsamicWorkflowConfig.model_validate(WORKFLOW_PARAMS) + + # WHEN getting manta settings for WGS + manta_settings = params.get_manta_settings(SequencingType.WGS) + + # THEN manta setting should be empty + assert manta_settings == "" + + def test_params_vardict(): """test UMIParamsVardict model for correct validation""" diff --git a/tests/test_data/config.json b/tests/test_data/config.json index a63bff715..507aa8aef 100644 --- a/tests/test_data/config.json +++ b/tests/test_data/config.json @@ -90,8 +90,8 @@ "single" ], "sequencing_type": [ - "targeted", - "wgs" + "wgs", + "targeted" ], "workflow_solution": [ "BALSAMIC"