diff --git a/.travis.yml b/.travis.yml index 3304f74bac2..d4293365916 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,8 @@ env: - TEST_TYPE=unit TEST_DOCKER=true TEST_VERBOSITY=minimal - TEST_TYPE=variantcalling TEST_DOCKER=true TEST_VERBOSITY=minimal - TEST_TYPE=python TEST_DOCKER=true TEST_VERBOSITY=minimal - - RUN_CNV_GERMLINE_WDL=true + - RUN_CNV_GERMLINE_COHORT_WDL=true + - RUN_CNV_GERMLINE_CASE_WDL=true - RUN_CNV_SOMATIC_WDL=true - RUN_M2_WDL=true - RUN_CNN_WDL=true @@ -100,7 +101,7 @@ before_install: sudo Rscript scripts/docker/gatkbase/install_R_packages.R; fi # Download Cromwell jar -- if you change the version, please change the CROMWELL_JAR env variable above, too. -- if [[ $RUN_CNV_GERMLINE_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_M2_WDL == true || $RUN_CNN_WDL == true ]]; then +- if [[ $RUN_CNV_GERMLINE_COHORT_WDL == true || $RUN_CNV_GERMLINE_CASE_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_M2_WDL == true || $RUN_CNN_WDL == true ]]; then wget -O $CROMWELL_JAR https://github.com/broadinstitute/cromwell/releases/download/30.2/cromwell-30.2.jar; fi # Download git lfs files @@ -116,7 +117,7 @@ install: else ./gradlew assemble; ./gradlew installDist; - if [[ $RUN_CNV_GERMLINE_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_M2_WDL == true || $RUN_CNN_WDL == true ]]; then + if [[ $RUN_CNV_GERMLINE_COHORT_WDL == true || $RUN_CNV_GERMLINE_CASE_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_M2_WDL == true || $RUN_CNN_WDL == true ]]; then echo "building a shadow jar for the wdl"; ./gradlew shadowJar; elif [[ $TEST_TYPE == cloud ]]; then @@ -131,9 +132,12 @@ script: echo "Not running any tests for nightly builds"; elif [[ $TRAVIS_SECURE_ENV_VARS == false && $TEST_TYPE == cloud ]]; then echo "Can't run cloud tests without keys so don't run tests"; - elif [[ $RUN_CNV_GERMLINE_WDL == true ]]; then - echo "Running CNV germline workflows"; - bash scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh; + elif [[ 
$RUN_CNV_GERMLINE_COHORT_WDL == true ]]; then + echo "Running CNV germline cohort workflow"; + travis_wait 60 bash scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh COHORT; + elif [[ $RUN_CNV_GERMLINE_CASE_WDL == true ]]; then + echo "Running CNV germline case workflow"; + travis_wait 60 bash scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh CASE; elif [[ $RUN_CNV_SOMATIC_WDL == true ]]; then echo "Running CNV somatic workflows"; bash scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh; diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_scattered_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_case_scattered_workflow.json new file mode 100644 index 00000000000..0b03bf35dd5 --- /dev/null +++ b/scripts/cnv_cromwell_tests/germline/cnv_germline_case_scattered_workflow.json @@ -0,0 +1,35 @@ +{ + "CNVGermlineCaseScatteredWorkflow.normal_bams": [ + "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam", + "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P2T_20xy-downsampled.bam", + "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P35_20xy-downsampled.bam" + ], + "CNVGermlineCaseScatteredWorkflow.normal_bais": [ + "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam.bai", + "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P2T_20xy-downsampled.bam.bai", + "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P35_20xy-downsampled.bam.bai"], + "CNVGermlineCaseScatteredWorkflow.contig_ploidy_model_tar": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-contig-ploidy-model.tar.gz", + 
"CNVGermlineCaseScatteredWorkflow.gatk_docker": "__GATK_DOCKER__", + "CNVGermlineCaseScatteredWorkflow.allosomal_contigs": ["X", "Y"], + "CNVGermlineCaseScatteredWorkflow.ref_copy_number_autosomal_contigs": 2, + "CNVGermlineCaseScatteredWorkflow.gcnv_disable_annealing": true, + "CNVGermlineCaseScatteredWorkflow.gcnv_initial_temperature": 1.0, + "CNVGermlineCaseScatteredWorkflow.gcnv_max_training_epochs": 1, + "CNVGermlineCaseScatteredWorkflow.gcnv_min_training_epochs": 1, + "CNVGermlineCaseScatteredWorkflow.gcnv_model_tars": [ + "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-0.tar.gz", + "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-1.tar.gz"], + "CNVGermlineCaseScatteredWorkflow.gcnv_num_thermal_advi_iters": 1, + "CNVGermlineCaseScatteredWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list", + "CNVGermlineCaseScatteredWorkflow.num_intervals_per_scatter": 30, + "CNVGermlineCaseScatteredWorkflow.num_samples_per_scatter_block": 2, + "CNVGermlineCaseScatteredWorkflow.gcnv_max_advi_iter_first_epoch": 10, + "CNVGermlineCaseScatteredWorkflow.gcnv_log_emission_sampling_rounds": 1, + "CNVGermlineCaseScatteredWorkflow.gcnv_log_emission_samples_per_round": 1, + "CNVGermlineCaseScatteredWorkflow.gcnv_max_advi_iter_subsequent_epochs": 1, + "CNVGermlineCaseScatteredWorkflow.gcnv_max_copy_number": 3, + "CNVGermlineCaseScatteredWorkflow.gcnv_max_calling_iters": 1, + "CNVGermlineCaseScatteredWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.dict", + "CNVGermlineCaseScatteredWorkflow.ref_fasta_fai": 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta.fai", + "CNVGermlineCaseScatteredWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta" +} \ No newline at end of file diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wes_do-gc_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wes_do-gc_workflow.json deleted file mode 100644 index 9943295d914..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wes_do-gc_workflow.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "CNVGermlineCaseWorkflow.bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam", - "CNVGermlineCaseWorkflow.bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam.bai", - "CNVGermlineCaseWorkflow.contig_ploidy_model_tar": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-contig-ploidy-model.tar.gz", - "CNVGermlineCaseWorkflow.gatk_docker": "__GATK_DOCKER__", - "CNVGermlineCaseWorkflow.allosomal_contigs": ["X", "Y"], - "CNVGermlineCaseWorkflow.ref_copy_number_autosomal_contigs": 2, - "CNVGermlineCaseWorkflow.gcnv_disable_annealing": true, - "CNVGermlineCaseWorkflow.gcnv_initial_temperature": 1.0, - "CNVGermlineCaseWorkflow.gcnv_max_training_epochs": 1, - "CNVGermlineCaseWorkflow.gcnv_min_training_epochs": 1, - "CNVGermlineCaseWorkflow.gcnv_model_tars": [ - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-0.tar.gz", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-1.tar.gz"], - 
"CNVGermlineCaseWorkflow.gcnv_num_thermal_advi_iters": 1, - "CNVGermlineCaseWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list", - "CNVGermlineCaseWorkflow.num_intervals_per_scatter": "100", - "CNVGermlineCaseWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineCaseWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlineCaseWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta" -} \ No newline at end of file diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wes_no-gc_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wes_no-gc_workflow.json deleted file mode 100644 index d6bd39228c2..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wes_no-gc_workflow.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "CNVGermlineCaseWorkflow.bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam", - "CNVGermlineCaseWorkflow.bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam.bai", - "CNVGermlineCaseWorkflow.contig_ploidy_model_tar": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-contig-ploidy-model.tar.gz", - "CNVGermlineCaseWorkflow.gatk_docker": "__GATK_DOCKER__", - "CNVGermlineCaseWorkflow.allosomal_contigs": ["X", "Y"], - "CNVGermlineCaseWorkflow.ref_copy_number_autosomal_contigs": 2, - "CNVGermlineCaseWorkflow.gcnv_disable_annealing": true, - 
"CNVGermlineCaseWorkflow.gcnv_initial_temperature": 1.0, - "CNVGermlineCaseWorkflow.gcnv_max_training_epochs": 1, - "CNVGermlineCaseWorkflow.gcnv_min_training_epochs": 1, - "CNVGermlineCaseWorkflow.gcnv_model_tars": [ - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-no-gc-gcnv-model-0.tar.gz", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-no-gc-gcnv-model-1.tar.gz"], - "CNVGermlineCaseWorkflow.gcnv_num_thermal_advi_iters": 1, - "CNVGermlineCaseWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list", - "CNVGermlineCaseWorkflow.num_intervals_per_scatter": "100", - "CNVGermlineCaseWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineCaseWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlineCaseWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta" -} \ No newline at end of file diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wgs_do-gc_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wgs_do-gc_workflow.json deleted file mode 100644 index 674f5941e53..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wgs_do-gc_workflow.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "CNVGermlineCaseWorkflow.bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam", - "CNVGermlineCaseWorkflow.bam_idx": 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam.bai", - "CNVGermlineCaseWorkflow.contig_ploidy_model_tar": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wgs-contig-ploidy-model.tar.gz", - "CNVGermlineCaseWorkflow.gatk_docker": "__GATK_DOCKER__", - "CNVGermlineCaseWorkflow.allosomal_contigs": ["X", "Y"], - "CNVGermlineCaseWorkflow.ref_copy_number_autosomal_contigs": 2, - "CNVGermlineCaseWorkflow.gcnv_disable_annealing": true, - "CNVGermlineCaseWorkflow.gcnv_initial_temperature": 1.0, - "CNVGermlineCaseWorkflow.gcnv_max_training_epochs": 1, - "CNVGermlineCaseWorkflow.gcnv_min_training_epochs": 1, - "CNVGermlineCaseWorkflow.gcnv_model_tars": [ - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wgs-do-gc-gcnv-model-0.tar.gz", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wgs-do-gc-gcnv-model-1.tar.gz"], - "CNVGermlineCaseWorkflow.gcnv_num_thermal_advi_iters": 1, - "CNVGermlineCaseWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/chr20xy.interval_list", - "CNVGermlineCaseWorkflow.num_intervals_per_scatter": "50", - "CNVGermlineCaseWorkflow.bin_length": "20000", - "CNVGermlineCaseWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineCaseWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlineCaseWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta" -} \ No newline at end of file diff --git 
a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wgs_no-gc_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wgs_no-gc_workflow.json deleted file mode 100644 index c9fdfe17362..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_wgs_no-gc_workflow.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "CNVGermlineCaseWorkflow.bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam", - "CNVGermlineCaseWorkflow.bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam.bai", - "CNVGermlineCaseWorkflow.contig_ploidy_model_tar": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wgs-contig-ploidy-model.tar.gz", - "CNVGermlineCaseWorkflow.gatk_docker": "__GATK_DOCKER__", - "CNVGermlineCaseWorkflow.allosomal_contigs": ["X", "Y"], - "CNVGermlineCaseWorkflow.ref_copy_number_autosomal_contigs": 2, - "CNVGermlineCaseWorkflow.gcnv_disable_annealing": true, - "CNVGermlineCaseWorkflow.gcnv_initial_temperature": 1.0, - "CNVGermlineCaseWorkflow.gcnv_max_training_epochs": 1, - "CNVGermlineCaseWorkflow.gcnv_min_training_epochs": 1, - "CNVGermlineCaseWorkflow.gcnv_model_tars": [ - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wgs-no-gc-gcnv-model-0.tar.gz", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wgs-no-gc-gcnv-model-1.tar.gz"], - "CNVGermlineCaseWorkflow.gcnv_num_thermal_advi_iters": 1, - "CNVGermlineCaseWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/chr20xy.interval_list", - "CNVGermlineCaseWorkflow.num_intervals_per_scatter": "50", - "CNVGermlineCaseWorkflow.bin_length": "20000", - "CNVGermlineCaseWorkflow.ref_fasta_dict": 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineCaseWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlineCaseWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta" -} \ No newline at end of file diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wes_no-gc_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wes_no-gc_workflow.json deleted file mode 100644 index fdb6a06883b..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wes_no-gc_workflow.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "CNVGermlineCohortWorkflow.cohort_entity_id": "wes-no-gc", - "CNVGermlineCohortWorkflow.contig_ploidy_priors": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/contig_ploidy_priors_chr20xy.tsv", - "CNVGermlineCohortWorkflow.gatk_docker": "__GATK_DOCKER__", - "CNVGermlineCohortWorkflow.allosomal_contigs": ["X", "Y"], - "CNVGermlineCohortWorkflow.ref_copy_number_autosomal_contigs": 2, - "CNVGermlineCohortWorkflow.gcnv_disable_annealing": true, - "CNVGermlineCohortWorkflow.gcnv_initial_temperature": 1.0, - "CNVGermlineCohortWorkflow.gcnv_max_training_epochs": 1, - "CNVGermlineCohortWorkflow.gcnv_min_training_epochs": 1, - "CNVGermlineCohortWorkflow.gcnv_num_thermal_advi_iters": 1, - "CNVGermlineCohortWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list", - "CNVGermlineCohortWorkflow.normal_bais": [ - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam.bai", - 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P2T_20xy-downsampled.bam.bai", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P35_20xy-downsampled.bam.bai"], - "CNVGermlineCohortWorkflow.normal_bams": [ - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P2T_20xy-downsampled.bam", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P35_20xy-downsampled.bam"], - "CNVGermlineCohortWorkflow.num_intervals_per_scatter": "100", - "CNVGermlineCohortWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineCohortWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlineCohortWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta" -} \ No newline at end of file diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wgs_do-gc_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wgs_do-gc_workflow.json deleted file mode 100644 index c4fdcabc570..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wgs_do-gc_workflow.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "CNVGermlineCohortWorkflow.cohort_entity_id": "wgs-do-gc", - "CNVGermlineCohortWorkflow.contig_ploidy_priors": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/contig_ploidy_priors_chr20xy.tsv", - 
"CNVGermlineCohortWorkflow.do_explicit_gc_correction": "true", - "CNVGermlineCohortWorkflow.gatk_docker": "__GATK_DOCKER__", - "CNVGermlineCohortWorkflow.allosomal_contigs": ["X", "Y"], - "CNVGermlineCohortWorkflow.ref_copy_number_autosomal_contigs": 2, - "CNVGermlineCohortWorkflow.gcnv_disable_annealing": true, - "CNVGermlineCohortWorkflow.gcnv_initial_temperature": 1.0, - "CNVGermlineCohortWorkflow.gcnv_max_training_epochs": 1, - "CNVGermlineCohortWorkflow.gcnv_min_training_epochs": 1, - "CNVGermlineCohortWorkflow.gcnv_num_thermal_advi_iters": 1, - "CNVGermlineCohortWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/chr20xy.interval_list", - "CNVGermlineCohortWorkflow.normal_bais": [ - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam.bai", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P2T_20xy-downsampled.bam.bai", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P35_20xy-downsampled.bam.bai"], - "CNVGermlineCohortWorkflow.normal_bams": [ - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P2T_20xy-downsampled.bam", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P35_20xy-downsampled.bam"], - "CNVGermlineCohortWorkflow.num_intervals_per_scatter": "50", - "CNVGermlineCohortWorkflow.bin_length": "20000", - "CNVGermlineCohortWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineCohortWorkflow.ref_fasta_fai": 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlineCohortWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta" -} \ No newline at end of file diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wgs_no-gc_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wgs_no-gc_workflow.json deleted file mode 100644 index 625f31766e0..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wgs_no-gc_workflow.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "CNVGermlineCohortWorkflow.cohort_entity_id": "wgs-no-gc", - "CNVGermlineCohortWorkflow.contig_ploidy_priors": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/contig_ploidy_priors_chr20xy.tsv", - "CNVGermlineCohortWorkflow.gatk_docker": "__GATK_DOCKER__", - "CNVGermlineCohortWorkflow.allosomal_contigs": ["X", "Y"], - "CNVGermlineCohortWorkflow.ref_copy_number_autosomal_contigs": 2, - "CNVGermlineCohortWorkflow.gcnv_disable_annealing": true, - "CNVGermlineCohortWorkflow.gcnv_initial_temperature": 1.0, - "CNVGermlineCohortWorkflow.gcnv_max_training_epochs": 1, - "CNVGermlineCohortWorkflow.gcnv_min_training_epochs": 1, - "CNVGermlineCohortWorkflow.gcnv_num_thermal_advi_iters": 1, - "CNVGermlineCohortWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/chr20xy.interval_list", - "CNVGermlineCohortWorkflow.normal_bais": [ - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam.bai", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P2T_20xy-downsampled.bam.bai", - 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P35_20xy-downsampled.bam.bai"], - "CNVGermlineCohortWorkflow.normal_bams": [ - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P2T_20xy-downsampled.bam", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P35_20xy-downsampled.bam"], - "CNVGermlineCohortWorkflow.num_intervals_per_scatter": "50", - "CNVGermlineCohortWorkflow.bin_length": "20000", - "CNVGermlineCohortWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineCohortWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlineCohortWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta" -} \ No newline at end of file diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wes_do-gc_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow.json similarity index 95% rename from scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wes_do-gc_workflow.json rename to scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow.json index e8541cf9952..c1611e2fd63 100644 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_wes_do-gc_workflow.json +++ b/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow.json @@ -19,7 +19,8 @@ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74NEG_20xy-downsampled.bam", 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P2T_20xy-downsampled.bam", "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/SM-74P35_20xy-downsampled.bam"], - "CNVGermlineCohortWorkflow.num_intervals_per_scatter": "100", + "CNVGermlineCohortWorkflow.num_intervals_per_scatter": 30, + "CNVGermlineCohortWorkflow.gcnv_max_copy_number": 3, "CNVGermlineCohortWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.dict", "CNVGermlineCohortWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta.fai", "CNVGermlineCohortWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/Homo_sapiens_assembly19.truncated.fasta" diff --git a/scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh b/scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh index 4122c6c349f..e31c712330b 100644 --- a/scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh +++ b/scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh @@ -1,10 +1,20 @@ #!/bin/bash -l set -e + +MODE=$1 +# We split up the test into CASE in COHORT to reduce overall travis runtime +if [[ "$MODE" != "COHORT" ]] && [[ "$MODE" != "CASE" ]]; then + echo "First argument to this scripts needs to be COHORT or CASE" + exit 1 +fi + + #cd in the directory of the script in order to use relative paths script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P ) cd "$script_path" ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/cnv_common_tasks.wdl +ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl WORKING_DIR=/home/travis/build/broadinstitute @@ -25,33 +35,16 @@ popd 
echo "Inserting docker image into json ========" CNV_CROMWELL_TEST_DIR="${WORKING_DIR}/gatk/scripts/cnv_cromwell_tests/germline/" -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_cohort_wes_no-gc_workflow.json >cnv_germline_cohort_wes_no-gc_workflow_mod.json -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_cohort_wgs_no-gc_workflow.json >cnv_germline_cohort_wgs_no-gc_workflow_mod.json -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_cohort_wes_do-gc_workflow.json >cnv_germline_cohort_wes_do-gc_workflow_mod.json -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_cohort_wgs_do-gc_workflow.json >cnv_germline_cohort_wgs_do-gc_workflow_mod.json -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_case_wes_no-gc_workflow.json >cnv_germline_case_wes_no-gc_workflow_mod.json -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_case_wgs_no-gc_workflow.json >cnv_germline_case_wgs_no-gc_workflow_mod.json -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_case_wes_do-gc_workflow.json >cnv_germline_case_wes_do-gc_workflow_mod.json -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_case_wgs_do-gc_workflow.json >cnv_germline_case_wgs_do-gc_workflow_mod.json - +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_cohort_workflow.json >cnv_germline_cohort_workflow_mod.json +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_case_scattered_workflow.json >cnv_germline_case_scattered_workflow_mod.json echo "Running ========" -# We disable some tests to reduce runtime on Travis 
- -# Cohort WES -#java -jar ${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl -i cnv_germline_cohort_wes_no-gc_workflow_mod.json -# Cohort WGS -#java -jar ${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl -i cnv_germline_cohort_wgs_no-gc_workflow_mod.json # Cohort WES w/ explicit GC correction -java -jar ${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl -i cnv_germline_cohort_wes_do-gc_workflow_mod.json -# Cohort WGS w/ explicit GC correction -#java -jar ${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl -i cnv_germline_cohort_wgs_do-gc_workflow_mod.json - -# Case WES -#java -jar ${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl -i cnv_germline_case_wes_no-gc_workflow_mod.json -# Case WGS -#java -jar ${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl -i cnv_germline_case_wgs_no-gc_workflow_mod.json -# Case WES w/ explicit GC correction -java -jar ${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl -i cnv_germline_case_wes_do-gc_workflow_mod.json -# Case WGS w/ explicit GC correction -#java -jar ${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl -i cnv_germline_case_wgs_do-gc_workflow_mod.json \ No newline at end of file +if [[ "$MODE" == "COHORT" ]]; then + java -jar ${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl -i cnv_germline_cohort_workflow_mod.json +fi + +# Scattered case WES w/ explicit GC correction +if [[ "$MODE" == "CASE" ]]; then + java -jar ${CROMWELL_JAR} run 
/home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl -i cnv_germline_case_scattered_workflow_mod.json +fi \ No newline at end of file diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl old mode 100755 new mode 100644 index 5172dd4251e..bc09133017f --- a/scripts/cnv_wdl/cnv_common_tasks.wdl +++ b/scripts/cnv_wdl/cnv_common_tasks.wdl @@ -242,6 +242,11 @@ task ScatterIntervals { } task PostprocessGermlineCNVCalls { + Array[File] calling_configs + Array[File] denoising_configs + Array[File] gcnvkernel_version + Array[File] sharded_interval_lists + String entity_id Array[File] gcnv_calls_tars Array[File] gcnv_model_tars @@ -273,12 +278,21 @@ task PostprocessGermlineCNVCalls { export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override} # untar calls to CALLS_0, CALLS_1, etc directories and build the command line + # also copy over shard config and interval files gcnv_calls_tar_array=(${sep=" " gcnv_calls_tars}) + calling_configs_array=(${sep=" " calling_configs}) + denoising_configs_array=(${sep=" " denoising_configs}) + gcnvkernel_version_array=(${sep=" " gcnvkernel_version}) + sharded_interval_lists_array=(${sep=" " sharded_interval_lists}) calls_args="" for index in ${dollar}{!gcnv_calls_tar_array[@]}; do gcnv_calls_tar=${dollar}{gcnv_calls_tar_array[$index]} - mkdir CALLS_$index - tar xzf $gcnv_calls_tar -C CALLS_$index + mkdir -p CALLS_$index/SAMPLE_${sample_index} + tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_${sample_index} + cp ${dollar}{calling_configs_array[$index]} CALLS_$index/ + cp ${dollar}{denoising_configs_array[$index]} CALLS_$index/ + cp ${dollar}{gcnvkernel_version_array[$index]} CALLS_$index/ + cp ${dollar}{sharded_interval_lists_array[$index]} CALLS_$index/ calls_args="$calls_args --calls-shard-path CALLS_$index" done diff --git a/scripts/cnv_wdl/germline/README.md b/scripts/cnv_wdl/germline/README.md index 645ed3aebf8..1e036e1a7ff 100644 --- 
a/scripts/cnv_wdl/germline/README.md +++ b/scripts/cnv_wdl/germline/README.md @@ -2,8 +2,9 @@ ### Which WDL should you use? -- Calling a cohort of samples and building a model for denoising further case samples: ``cnv_germline_cohort_workflow.wdl`` -- Calling a case sample using a previously built model for denoising: ``cnv_germline_case_workflow.wdl`` +- Cohort WDL: Calling a cohort of samples and building a model for denoising further case samples: ``cnv_germline_cohort_workflow.wdl`` +- Case WDL: Calling case samples using a previously built model for denoising: ``cnv_germline_case_workflow.wdl`` +- Scattered case WDL (recommended): Functionally equivalent to case WDL, written for reducing cloud compute cost (see below) and wall-clock time: ``cnv_germline_case_scattered_workflow.wdl`` #### Setting up parameter json file for a run @@ -38,8 +39,8 @@ Further explanation of other task-level parameters may be found by invoking the The reference, number of intervals per scatter, and bins (if specified) must be the same between cohort and case samples. -- ``CNVGermlineCaseWorkflow.bam`` -- Path to case BAM file. -- ``CNVGermlineCaseWorkflow.bam_idx`` -- Path to case BAM file index. +- ``CNVGermlineCaseWorkflow.normal_bais`` -- List of BAI files. This list must correspond to `normal_bams`. For example, `["Sample1.bai", "Sample2.bai"]`. +- ``CNVGermlineCaseWorkflow.normal_bams`` -- List of BAM files. This list must correspond to `normal_bais`. For example, `["Sample1.bam", "Sample2.bam"]`. - ``CNVGermlineCaseWorkflow.contig_ploidy_model_tar`` -- Path to tar of the contig-ploidy model directory generated by the DetermineGermlineContigPloidyCohortMode task. - ``CNVGermlineCaseWorkflow.gatk_docker`` -- GATK Docker image (e.g., ``broadinstitute/gatk:latest``). - ``CNVGermlineCaseWorkflow.gcnv_model_tars`` -- Array of paths to tars of the contig-ploidy model directories generated by the GermlineCNVCallerCohortMode tasks. 
@@ -52,3 +53,18 @@ The reference, number of intervals per scatter, and bins (if specified) must be In additional, there are several task-level parameters that may be set by advanced users as above. Further explanation of these task-level parameters may be found by invoking the ``--help`` documentation available in the gatk.jar for each tool. + +#### Required parameters in the scattered germline case workflow + +Same required parameters as in the germline case workflow. However, in order to reduce wall-clock time and compute cost, it is recommended to optimize for the following parameters: + +- ``CNVGermlineCaseScatteredWorkflow.num_samples_per_scatter_block`` -- (recommended WES value=25) number of samples to process in a single block; blocks of this size will be sent to the germline case workflow and processed in a batch; +- ``CNVGermlineCaseScatteredWorkflow.preemptible_attempts`` -- (recommended value=5) this reduces cost by using preemptible instances +- ``CNVGermlineCaseScatteredWorkflow.mem_gb_for_determine_germline_contig_ploidy`` -- amount of memory allotted for ploidy determination tasks (the lower the cheaper) +- ``CNVGermlineCaseScatteredWorkflow.cpu_for_determine_germline_contig_ploidy`` -- number of CPU cores allotted for ploidy determination tasks (the lower the cheaper) +- ``CNVGermlineCaseScatteredWorkflow.disk_for_determine_germline_contig_ploidy`` -- amount of storage allotted for ploidy determination tasks (the lower the cheaper) +- ``CNVGermlineCaseScatteredWorkflow.mem_gb_for_germline_cnv_caller`` -- amount of memory allotted for gCNV caller tasks (the lower the cheaper) +- ``CNVGermlineCaseScatteredWorkflow.cpu_for_germline_cnv_caller`` -- number of CPU cores allotted for gCNV caller tasks (the lower the cheaper) +- ``CNVGermlineCaseScatteredWorkflow.disk_for_germline_cnv_caller`` -- amount of storage allotted for gCNV caller tasks (the lower the cheaper) + +Note that lowering disk and memory too much will eventually lead to the workflow 
failing. Lowering the number of CPU cores could increase the wall-clock times. diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl new file mode 100644 index 00000000000..ce32bead65a --- /dev/null +++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl @@ -0,0 +1,233 @@ +# +# A wrapper for the gCNV case workflow intended for lowering computing cost by making it feasible to use +# preemptible cloud instances with low memory requirements. CPU, memory and disk requirements can be +# lowered for GermlineCNVCaller and DetermineGermlineContigPloidy tasks. +# +# +# - Example invocation: +# +# java -jar cromwell.jar run cnv_germline_case_scattered_workflow.wdl -i my_parameters.json +# +#################### + +import "cnv_germline_case_workflow.wdl" as GermlineCNVCaseWorkflow + +workflow CNVGermlineCaseScatteredWorkflow { + + ################################## + #### required basic arguments #### + ################################## + File intervals + Array[String]+ normal_bams + Array[String]+ normal_bais + File contig_ploidy_model_tar + Array[File]+ gcnv_model_tars + Int num_intervals_per_scatter + File ref_fasta_dict + File ref_fasta_fai + File ref_fasta + String gatk_docker + Int num_samples_per_scatter_block + + ################################## + #### optional basic arguments #### + ################################## + File? gatk4_jar_override + Int? preemptible_attempts + + #################################################### + #### optional arguments for PreprocessIntervals #### + #################################################### + Int? padding + Int? bin_length + + ############################################## + #### optional arguments for CollectCounts #### + ############################################## + String? collect_counts_format + Int? 
mem_gb_for_collect_counts + + ###################################################################### + #### optional arguments for DetermineGermlineContigPloidyCaseMode #### + ###################################################################### + Float? ploidy_mapping_error_rate + Float? ploidy_sample_psi_scale + Int? mem_gb_for_determine_germline_contig_ploidy + Int? cpu_for_determine_germline_contig_ploidy + Int? disk_for_determine_germline_contig_ploidy + + ########################################################## + #### optional arguments for GermlineCNVCallerCaseMode #### + ########################################################## + Float? gcnv_p_alt + Float? gcnv_cnv_coherence_length + Int? gcnv_max_copy_number + Int? mem_gb_for_germline_cnv_caller + Int? cpu_for_germline_cnv_caller + Int? disk_for_germline_cnv_caller + + # optional arguments for germline CNV denoising model + Float? gcnv_mapping_error_rate + Float? gcnv_sample_psi_scale + Float? gcnv_depth_correction_tau + String? gcnv_copy_number_posterior_expectation_mode + Int? gcnv_active_class_padding_hybrid_mode + + # optional arguments for Hybrid ADVI + Float? gcnv_learning_rate + Float? gcnv_adamax_beta_1 + Float? gcnv_adamax_beta_2 + Int? gcnv_log_emission_samples_per_round + Float? gcnv_log_emission_sampling_median_rel_error + Int? gcnv_log_emission_sampling_rounds + Int? gcnv_max_advi_iter_first_epoch + Int? gcnv_max_advi_iter_subsequent_epochs + Int? gcnv_min_training_epochs + Int? gcnv_max_training_epochs + Float? gcnv_initial_temperature + Int? gcnv_num_thermal_advi_iters + Int? gcnv_convergence_snr_averaging_window + Float? gcnv_convergence_snr_trigger_threshold + Int? gcnv_convergence_snr_countdown_window + Int? gcnv_max_calling_iters + Float? gcnv_caller_update_convergence_threshold + Float? gcnv_caller_internal_admixing_rate + Float? gcnv_caller_external_admixing_rate + Boolean? 
gcnv_disable_annealing + + ################################################### + #### arguments for PostprocessGermlineCNVCalls #### + ################################################### + Int ref_copy_number_autosomal_contigs + Array[String]? allosomal_contigs + + call SplitInputArray as SplitInputBamsList { + input: + input_array = normal_bams, + num_inputs_in_scatter_block = num_samples_per_scatter_block, + gatk_docker = gatk_docker + } + + call SplitInputArray as SplitInputBaisList { + input: + input_array = normal_bais, + num_inputs_in_scatter_block = num_samples_per_scatter_block, + gatk_docker = gatk_docker + } + + Array[Array[String]] split_bams = SplitInputBamsList.split_array + Array[Array[String]] split_bais = SplitInputBaisList.split_array + + scatter (subarray_index in range(length(split_bams))) { + call GermlineCNVCaseWorkflow.CNVGermlineCaseWorkflow { + input: + intervals = intervals, + normal_bams = split_bams[subarray_index], + normal_bais = split_bais[subarray_index], + contig_ploidy_model_tar = contig_ploidy_model_tar, + gcnv_model_tars = gcnv_model_tars, + num_intervals_per_scatter = num_intervals_per_scatter, + ref_fasta_dict = ref_fasta_dict, + ref_fasta_fai = ref_fasta_fai, + ref_fasta = ref_fasta, + gatk_docker = gatk_docker, + gatk4_jar_override = gatk4_jar_override, + preemptible_attempts = preemptible_attempts, + padding = padding, + bin_length = bin_length, + collect_counts_format = collect_counts_format, + mem_gb_for_collect_counts = mem_gb_for_collect_counts, + ploidy_mapping_error_rate = ploidy_mapping_error_rate, + ploidy_sample_psi_scale = ploidy_sample_psi_scale, + mem_gb_for_determine_germline_contig_ploidy = mem_gb_for_determine_germline_contig_ploidy, + cpu_for_determine_germline_contig_ploidy = cpu_for_determine_germline_contig_ploidy, + disk_for_determine_germline_contig_ploidy = disk_for_determine_germline_contig_ploidy, + gcnv_p_alt = gcnv_p_alt, + gcnv_cnv_coherence_length = gcnv_cnv_coherence_length, + gcnv_max_copy_number 
= gcnv_max_copy_number, + mem_gb_for_germline_cnv_caller = mem_gb_for_germline_cnv_caller, + cpu_for_germline_cnv_caller = cpu_for_germline_cnv_caller, + disk_for_germline_cnv_caller = disk_for_germline_cnv_caller, + gcnv_mapping_error_rate = gcnv_mapping_error_rate, + gcnv_sample_psi_scale = gcnv_sample_psi_scale, + gcnv_depth_correction_tau = gcnv_depth_correction_tau, + gcnv_copy_number_posterior_expectation_mode = gcnv_copy_number_posterior_expectation_mode, + gcnv_active_class_padding_hybrid_mode = gcnv_active_class_padding_hybrid_mode, + gcnv_learning_rate = gcnv_learning_rate, + gcnv_adamax_beta_1 = gcnv_adamax_beta_1, + gcnv_adamax_beta_2 = gcnv_adamax_beta_2, + gcnv_log_emission_samples_per_round = gcnv_log_emission_samples_per_round, + gcnv_log_emission_sampling_median_rel_error = gcnv_log_emission_sampling_median_rel_error, + gcnv_log_emission_sampling_rounds = gcnv_log_emission_sampling_rounds, + gcnv_max_advi_iter_first_epoch = gcnv_max_advi_iter_first_epoch, + gcnv_max_advi_iter_subsequent_epochs = gcnv_max_advi_iter_subsequent_epochs, + gcnv_min_training_epochs = gcnv_min_training_epochs, + gcnv_max_training_epochs = gcnv_max_training_epochs, + gcnv_initial_temperature = gcnv_initial_temperature, + gcnv_num_thermal_advi_iters = gcnv_num_thermal_advi_iters, + gcnv_convergence_snr_averaging_window = gcnv_convergence_snr_averaging_window, + gcnv_convergence_snr_trigger_threshold = gcnv_convergence_snr_trigger_threshold, + gcnv_convergence_snr_countdown_window = gcnv_convergence_snr_countdown_window, + gcnv_max_calling_iters = gcnv_max_calling_iters, + gcnv_caller_update_convergence_threshold = gcnv_caller_update_convergence_threshold, + gcnv_caller_internal_admixing_rate = gcnv_caller_internal_admixing_rate, + gcnv_caller_external_admixing_rate = gcnv_caller_external_admixing_rate, + gcnv_disable_annealing = gcnv_disable_annealing, + ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, + allosomal_contigs = allosomal_contigs + } + } + 
+ output { + Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar + Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars + Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars + Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf + Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf + } +} + +task SplitInputArray { + Array[String] input_array + Int num_inputs_in_scatter_block + String gatk_docker + + Int machine_mem_mb = 4000 + Int disk_space_gb = 20 + Int cpu = 1 + Int? preemptible_attempts + Boolean use_ssd = false + + File input_array_file = write_lines(input_array) + + # This task takes as input an array of strings and number of columns (num_inputs_in_scatter_block) + # and outputs a 2-dimensional reshaped array with same contents and with width equal to num_inputs_in_scatter_block + # (with last row potentially having a smaller length than others) + command <<< + python <>> + + runtime { + docker: "${gatk_docker}" + memory: machine_mem_mb + " MB" + disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD" + cpu: select_first([cpu, 8]) + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + Array[Array[String]] split_array = read_tsv("input_array_split.tsv") + } +} diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl index 945fad8a173..9300adf1282 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl @@ -1,4 +1,5 @@ -# Workflow for running GATK GermlineCNVCaller on a single case sample. Supports both WGS and WES. +# Workflow for running GATK GermlineCNVCaller on multiple case samples using a trained model (obtained from running +# GATK GermlineCNVCaller in the cohort mode). 
Supports both WGS and WES. # # Notes: # @@ -29,8 +30,8 @@ workflow CNVGermlineCaseWorkflow { ################################## File intervals File? blacklist_intervals - File bam - File bam_idx + Array[String]+ normal_bams + Array[String]+ normal_bais File contig_ploidy_model_tar Array[File]+ gcnv_model_tars Int num_intervals_per_scatter @@ -51,6 +52,12 @@ workflow CNVGermlineCaseWorkflow { Int? padding Int? bin_length + ############################################## + #### optional arguments for CollectCounts #### + ############################################## + String? collect_counts_format + Int? mem_gb_for_collect_counts + ###################################################################### #### optional arguments for DetermineGermlineContigPloidyCaseMode #### ###################################################################### @@ -58,6 +65,7 @@ workflow CNVGermlineCaseWorkflow { Float? ploidy_sample_psi_scale Int? mem_gb_for_determine_germline_contig_ploidy Int? cpu_for_determine_germline_contig_ploidy + Int? disk_for_determine_germline_contig_ploidy ########################################################## #### optional arguments for GermlineCNVCallerCaseMode #### @@ -67,6 +75,7 @@ workflow CNVGermlineCaseWorkflow { Int? gcnv_max_copy_number Int? mem_gb_for_germline_cnv_caller Int? cpu_for_germline_cnv_caller + Int? disk_for_germline_cnv_caller # optional arguments for germline CNV denoising model Float? gcnv_mapping_error_rate @@ -103,6 +112,8 @@ workflow CNVGermlineCaseWorkflow { Int ref_copy_number_autosomal_contigs Array[String]? 
allosomal_contigs + Array[Pair[String, String]] normal_bams_and_bais = zip(normal_bams, normal_bais) + call CNVTasks.PreprocessIntervals { input: intervals = intervals, @@ -117,17 +128,21 @@ workflow CNVGermlineCaseWorkflow { preemptible_attempts = preemptible_attempts } - call CNVTasks.CollectCounts { - input: - intervals = PreprocessIntervals.preprocessed_intervals, - bam = bam, - bam_idx = bam_idx, - ref_fasta = ref_fasta, - ref_fasta_fai = ref_fasta_fai, - ref_fasta_dict = ref_fasta_dict, - gatk4_jar_override = gatk4_jar_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts + scatter (normal_bam_and_bai in normal_bams_and_bais) { + call CNVTasks.CollectCounts { + input: + intervals = PreprocessIntervals.preprocessed_intervals, + bam = normal_bam_and_bai.left, + bam_idx = normal_bam_and_bai.right, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + ref_fasta_dict = ref_fasta_dict, + format = collect_counts_format, + gatk4_jar_override = gatk4_jar_override, + gatk_docker = gatk_docker, + mem_gb = mem_gb_for_collect_counts, + preemptible_attempts = preemptible_attempts + } } call DetermineGermlineContigPloidyCaseMode { @@ -138,6 +153,7 @@ workflow CNVGermlineCaseWorkflow { gatk_docker = gatk_docker, mem_gb = mem_gb_for_determine_germline_contig_ploidy, cpu = cpu_for_determine_germline_contig_ploidy, + disk_space_gb = disk_for_determine_germline_contig_ploidy, mapping_error_rate = ploidy_mapping_error_rate, sample_psi_scale = ploidy_sample_psi_scale, preemptible_attempts = preemptible_attempts @@ -195,29 +211,37 @@ workflow CNVGermlineCaseWorkflow { } } - call CNVTasks.PostprocessGermlineCNVCalls { - input: - entity_id = CollectCounts.entity_id, - gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar, - gcnv_model_tars = gcnv_model_tars, - allosomal_contigs = allosomal_contigs, - ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, - contig_ploidy_calls_tar = 
DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar, - sample_index = 0, - gatk4_jar_override = gatk4_jar_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts + Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCaseMode.gcnv_call_tars) + + scatter (sample_index in range(length(normal_bams))) { + call CNVTasks.PostprocessGermlineCNVCalls { + input: + calling_configs = GermlineCNVCallerCaseMode.calling_config_json, + denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list, + entity_id = CollectCounts.entity_id[sample_index], + gcnv_calls_tars = call_tars_sample_by_shard[sample_index], + gcnv_model_tars = gcnv_model_tars, + allosomal_contigs = allosomal_contigs, + ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, + contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar, + sample_index = sample_index, + gatk4_jar_override = gatk4_jar_override, + gatk_docker = gatk_docker, + preemptible_attempts = preemptible_attempts + } } output { File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals - File read_counts_entity_id = CollectCounts.entity_id - File read_counts = CollectCounts.counts + Array[File] read_counts_entity_id = CollectCounts.entity_id + Array[File] read_counts = CollectCounts.counts File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar - Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar + Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar - File genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf - File genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf + 
Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf + Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf } } @@ -274,7 +298,7 @@ task DetermineGermlineContigPloidyCaseMode { memory: machine_mem_mb + " MB" disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 8]) - preemptible: select_first([preemptible_attempts, 2]) + preemptible: select_first([preemptible_attempts, 5]) } output { @@ -340,6 +364,7 @@ task GermlineCNVCallerCaseMode { # If optional output_dir not specified, use "out" String output_dir_ = select_first([output_dir, "out"]) + Int num_samples = length(read_count_files) command <<< set -e mkdir ${output_dir_} @@ -390,7 +415,11 @@ task GermlineCNVCallerCaseMode { --caller-external-admixing-rate ${default="1.00" caller_external_admixing_rate} \ --disable-annealing ${default="false" disable_annealing} - tar czf case-gcnv-calls-${scatter_index}.tar.gz -C ${output_dir_}/case-calls . + CURRENT_SAMPLE=0 + while [ $CURRENT_SAMPLE -lt ${num_samples} ]; do + tar czf case-gcnv-shard-${scatter_index}-sample-$CURRENT_SAMPLE-gcnv-calls.tar.gz -C ${output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE . + let CURRENT_SAMPLE=CURRENT_SAMPLE+1 + done tar czf case-gcnv-tracking-${scatter_index}.tar.gz -C ${output_dir_}/case-tracking . 
>>> @@ -399,11 +428,15 @@ task GermlineCNVCallerCaseMode { memory: machine_mem_mb + " MB" disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD" cpu: select_first([cpu, 8]) - preemptible: select_first([preemptible_attempts, 2]) + preemptible: select_first([preemptible_attempts, 5]) } output { - File gcnv_calls_tar = "case-gcnv-calls-${scatter_index}.tar.gz" + File calling_config_json = "${output_dir_}/case-calls/calling_config.json" + File denoising_config_json = "${output_dir_}/case-calls/denoising_config.json" + File gcnvkernel_version_json = "${output_dir_}/case-calls/gcnvkernel_version.json" + File sharded_interval_list = "${output_dir_}/case-calls/interval_list.tsv" + Array[File] gcnv_call_tars = glob("*-gcnv-calls.tar.gz") File gcnv_tracking_tar = "case-gcnv-tracking-${scatter_index}.tar.gz" } } diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl index 8be52144d5c..e35b7c5fe44 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl @@ -29,8 +29,8 @@ workflow CNVGermlineCohortWorkflow { ################################## File intervals File? blacklist_intervals - Array[String] normal_bams - Array[String] normal_bais + Array[String]+ normal_bams + Array[String]+ normal_bais String cohort_entity_id File contig_ploidy_priors Int num_intervals_per_scatter @@ -65,7 +65,7 @@ workflow CNVGermlineCohortWorkflow { ############################################## #### optional arguments for CollectCounts #### ############################################## - String? format + String? collect_counts_format Int? 
mem_gb_for_collect_counts ######################################################################## @@ -173,7 +173,7 @@ workflow CNVGermlineCohortWorkflow { ref_fasta = ref_fasta, ref_fasta_fai = ref_fasta_fai, ref_fasta_dict = ref_fasta_dict, - format = format, + format = collect_counts_format, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_collect_counts, @@ -259,11 +259,17 @@ workflow CNVGermlineCohortWorkflow { } } + Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars) + scatter (sample_index in range(length(CollectCounts.entity_id))) { call CNVTasks.PostprocessGermlineCNVCalls { input: + calling_configs = GermlineCNVCallerCohortMode.calling_config_json, + denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list, entity_id = CollectCounts.entity_id[sample_index], - gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar, + gcnv_calls_tars = call_tars_sample_by_shard[sample_index], gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar, allosomal_contigs = allosomal_contigs, @@ -282,7 +288,7 @@ workflow CNVGermlineCohortWorkflow { File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar - Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar + Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf 
Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf @@ -420,6 +426,7 @@ task GermlineCNVCallerCohortMode { # If optional output_dir not specified, use "out" String output_dir_ = select_first([output_dir, "out"]) + Int num_samples = length(read_count_files) command <<< set -e @@ -480,7 +487,11 @@ task GermlineCNVCallerCohortMode { --disable-annealing ${default="false" disable_annealing} tar czf ${cohort_entity_id}-gcnv-model-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-model . - tar czf ${cohort_entity_id}-gcnv-calls-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls . + CURRENT_SAMPLE=0 + while [ $CURRENT_SAMPLE -lt ${num_samples} ]; do + tar czf ${cohort_entity_id}-shard-${scatter_index}-sample-$CURRENT_SAMPLE-gcnv-calls.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE . + let CURRENT_SAMPLE=CURRENT_SAMPLE+1 + done tar czf ${cohort_entity_id}-gcnv-tracking-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-tracking . 
>>> @@ -494,7 +505,11 @@ task GermlineCNVCallerCohortMode { output { File gcnv_model_tar = "${cohort_entity_id}-gcnv-model-${scatter_index}.tar.gz" - File gcnv_calls_tar = "${cohort_entity_id}-gcnv-calls-${scatter_index}.tar.gz" + File calling_config_json = "${output_dir_}/${cohort_entity_id}-calls/calling_config.json" + File denoising_config_json = "${output_dir_}/${cohort_entity_id}-calls/denoising_config.json" + File gcnvkernel_version_json = "${output_dir_}/${cohort_entity_id}-calls/gcnvkernel_version.json" + File sharded_interval_list = "${output_dir_}/${cohort_entity_id}-calls/interval_list.tsv" + Array[File] gcnv_call_tars = glob("*-gcnv-calls.tar.gz") File gcnv_tracking_tar = "${cohort_entity_id}-gcnv-tracking-${scatter_index}.tar.gz" } } diff --git a/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl b/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl index 1c837b006b6..73c2be6f8a3 100644 --- a/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl +++ b/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl @@ -66,7 +66,7 @@ workflow CNVSomaticPairWorkflow { ############################################## #### optional arguments for CollectCounts #### ############################################## - String? format + String? collect_counts_format Int? 
mem_gb_for_collect_counts ##################################################### @@ -165,7 +165,7 @@ workflow CNVSomaticPairWorkflow { ref_fasta = ref_fasta, ref_fasta_fai = ref_fasta_fai, ref_fasta_dict = ref_fasta_dict, - format = format, + format = collect_counts_format, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_collect_counts, @@ -295,7 +295,7 @@ workflow CNVSomaticPairWorkflow { ref_fasta = ref_fasta, ref_fasta_fai = ref_fasta_fai, ref_fasta_dict = ref_fasta_dict, - format = format, + format = collect_counts_format, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_collect_counts, diff --git a/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl b/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl index 0163477110a..8048808d79a 100644 --- a/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl +++ b/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl @@ -66,7 +66,7 @@ workflow CNVSomaticPanelWorkflow { ############################################## #### optional arguments for CollectCounts #### ############################################## - String? format + String? collect_counts_format Int? 
mem_gb_for_collect_counts ############################################################## @@ -125,7 +125,7 @@ workflow CNVSomaticPanelWorkflow { ref_fasta = ref_fasta, ref_fasta_fai = ref_fasta_fai, ref_fasta_dict = ref_fasta_dict, - format = format, + format = collect_counts_format, gatk4_jar_override = gatk4_jar_override, gatk_docker = gatk_docker, mem_gb = mem_gb_for_collect_counts, diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/postprocess/viterbi_segmentation.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/postprocess/viterbi_segmentation.py index 6660f6ef4df..e00c6cd04e9 100644 --- a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/postprocess/viterbi_segmentation.py +++ b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/postprocess/viterbi_segmentation.py @@ -21,11 +21,8 @@ class ViterbiSegmentationEngine: - """This class runs the forward-backward and Viterbi algorithm on gCNV model/calls shards, obtains - constant copy-number segments, calculates various quality metrics, and saves the result to disk. - - Note: - This class is callable. Upon calling, all samples in the call-set will be processed sequentially. + """This class runs the forward-backward and Viterbi algorithm on gCNV model/calls shards for a single sample, + obtains constant copy-number segments, calculates various quality metrics, and saves the result to disk. Note: It is assumed that the model and calls shards are provided in order according to the SAM sequence dictionary. @@ -35,20 +32,23 @@ def __init__(self, model_shards_paths: List[str], calls_shards_paths: List[str], sample_metadata_collection: SampleMetadataCollection, + sample_index: int, output_path: str): """Initializer. 
Args: model_shards_paths: list of paths to model shards calls_shards_paths: list of paths to calls shards - sample_metadata_collection: sample metadata collection (must contain all samples) + sample_metadata_collection: sample metadata collection (must contain sample being analyzed) + sample_index: index of the sample in the callset output_path: output path for writing segmentation results """ try: - self._validate_args(model_shards_paths, calls_shards_paths, sample_metadata_collection) + self._validate_args(model_shards_paths, calls_shards_paths, sample_metadata_collection, sample_index) except AssertionError as ex: raise Exception("Inconsistency detected in the provided model and calls shards.") from ex + self.sample_index = sample_index self.output_path = output_path self.calls_shards_paths = calls_shards_paths self.sample_metadata_collection = sample_metadata_collection @@ -69,8 +69,7 @@ def __init__(self, os.path.join(model_shards_paths[0], io_consts.default_interval_list_filename)) # sample names - self.sample_names = self._get_sample_names_from_calls_shard(calls_shards_paths[0]) - self.num_samples = len(self.sample_names) + self.sample_name = self._get_sample_name_from_calls_shard(calls_shards_paths[0], sample_index) # interval list metadata interval_list_metadata: IntervalListMetadata = IntervalListMetadata(self.interval_list) @@ -107,50 +106,23 @@ def __init__(self, self.get_copy_number_hmm_specs = HHMMClassAndCopyNumberBasicCaller\ .get_compiled_copy_number_hmm_specs_theano_func() - def __call__(self): - """Perform Viterbi segmentation for all samples, calculates segment qualities and writes the results - to disk. 
- """ - io_commons.assert_output_path_writable(self.output_path) - - # write configs and gcnvkernel version to output path - shutil.copy(os.path.join(self.calls_shards_paths[0], io_consts.default_denoising_config_json_filename), - self.output_path) - shutil.copy(os.path.join(self.calls_shards_paths[0], io_consts.default_calling_config_json_filename), - self.output_path) - io_commons.write_gcnvkernel_version(self.output_path) - - # write concatenated interval list to output path - io_intervals_and_counts.write_interval_list_to_tsv_file( - os.path.join(self.output_path, io_consts.default_interval_list_filename), - self.interval_list, - self.interval_list_sam_header_lines) - - for si in range(self.num_samples): - self.write_copy_number_segments_for_single_sample(si) - - def _viterbi_segments_generator_for_single_sample(self, sample_index: int)\ - -> Generator[IntegerCopyNumberSegment, None, None]: + def _viterbi_segments_generator(self) -> Generator[IntegerCopyNumberSegment, None, None]: """Performs Viterbi segmentation and segment quality calculation for a single sample in the call-set and returns a generator for segments. - Args: - sample_index: index of the sample in the collection - Returns: a generator for segments """ - assert 0 <= sample_index < self.num_samples, "Sample index is out of range." 
# load copy number log emission for the sample copy_number_log_emission_tc_shards = () for calls_path in self.calls_shards_paths: copy_number_log_emission_tc_shards += (self._get_log_copy_number_emission_tc_from_calls_shard( - calls_path, sample_index),) + calls_path, self.sample_index),) copy_number_log_emission_tc = np.concatenate(copy_number_log_emission_tc_shards, axis=0) # iterate over contigs and perform segmentation - sample_name = self.sample_names[sample_index] + sample_name = self.sample_name for contig_index, contig in enumerate(self.ordered_contig_list): _logger.info("Segmenting contig ({0}/{1}) (contig name: {2})...".format( contig_index + 1, len(self.ordered_contig_list), contig)) @@ -228,17 +200,14 @@ def _viterbi_segments_generator_for_single_sample(self, sample_index: int)\ yield segment - def write_copy_number_segments_for_single_sample(self, sample_index: int): + def write_copy_number_segments(self): """Performs Viterbi segmentation and segment quality calculation for a single sample in the call-set and saves the results to disk. - Args: - sample_index: sample index in the call-set """ - assert 0 <= sample_index < self.num_samples, "Sample index is out of range." 
- sample_name = self.sample_names[sample_index] - _logger.info("Processing sample index: {0}, sample name: {1}...".format(sample_index, sample_name)) - sample_output_path = os.path.join(self.output_path, io_consts.sample_folder_prefix + repr(sample_index)) + sample_name = self.sample_name + _logger.info("Processing sample index: {0}, sample name: {1}...".format(self.sample_index, sample_name)) + sample_output_path = os.path.join(self.output_path, io_consts.sample_folder_prefix + repr(self.sample_index)) io_commons.assert_output_path_writable(sample_output_path, try_creating_output_path=True) # write configs, gcnvkernel version and sample name to output path @@ -262,19 +231,21 @@ def write_copy_number_segments_for_single_sample(self, sample_index: int): of.write(IntegerCopyNumberSegment.get_header_column_string() + '\n') # add segments - for segment in self._viterbi_segments_generator_for_single_sample(sample_index): + for segment in self._viterbi_segments_generator(): of.write(repr(segment) + '\n') @staticmethod def _validate_args(model_shards_paths: List[str], calls_shards_paths: List[str], - sample_metadata_collection: SampleMetadataCollection): + sample_metadata_collection: SampleMetadataCollection, + sample_index: int): assert len(model_shards_paths) > 0, "At least one model shard must be provided." 
assert len(calls_shards_paths) == len(model_shards_paths),\ "The number of model shards ({0}) and calls shards ({1}) must match.".format( len(model_shards_paths), len(calls_shards_paths)) + assert sample_index >= 0, "Sample index must be an integer non-negative number" - scattered_sample_names: List[Tuple[str]] = [] + scattered_sample_names: List[str] = [] for model_path, calls_path in zip(model_shards_paths, calls_shards_paths): # assert interval lists are identical model_interval_list_file = os.path.join(model_path, io_consts.default_interval_list_filename) @@ -308,7 +279,8 @@ def _validate_args(model_shards_paths: List[str], "proceeding at your own risk!") # extract and store sample names for the current shard - scattered_sample_names.append(ViterbiSegmentationEngine._get_sample_names_from_calls_shard(calls_path)) + scattered_sample_names.append( + ViterbiSegmentationEngine._get_sample_name_from_calls_shard(calls_path, sample_index)) # all scattered calls have the same set of samples and in the same order assert len(set(scattered_sample_names)) == 1,\ @@ -319,19 +291,13 @@ def _validate_args(model_shards_paths: List[str], sample_metadata_collection.all_samples_have_ploidy_metadata(sample_names) @staticmethod - def _get_sample_names_from_calls_shard(calls_path: str) -> Tuple[str]: - sample_names: Tuple[str] = () - sample_index = 0 - while True: - sample_posteriors_path = io_denoising_calling.get_sample_posterior_path(calls_path, sample_index) - if not os.path.isdir(sample_posteriors_path): - break - sample_names += (io_commons.get_sample_name_from_txt_file(sample_posteriors_path),) - sample_index += 1 - if len(sample_names) == 0: - raise Exception("Could not find any sample posterior calls in {0}.".format(calls_path)) - else: - return tuple(sample_names) + def _get_sample_name_from_calls_shard(calls_path: str, sample_index: int) -> str: + sample_posteriors_path = io_denoising_calling.get_sample_posterior_path(calls_path, sample_index) + if not 
os.path.isdir(sample_posteriors_path): + raise Exception("Could not find any sample posterior calls in {0} for sample with index {1}.". + format(calls_path, sample_index)) + sample_name = io_commons.get_sample_name_from_txt_file(sample_posteriors_path) + return sample_name @staticmethod def _get_denoising_config(input_path: str) -> DenoisingModelConfig: diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/segment_gcnv_calls.py b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/segment_gcnv_calls.py index 49db66f4076..f38631a7c9f 100644 --- a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/segment_gcnv_calls.py +++ b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/segment_gcnv_calls.py @@ -69,5 +69,5 @@ logger.debug("Calls shards path(s): {0}".format(repr(args.model_shards))) viterbi_engine = gcnvkernel.ViterbiSegmentationEngine( - args.model_shards, args.calls_shards, sample_metadata_collection, args.output_path) - viterbi_engine.write_copy_number_segments_for_single_sample(args.sample_index) + args.model_shards, args.calls_shards, sample_metadata_collection, args.sample_index, args.output_path) + viterbi_engine.write_copy_number_segments() diff --git a/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list b/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list index 99384de09ae..ceefea343a4 100755 --- a/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list +++ b/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28cf6de883c0ce48de84df15e2cb65758658fcb801ddd1a705562b481563bd69 -size 8916 +oid sha256:19dbffe91b7662dc8210f74568a6ea3e44bb40f3902b00f35af6b089522fdb16 +size 2120 diff --git 
a/src/test/resources/large/cnv_germline_workflows_test_files/wes-contig-ploidy-model.tar.gz b/src/test/resources/large/cnv_germline_workflows_test_files/wes-contig-ploidy-model.tar.gz index 51c3e7373b8..866bd3914f3 100644 --- a/src/test/resources/large/cnv_germline_workflows_test_files/wes-contig-ploidy-model.tar.gz +++ b/src/test/resources/large/cnv_germline_workflows_test_files/wes-contig-ploidy-model.tar.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b18af3fa8aeed18c517c9aec2fac5cba80f948f8a50f1c50172621bdb7ff1c8f -size 2187 +oid sha256:78190124c8d8e344d92d4530846641664f556eacd48584f29f8e3722a6833622 +size 1244 diff --git a/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-0.tar.gz b/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-0.tar.gz index 211ef2f000d..df120d29127 100644 --- a/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-0.tar.gz +++ b/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-0.tar.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0c811739714898a6a3e848cd32a07a3688ef9c03f7414095daa6b45ab75ead1 -size 19238 +oid sha256:b258ab287c70c063a0bfc28bd18210e44b7551269b09a525656e97cd5c6530e4 +size 6771 diff --git a/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-1.tar.gz b/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-1.tar.gz index 84899b69eea..9d7204b1e22 100644 --- a/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-1.tar.gz +++ b/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-1.tar.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c8396faea3e3852e820f580d9e9e14ad3c4ee91711f215f090def5addaf1696 -size 11727 +oid sha256:26169b480ec60ff7ac4ad5d0f8fe2c396a4eb20a6e06dff71ac96a5c678a12b0 +size 3336