-
Notifications
You must be signed in to change notification settings - Fork 596
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
hopefully what is needed for docker building proper
- Loading branch information
1 parent
8d66d26
commit 5914af0
Showing
21 changed files
with
2,715 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,276 @@ | ||
version 1.0 | ||
|
||
# run Mutect2 to get both training AND test datasets. The training dataset is preprocessed and combined with | ||
# high-quality labeled data to make a UDA dataset, then used to train an artifact model. The test dataset is used | ||
# for the posterior model and filtering. | ||
# note that the artifact model can be trained before the Mutect2 workflow runs FilterMutectCalls | ||
import "https://api.firecloud.org/ga4gh/v1/tools/davidben:mutect2/versions/18/plain-WDL/descriptor" as m2 | ||
import "https://api.firecloud.org/ga4gh/v1/tools/davidben:permutect-uda-dataset/versions/3/plain-WDL/descriptor" as uda | ||
import "https://api.firecloud.org/ga4gh/v1/tools/davidben:permutect-train-artifact-model/versions/13/plain-WDL/descriptor" as training | ||
import "https://api.firecloud.org/ga4gh/v1/tools/davidben:permutect-call-variants/versions/19/plain-WDL/descriptor" as calling | ||
|
||
# End-to-end workflow: call variants with Mutect2 (emitting Permutect training and test
# datasets), preprocess the training dataset, merge it with a labeled source dataset into a
# UDA dataset, train an artifact model on that, and finally filter the Mutect2 calls with the
# trained model.
workflow CallVariantsWithUDA {
    input {
        # ---- reference and reads handed to Mutect2 ----
        File ref_fasta
        File ref_fai
        File ref_dict
        File primary_bam
        File primary_bai
        File? control_bam
        File? control_bai

        # ---- optional Mutect2 resources and switches ----
        File? intervals
        File? masked_intervals
        File? gnomad
        File? gnomad_idx
        File? dragstr_model
        String? m2_extra_args
        Boolean make_bamout = false
        # NOTE: compress_vcfs is declared but not forwarded to any call in this workflow
        Boolean compress_vcfs = false

        # ---- Mutect2 filtering ----
        Boolean skip_m2_filtering
        File? variants_for_contamination
        File? variants_for_contamination_idx
        File? realignment_index_bundle
        String? realignment_extra_args
        Boolean? run_orientation_bias_mixture_model_filter

        # ---- preprocessing ----
        Int chunk_size

        # ---- shared by artifact-model training and the posterior model ----
        Int batch_size
        Int inference_batch_size
        Int num_workers
        Int? gpu_count
        Int? training_mem

        # ---- UDA dataset construction and training ----
        File base_model
        File source_train_tar
        String source_edit_type = "keep_everything"
        String target_edit_type = "unlabel_everything"
        Int num_epochs
        Int num_calibration_epochs
        Float dropout_p
        Array[Int] aggregation_layers
        Array[Int] calibration_layers
        String? training_extra_args
        Boolean learn_artifact_spectra
        Float? genomic_span

        # ---- Permutect filtering / posterior model ----
        # truth VCF (and index) are used for evaluation
        File? test_dataset_truth_vcf
        File? test_dataset_truth_vcf_idx
        Int? num_spectrum_iterations
        Float? spectrum_learning_rate
        String? permutect_filtering_extra_args
        String bcftools_docker = "us.gcr.io/broad-dsde-methods/davidben/bcftools"
        # NOTE: not referenced by any call below; per its name it is meant to stay unset
        File? obscene_hack_leave_unset

        # ---- runtime ----
        String gatk_docker
        String permutect_docker
        File? gatk_override
        # NOTE: basic_bash_docker, learn_read_orientation_mem and filter_alignment_artifacts_mem
        # are declared here but not forwarded to any call in this workflow
        String basic_bash_docker = "ubuntu:16.04"
        Int scatter_count
        Int preemptible = 2
        Int max_retries = 1
        Int small_task_cpu = 2
        Int small_task_mem = 4
        Int small_task_disk = 100
        Int boot_disk_size = 12
        Int learn_read_orientation_mem = 8000
        Int filter_alignment_artifacts_mem = 9000
        String? gcs_project_for_requester_pays

        # Last-resort knob: extra disk added to every task when the data misbehaves
        Int emergency_extra_disk = 0
    }

    # Step 1: Mutect2, asked to emit BOTH the training and the test dataset.
    # Filtering may be skipped (skip_m2_filtering) so that UDA artifact-model training can
    # start right away; the only M2 filtering that may be needed is contamination, and even
    # that may be skipped.
    call m2.Mutect2 {
        input:
            # reference
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,

            # reads: primary sample is the "tumor", control sample is the "normal"
            tumor_reads = primary_bam,
            tumor_reads_index = primary_bai,
            normal_reads = control_bam,
            normal_reads_index = control_bai,

            # calling resources and options
            intervals = intervals,
            masked_intervals = masked_intervals,
            gnomad = gnomad,
            gnomad_idx = gnomad_idx,
            dragstr_model = dragstr_model,
            m2_extra_args = m2_extra_args,
            make_bamout = make_bamout,

            # filtering
            skip_filtering = skip_m2_filtering,
            variants_for_contamination = variants_for_contamination,
            variants_for_contamination_idx = variants_for_contamination_idx,
            realignment_index_bundle = realignment_index_bundle,
            realignment_extra_args = realignment_extra_args,
            run_orientation_bias_mixture_model_filter = run_orientation_bias_mixture_model_filter,

            # Permutect datasets
            make_permutect_training_dataset = true,
            make_permutect_test_dataset = true,
            permutect_test_dataset_truth_vcf = test_dataset_truth_vcf,
            permutect_test_dataset_truth_vcf_idx = test_dataset_truth_vcf_idx,

            # runtime
            gatk_docker = gatk_docker,
            gatk_override = gatk_override,
            scatter_count = scatter_count,
            preemptible = preemptible,
            max_retries = max_retries,
            small_task_cpu = small_task_cpu,
            small_task_mem = small_task_mem,
            small_task_disk = small_task_disk,
            boot_disk_size = boot_disk_size,
            gcs_project_for_requester_pays = gcs_project_for_requester_pays,
            emergency_extra_disk = emergency_extra_disk
    }

    # Step 2: preprocess the Mutect2 training dataset.
    # select_first unwraps the optional Mutect2 output into a required File.
    call Preprocess {
        input:
            training_dataset = select_first([Mutect2.permutect_training_dataset]),
            chunk_size = chunk_size,
            permutect_docker = permutect_docker
    }

    # Step 3: merge the labeled source tarball with the preprocessed target data into one
    # UDA dataset. Preemption and retries are switched off for this call.
    call uda.PermutectUDADataset {
        input:
            source_train_tar = source_train_tar,
            target_train_tar = Preprocess.train_tar,
            source_edit_type = source_edit_type,
            target_edit_type = target_edit_type,
            chunk_size = chunk_size,
            permutect_docker = permutect_docker,
            preemptible = 0,
            max_retries = 0
    }

    # Step 4: train the artifact model on the UDA dataset. Preemption and retries are
    # switched off for this call as well.
    call training.TrainPermutect {
        input:
            train_tar = PermutectUDADataset.uda_train_tar,
            base_model = base_model,

            # optimisation schedule and architecture
            num_epochs = num_epochs,
            num_calibration_epochs = num_calibration_epochs,
            dropout_p = dropout_p,
            aggregation_layers = aggregation_layers,
            calibration_layers = calibration_layers,
            learn_artifact_spectra = learn_artifact_spectra,
            genomic_span = genomic_span,
            extra_args = training_extra_args,

            # batching and resources
            batch_size = batch_size,
            inference_batch_size = inference_batch_size,
            num_workers = num_workers,
            mem = training_mem,
            gpu_count = gpu_count,

            permutect_docker = permutect_docker,
            preemptible = 0,
            max_retries = 0
    }

    # Step 5: Mutect2 has already run, so only the post-Mutect2 tasks of the calling
    # workflow are reused from here on.
    call calling.SplitMultiallelics {
        input:
            input_vcf = Mutect2.output_vcf,
            input_vcf_idx = Mutect2.output_vcf_idx,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            bcftools_docker = bcftools_docker
    }

    call calling.IndexVCF as IndexAfterSplitting {
        input:
            unindexed_vcf = SplitMultiallelics.output_vcf,
            gatk_docker = gatk_docker
    }

    # Step 6: filter with the freshly trained artifact model and the Mutect2 test dataset.
    call calling.PermutectFiltering {
        input:
            mutect2_vcf = IndexAfterSplitting.vcf,
            mutect2_vcf_idx = IndexAfterSplitting.vcf_index,
            permutect_model = TrainPermutect.artifact_model,
            test_dataset = select_first([Mutect2.permutect_test_dataset]),

            # side tables produced by Mutect2
            contigs_table = Mutect2.permutect_contigs_table,
            maf_segments = Mutect2.maf_segments,
            mutect_stats = Mutect2.mutect_stats,

            # posterior-model settings
            num_spectrum_iterations = num_spectrum_iterations,
            spectrum_learning_rate = spectrum_learning_rate,
            permutect_filtering_extra_args = permutect_filtering_extra_args,

            # batching and resources
            batch_size = batch_size,
            num_workers = num_workers,
            gpu_count = gpu_count,
            chunk_size = chunk_size,
            permutect_docker = permutect_docker
    }

    call calling.IndexVCF as IndexAfterFiltering {
        input:
            unindexed_vcf = PermutectFiltering.output_vcf,
            gatk_docker = gatk_docker
    }

    output {
        # straight from Mutect2
        File? bamout = Mutect2.bamout
        File? bamout_index = Mutect2.bamout_index
        File mutect_stats = Mutect2.mutect_stats
        File permutect_contigs_table = Mutect2.permutect_contigs_table
        File permutect_read_groups_table = Mutect2.permutect_read_groups_table

        # training artifacts
        File train_tar = Preprocess.train_tar
        File training_tensorboard_tar = TrainPermutect.training_tensorboard_tar

        # final filtered calls
        File output_vcf = IndexAfterFiltering.vcf
        File output_vcf_idx = IndexAfterFiltering.vcf_index
        File calling_tensorboard_tar = PermutectFiltering.tensorboard_report
    }
}
|
||
# Runs `preprocess_dataset` from the Permutect docker image on a single training dataset
# and exposes the resulting `train.tar`.
#
# Inputs:
#   training_dataset  dataset file passed to --training_datasets
#   chunk_size        value passed to --chunk_size
#   source_label      optional; when set, passed as "--sources <label>"
#   permutect_docker  docker image that provides preprocess_dataset
#   mem               optional memory in GB (default 16); converted to MB for the runtime
#   preemptible, max_retries, disk_space, cpu
#                     optional runtime overrides (defaults 2, 0, 100, 1)
# Output:
#   train_tar         the train.tar written by the command
task Preprocess {
    input {
        File training_dataset
        Int chunk_size
        Int? source_label

        String permutect_docker
        Int? preemptible
        Int? max_retries
        Int? disk_space
        Int? cpu
        Int? mem
    }

    # mem is given in GB while the runtime memory attribute below is expressed in MB.
    # select_first unwraps the optional BEFORE the multiplication: arithmetic directly on
    # an Int? is rejected by strict WDL type checkers.
    Int machine_mem = select_first([mem, 16]) * 1000

    command <<<
        set -e

        # the "--sources" placeholder expands to nothing when source_label is unset
        preprocess_dataset --training_datasets "~{training_dataset}" --chunk_size ~{chunk_size} ~{"--sources " + source_label} --output train.tar
    >>>

    runtime {
        docker: permutect_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + " SSD"
        preemptible: select_first([preemptible, 2])
        maxRetries: select_first([max_retries, 0])
        cpu: select_first([cpu, 1])
    }

    output {
        File train_tar = "train.tar"
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
version 1.0 | ||
|
||
## NOTE: this is essentially copied from https://github.com/broadinstitute/warp/blob/develop/tasks/broad/DragenTasks.wdl | ||
## with minor modifications | ||
# Single-step wrapper workflow: forwards the reference and reads to the
# CalibrateDragstrModel task and re-exports the model file it produces.
workflow DragstrCalibration {
    input {
        # reference
        File ref_fasta
        File ref_fai
        File ref_dict

        # reads and their index
        File reads
        File reads_index
    }

    # every workflow input is passed through under the same name
    call CalibrateDragstrModel {
        input:
            reads = reads,
            reads_index = reads_index,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict
    }

    output {
        File dragstr_model = CalibrateDragstrModel.dragstr_model
    }
}
|
||
# Builds an STR table for the reference with `gatk ComposeSTRTableFile`, then runs
# `gatk CalibrateDragstrModel` on the reads with that table and exposes `params.dragstr`.
#
# Inputs:
#   ref_fasta, ref_fai, ref_dict  reference files (only ref_fasta is named in the command)
#   reads, reads_index            reads file and its index (only reads is named in the command)
#   docker                        GATK docker image
#   preemptible_tries             value for the preemptible runtime attribute
#   threads                       passed as --threads and also used as the cpu count
#   disk_space                    optional disk size for local-disk (default 100)
#   mem                           optional memory in GB (default 4); converted to MB
# Output:
#   dragstr_model                 the params.dragstr file written by CalibrateDragstrModel
task CalibrateDragstrModel {
    input {
        File ref_fasta
        File ref_fai
        File ref_dict
        File reads
        File reads_index

        String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0"
        Int preemptible_tries = 1
        Int threads = 4
        Int? disk_space
        Int? mem
    }

    String parallel_args = "--threads " + threads

    # mem is given in GB while the command and runtime memory values are in MB.
    # select_first unwraps the optional BEFORE the multiplication: arithmetic directly on
    # an Int? is rejected by strict WDL type checkers.
    Int machine_mem = select_first([mem, 4]) * 1000
    # leave 500 MB of headroom between the JVM heap and the machine memory
    Int command_mem = machine_mem - 500

    # All file inputs are flagged localization_optional, so the backend may hand the
    # command the original path instead of a local copy.
    # NOTE(review): ref_fai, ref_dict and reads_index are never named in the command;
    # presumably GATK locates them next to ref_fasta / reads -- confirm.
    parameter_meta {
        ref_fasta: {localization_optional: true}
        ref_fai: {localization_optional: true}
        ref_dict: {localization_optional: true}
        reads: {localization_optional: true}
        reads_index: {localization_optional: true}
    }

    command <<<
        # -e: abort as soon as a step fails, so a failed ComposeSTRTableFile cannot be
        #     followed by a calibration attempt on a missing or partial table
        # -x: echo each command for easier debugging
        set -ex

        gatk ComposeSTRTableFile \
            -R "~{ref_fasta}" \
            -O str_table.tsv

        gatk --java-options "-Xmx~{command_mem}m" CalibrateDragstrModel \
            -R "~{ref_fasta}" \
            -I "~{reads}" \
            -str str_table.tsv \
            -O params.dragstr \
            ~{parallel_args}
    >>>

    runtime {
        docker: docker
        disks: "local-disk " + select_first([disk_space, 100]) + " SSD"
        memory: machine_mem + " MB"
        preemptible: preemptible_tries
        cpu: threads
    }

    output {
        File dragstr_model = "params.dragstr"
    }
}
Oops, something went wrong.