diff --git a/analyses/create-subset-files/01-get_biospecimen_identifiers.R b/analyses/create-subset-files/01-get_biospecimen_identifiers.R index a75a39da32..a4b1195330 100644 --- a/analyses/create-subset-files/01-get_biospecimen_identifiers.R +++ b/analyses/create-subset-files/01-get_biospecimen_identifiers.R @@ -16,7 +16,7 @@ # consideration. This number will be 10% of num_matched. # - We include (and hardcode) a set of biospecimen IDs for samples that have # TP53 and NF1 mutations that meet the criteria in the tp53_nf1_module and -# are represented in the stranded RNA-seq dataset. +# are represented in the stranded RNA-seq dataset. # See 00-enrich-positive-examples for more information. # # EXAMPLE USAGE: @@ -73,6 +73,9 @@ get_biospecimen_ids <- function(filename, id_mapping_df) { } else { biospecimen_ids <- unique(cnv_file$ID) } + } else if (grepl("consensus_seg_annotated", filename)) { + annotated_cn_file <- read_tsv(filename) + biospecimen_ids <- unique(annotated_cn_file$biospecimen_id) } else if (grepl("pbta-fusion", filename)) { fusion_file <- read_tsv(filename) # the biospecimen IDs in the filtered/prioritize fusion list included with @@ -127,7 +130,7 @@ option_list <- list( make_option( c("-r", "--supported_string"), type = "character", - default = "pbta-snv|pbta-cnv|pbta-fusion|pbta-isoform|pbta-sv|pbta-gene|cnv_consensus", + default = "pbta-snv|pbta-cnv|pbta-fusion|pbta-isoform|pbta-sv|pbta-gene|consensus_seg_annotated", help = "string for pattern matching used to subset to only supported files" ), make_option( diff --git a/analyses/create-subset-files/02-subset_files.R b/analyses/create-subset-files/02-subset_files.R index 1b2ae1d8e7..759cf6cf68 100644 --- a/analyses/create-subset-files/02-subset_files.R +++ b/analyses/create-subset-files/02-subset_files.R @@ -96,7 +96,11 @@ subset_files <- function(filename, biospecimen_ids, output_directory) { cnv_file %>% dplyr::filter(!!rlang::sym(biospecimen_column) %in% biospecimen_ids) %>% readr::write_tsv(output_file) - + } else if (grepl("consensus_seg_annotated", filename)) { + annotated_cn_file <- readr::read_tsv(filename) + annotated_cn_file %>% + dplyr::filter(biospecimen_id %in% biospecimen_ids) %>% + readr::write_tsv(output_file) } else if (grepl("pbta-fusion", filename)) { # original files contain the biospecimen IDs in a column called 'tumor_id', # the filtered/prioritized list biospecimen IDs are in 'Sample' diff --git a/analyses/create-subset-files/biospecimen_ids_for_subset.RDS b/analyses/create-subset-files/biospecimen_ids_for_subset.RDS index a7bae9e69f..2849d52a0f 100644 Binary files a/analyses/create-subset-files/biospecimen_ids_for_subset.RDS and b/analyses/create-subset-files/biospecimen_ids_for_subset.RDS differ diff --git a/analyses/create-subset-files/create_subset_files.sh b/analyses/create-subset-files/create_subset_files.sh index 06e21c2e14..d169282e60 100755 --- a/analyses/create-subset-files/create_subset_files.sh +++ b/analyses/create-subset-files/create_subset_files.sh @@ -7,7 +7,7 @@ set -o pipefail # Set defaults for release and biospecimen file name BIOSPECIMEN_FILE=${BIOSPECIMEN_FILE:-biospecimen_ids_for_subset.RDS} -RELEASE=${RELEASE:-release-v14-20200203} +RELEASE=${RELEASE:-release-v15-20200228} NUM_MATCHED=${NUM_MATCHED:-15} # This option controls whether or not the two larger MAF files are skipped as