Updates to generate v15 CI files (#575)

* Update to accommodate v15
* Add biospecimen.RDS

Co-authored-by: Candace Savonen <cansav09@gmail.com>
Co-authored-by: jashapiro <jashapiro@gmail.com>
AlexsLemonade · Mar 2, 2020 · 988f12d · 988f12d
1 parent 661f644
commit 988f12d
Show file tree

Hide file tree

Showing 4 changed files with 11 additions and 4 deletions.
diff --git a/analyses/create-subset-files/01-get_biospecimen_identifiers.R b/analyses/create-subset-files/01-get_biospecimen_identifiers.R
@@ -16,7 +16,7 @@
 #     consideration. This number will be 10% of num_matched.
 #   - We include (and hardcode) a set of biospecimen IDs for samples that have
 #     TP53 and NF1 mutations that meet the criteria in the tp53_nf1_module and
-#     are represented in the stranded RNA-seq dataset. 
+#     are represented in the stranded RNA-seq dataset.
 #     See 00-enrich-positive-examples for more information.
 #
 # EXAMPLE USAGE:
@@ -73,6 +73,9 @@ get_biospecimen_ids <- function(filename, id_mapping_df) {
     } else {
       biospecimen_ids <- unique(cnv_file$ID)
     }
+  } else if (grepl("consensus_seg_annotated", filename)) {
+    annotated_cn_file <- read_tsv(filename)
+    biospecimen_ids <- unique(annotated_cn_file$biospecimen_id)
   } else if (grepl("pbta-fusion", filename)) {
     fusion_file <- read_tsv(filename)
     # the biospecimen IDs in the filtered/prioritized fusion list included with
@@ -127,7 +130,7 @@ option_list <- list(
   make_option(
     c("-r", "--supported_string"),
     type = "character",
-    default = "pbta-snv|pbta-cnv|pbta-fusion|pbta-isoform|pbta-sv|pbta-gene|cnv_consensus",
+    default = "pbta-snv|pbta-cnv|pbta-fusion|pbta-isoform|pbta-sv|pbta-gene|consensus_seg_annotated",
     help = "string for pattern matching used to subset to only supported files"
   ),
   make_option(

diff --git a/analyses/create-subset-files/02-subset_files.R b/analyses/create-subset-files/02-subset_files.R
@@ -96,7 +96,11 @@ subset_files <- function(filename, biospecimen_ids, output_directory) {
     cnv_file %>%
       dplyr::filter(!!rlang::sym(biospecimen_column) %in% biospecimen_ids) %>%
       readr::write_tsv(output_file)
-
+  } else if (grepl("consensus_seg_annotated", filename)) {
+    annotated_cn_file <- readr::read_tsv(filename)
+    annotated_cn_file %>%
+      dplyr::filter(biospecimen_id %in% biospecimen_ids) %>%
+      readr::write_tsv(output_file)
   } else if (grepl("pbta-fusion", filename)) {
     # original files contain the biospecimen IDs in a column called 'tumor_id',
     # the filtered/prioritized list biospecimen IDs are in 'Sample'

diff --git a/analyses/create-subset-files/biospecimen_ids_for_subset.RDS b/analyses/create-subset-files/biospecimen_ids_for_subset.RDS
diff --git a/analyses/create-subset-files/create_subset_files.sh b/analyses/create-subset-files/create_subset_files.sh
@@ -7,7 +7,7 @@ set -o pipefail
 
 # Set defaults for release and biospecimen file name
 BIOSPECIMEN_FILE=${BIOSPECIMEN_FILE:-biospecimen_ids_for_subset.RDS}
-RELEASE=${RELEASE:-release-v14-20200203}
+RELEASE=${RELEASE:-release-v15-20200228}
 NUM_MATCHED=${NUM_MATCHED:-15}
 
 # This option controls whether or not the two larger MAF files are skipped as