diff --git a/.circleci/config.yml b/.circleci/config.yml index 93ebc0f45a..3d74a030ad 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -122,7 +122,7 @@ jobs: - run: name: Gene set enrichment analysis to generate GSVA scores - command: ./scripts/run_in_ci.sh bash "analyses/gene-set-enrichment-analysis/run-gsea.sh" + command: OPENPBTA_TESTING=1 ./scripts/run_in_ci.sh bash "analyses/gene-set-enrichment-analysis/run-gsea.sh" ################################ diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile index 7d0cd136df..f19d088683 100644 --- a/analyses/copy_number_consensus_call/Snakefile +++ b/analyses/copy_number_consensus_call/Snakefile @@ -1,13 +1,14 @@ ## Define the ending file(s) that we want -OUTPUT= expand("../../scratch/interim/{sample}.{caller}.{dupdel}.filtered3.bed", +OUTPUT= expand("../../scratch/interim/{sample}.{combined_caller}.{dupdel}.bed", sample=config["samples"], - caller=["freec", "cnvkit", "manta"], - dupdel=["dup", "del"]) + combined_caller=["manta_cnvkit", "manta_freec", "cnvkit_freec"], + dupdel=["dup", "del"]), ## Define the wildcards to use in the file names. wildcard_constraints: caller = "cnvkit|freec|manta", - dupdel = "dup|del" + dupdel = "dup|del", + combined_caller = "manta_cnvkit|manta_freec|cnvkit_freec" ## Define the first rule of the Snakefile. This rule determines what the final file is and which steps to be taken. rule all: @@ -136,3 +137,28 @@ rule restructure_column: threads: 1 shell: "python3 {input.script} --file {input.merged_bed} > {output.restructured_bed}" + + +rule compare_cnv_methods: + input: + ## Define the location of the input file and take the path/extension from the config file + script=os.path.join(config["scripts"], "compare_variant_calling_updated.py"), + cnvkit="../../scratch/interim/{sample}.cnvkit.{dupdel}.filtered3.bed", + freec="../../scratch/interim/{sample}.freec.{dupdel}.filtered3.bed", + manta="../../scratch/interim/{sample}.manta.{dupdel}.filtered3.bed" + + output: + ## Define the output files' names + manta_cnvkit="../../scratch/interim/{sample}.manta_cnvkit.{dupdel}.bed", + manta_freec="../../scratch/interim/{sample}.manta_freec.{dupdel}.bed", + cnvkit_freec="../../scratch/interim/{sample}.cnvkit_freec.{dupdel}.bed" + threads: 1 + params: + sample_name="{sample}" + shell: + "python3 {input.script} --manta {input.manta} --cnvkit {input.cnvkit} --freec {input.freec} " + "--manta_cnvkit {output.manta_cnvkit} --manta_freec {output.manta_freec} --cnvkit_freec {output.cnvkit_freec} " + "--sample {params.sample_name}" + + + diff --git a/analyses/copy_number_consensus_call/src/scripts/compare_variant_calling_updated.py b/analyses/copy_number_consensus_call/src/scripts/compare_variant_calling_updated.py new file mode 100644 index 0000000000..755671108b --- /dev/null +++ b/analyses/copy_number_consensus_call/src/scripts/compare_variant_calling_updated.py @@ -0,0 +1,319 @@ +################## PURPOSE ################# + +# This script is to do PAIR comparisons using the three 3 caller CNVs files +# This script takes in the 3 caller CNVs files and put their content in to a nested dictionary. +# There are 3 dictionaries within the bigger dictionary , each dictionary is the content of a singular caller method. + +# For-loops are used to loop through these 3 files to do pair comparison of the 3 files. +# We compare manta - cnvkit +# then manta - freec +# and finally cnvkit - freec +# We'll end up with 3 output files. + +# The principle for finding consensus CNVs is that if any two CNVs overlap at least 50% reciprocally, we will +# take the common section of those overlapping CNVs as the new consensus CNV. + +################# Output format ############### + +# column1 column2 column3 column4 column5 column6 column7 column8 column9 +# +# chr# consensus consensus raw manta coordinates raw cnvkit coordinates raw freec coordinates cnv type sample filename +# start_pos end_pos that made up this CNV that made up this CNV that made up this CNV name + +################# ASSUMPTION ############### + +# None of the files have overlapping segments within its own file. + +############################################ + + + + +# Imports in the pep8 order https://www.python.org/dev/peps/pep-0008/#imports +# Standard library +import argparse +import itertools +import os +import re +import sys + + +# Related third party +import numpy as np +import pandas as pd + + + +########################## DEFINE FUNCTIONS ############################### + + +def read_input_file(input_file): + """ Function to read content of input files and output to a dictionary""" + + ## Check to see if the CNVs bed file is empty, if so, make an empty variable + if os.stat(input_file).st_size == 0: + content_dict = {} + + ## If the file has content, then load in the content + else: + with open(input_file) as file: + + ## Strip the trailing spaces for each line + stripped_content = [line.rstrip() for line in file.readlines()] + + ## Split each line up by any white space between columns + fin_input_content = [re.split('\s+',line) for line in stripped_content] + + ## Create a list of unique keys/chromosome + dict_key = [] + [dict_key.append(i[0]) for i in fin_input_content] + dict_key_unique = np.unique(dict_key) + + ## Make the dictionary + content_dict = {el:[] for el in dict_key_unique} + + ## Populate the dictionary + {content_dict[line[0]].append(line[1:]) for line in fin_input_content} + + return content_dict + + +def generate_consensus(list1_name,list1,list2_name,list2): + """ Function to take 2 lists of CNVs as inputs, create consensus CNVs, + and output a list of consensus CNVs + + Keyword arguments: + list1_name -- Name of the 1st CNVs file + list1 -- The actual 1st CNVs file + list2_name -- Name of the 2nd CNVs file + list2 -- The actual 2nd CNVs file + """ + + + ## Define a variable to store the list of consensus CNVs + consensus_list = [] + + ## For every CNVs of list1, we look through all CNVs in list2 that overlaps and perform actions + ## The use of dictionary is to speed this process up.\ + ## Loop through all chromosomes of list1 + for chr_list1 in list1: + + ## For every CNVs in that chromosome, + for m in list1[chr_list1]: + + ## Assign the start_pos, end_pos, and copy number of list1 to variables + start_list1 = int(m[0]) + end_list1 = int(m[1]) + cnv_list1 = m[2] + + ## Make sure end > start + if start_list1 > end_list1: + start_list1, end_list1 = end_list1, start_list1 + + ## Assign variable for the CULMULATIVE coverage of the current CNV from list1 + coverage_list1 = 0 + + ## Assign variables to store information for list2 + list2_overlap_len = 0 + list2_total_len = 0 + list2_start_list = [] + list2_end_list = [] + list2_chr_str_end = '' + + + ## For every of the CNV in list one, we look over ALL CNVs in list 2 and make comparisons + if chr_list1 in list2: + for n in list2[chr_list1]: + + ## Assign start_pos, end_pos, and copy number of list2 to variables + start_list2 = int(n[0]) + end_list2 = int(n[1]) + cnv_list2 = n[2] + + ## Make sure end > start + if start_list2 > end_list2: + start_list2, end_list2 = end_list2, start_list2 + + ## For the current CNV in list1, if it overlaps with any CNV from list2 + if start_list1 <= end_list2 and end_list1 >= start_list2: + + ## Take the common region and store them in variables + start = max(start_list1,start_list2) + end = min(end_list1,end_list2) + + ## For list1's CNV, + ## if there are any overlaps at all - immediately add the coverage into the overall coverage for that specific CNV of list1 + coverage_list1 += (end - start + 1) / (end_list1 - start_list1 + 1) + + ## For list2's CNV + ## If the overlapping covers >= 50% of its length, + ## then we add in the start, end coordinate, total overlap length, and total len to different lists + ## This is done to account for 1 CNV from list1 overlapping with MULTIPLE CNVs from list2 + if (end - start +1) / (end_list2 - start_list2 + 1) >= 0.5: + + ## Add the overlapping length to a list of overlap length + list2_overlap_len += (end - start + 1) + + ## Add the total length to a list of total length + list2_total_len += (end_list2 - start_list2 + 1) + + ## Add start_pos and end_pos to a list to choose from later on + list2_start_list += [start_list2] + list2_end_list += [end_list2] + + ## Add the raw coordinates to a growing list that gets output later on to the final consensus file + list2_chr_str_end += str(cnv_list2) + + + + + ## The coverage of list1 is already calculated as the "coverage_list1" variable + ## ASSUMING there is no overlaping segments within 1 method + ## then it is safe to add/accumulate the coverage_list1 value. Refer to assumptions (line 23) + + ## Next we calculate the coverage of list2 since there might be more than 1 overlapping CNV from list2 + if list2_total_len > 0: + coverage_list2 = list2_overlap_len / list2_total_len + else: + coverage_list2 = 0 + + ## Check to see if the overlap is >= 50% RECIPROCALLY + if coverage_list1 >= 0.5 and coverage_list2 >= 0.5: + ## if it is 50% reciprocally, we chose from the list2 starts and ends + ## we take out the smallest start and biggest end + ## This maximize the CNV length from list2 + fin_start_list2 = min(list2_start_list) + fin_end_list2 = max(list2_end_list) + + ## Taking only the common region as the final segment. + ## Compare between list2 vs list1 start and end - take the common segment + chrom_start = max(fin_start_list2,start_list1) + chrom_end = min(fin_end_list2,end_list1) + + ## Format the output consensus CNV + ## Column 4 is manta's raw coordinates + ## Column 5 is cnvkit's raw coordinates + ## Column 6 is freec's raw coordinates + + ## if the files input are MANTA and CNVKIT, put info in column 4 and 5, 6th column is null + if (list1_name == 'manta' and list2_name == 'cnvkit') or (list1_name == 'cnvkit' and list2_name == 'manta'): + overlap_chrom = [chr_list1,str(chrom_start),str(chrom_end),str(cnv_list1).strip(','),list2_chr_str_end.strip(','),'NULL',m[-1]] + + ## if the files input are MANTA and FREEC, put info in column 4 and 6, 5th column is null + elif (list1_name == 'manta' and list2_name == 'freec') or (list1_name == 'freec' and list2_name == 'manta'): + overlap_chrom = [chr_list1,str(chrom_start),str(chrom_end),str(cnv_list1).strip(','),'NULL',list2_chr_str_end.strip(','),m[-1]] + + ## if the files input are CNVKIT and FREEC, put info in column 5 and 6, 4th column is null + elif (list1_name == 'cnvkit' and list2_name == 'freec') or (list1_name == 'freec' and list2_name == 'cnvkit'): + overlap_chrom = [chr_list1,str(chrom_start),str(chrom_end),'NULL',str(cnv_list1).strip(','),list2_chr_str_end.strip(','),m[-1]] + + ## Add the formatted consensus CNV into a growing list of consensus CNVs + consensus_list = consensus_list + [overlap_chrom] + + return consensus_list + + + +def save_to_file(output_file_content, output_path, sample_name): + """ Function that takes in content, output file path, and sample name to save the content to a file1 + + Keyword arguments: + output_file_content -- content of consensus + output_path -- path and name of the output file + sample_name -- sample name of the consensus CNVs. + """ + + ## Open up the file to write in it, the 'w' option overwrites the file if the file exists. + with open( output_path, 'w') as file: + + ## Add the sample name and file name to each line + single_name = os.path.basename(output_path) + + ## Loop through the output file and print line by line + for k in output_file_content: + + ## Join the sample_name and single_name(file name) to the CNV info + k.extend([sample_name, single_name,'\n']) + file.write('\t'.join(k)) + + sys.stderr.write('$$$ Write to file ' + str(output_path) + ' was sucessful\n') + + + +################### SCRIPT BODY ################################# + + + +## Define parser for the input file +parser = argparse.ArgumentParser(description="""This script takes in 3 bed files, each from one of the 3 callers + and find common CNVs between two files at a time.""") +parser.add_argument('--manta', required=True, + help='path to the manta file') +parser.add_argument('--cnvkit', required=True, + help='path to the cnvkit file') +parser.add_argument('--freec', required=True, + help='path to the freec file') +parser.add_argument('--manta_cnvkit', required=True, + help='path of the output consensus between manta and cnvkit') +parser.add_argument('--manta_freec', required=True, + help='path of the output consensus between manta and freec') +parser.add_argument('--cnvkit_freec', required=True, + help='path of the output consensus between cnvkit and freec') +parser.add_argument('--sample', default='no_sample', + help='sample name to use in the file') + +args = parser.parse_args() + + +## Define a list of input callers + +input_callers = ['manta', 'cnvkit', 'freec'] +input_content = dict() +for caller in input_callers: + ## Read in the input files as dictionaries + input_content[caller] = read_input_file(getattr(args, caller)) + + + + + +## Put the output file paths into their own lists that is to be iterated over +## This order is important +## Make a list for pairs of callers to make consensus for +caller_pairs = list(itertools.combinations(input_callers,2)) +# generate output list in the same order +output_files = [getattr(args, "_".join(callers)) for callers in caller_pairs] + +## Make a list for pairs of callers to make consensus for + + +## We have 3 items in input_content. Each item is the content of an input file +## Make a list to store the final files +fin_list =[] + +## Loop through list_index +for caller1, caller2 in caller_pairs: + + ## Compare 1st and 2nd file from list_of_input_contents + ## Then compare the 1st and 3rd + ## Then compare the 2nd and 3rd + list1 = input_content[caller1] + list2 = input_content[caller2] + + + ## Call the "generate_consensus" function to get consensus CNVs + consensus_list = generate_consensus(caller1,list1,caller2,list2) + + + + ## Add the consensus list into a final list that hold the + ## content of 3 files - manta_cnvkit , manta_freec, and cnvkit_freec + fin_list.append(consensus_list) + + + +## Print the output into files +for content, outfile in zip(fin_list, output_files): + + ## Call the "save_to_file" function to print each consensus content to a file. + save_to_file(content, outfile, args.sample) diff --git a/analyses/gene-set-enrichment-analysis/02-exploratory-gsea.Rmd b/analyses/gene-set-enrichment-analysis/02-exploratory-gsea.Rmd new file mode 100644 index 0000000000..255f3f4081 --- /dev/null +++ b/analyses/gene-set-enrichment-analysis/02-exploratory-gsea.Rmd @@ -0,0 +1,243 @@ +--- +title: "GSVA Score Modeling and Exploratory Analysis" +author: "Stephanie J. Spielman for ALSF CCDL" +date: '2020' +output: + html_document: + df_print: paged + toc: yes + html_notebook: + df_print: paged + toc: yes + toc_float: yes +params: + plot_ci: yes + is_ci: FALSE +--- + +### Purpose + +Models and explores GSVA scores. **This code is NOT completed and should be regarded as a work _in progress_.** + +### Usage + +To run this from the command line, use: +``` +Rscript -e "rmarkdown::render('analyses/gene-set-enrichment-analysis/02-exploratory-gsea.Rmd', clean = TRUE)" +``` +_This assumes you are in the top directory of the repository._ + +### Setup + +Load libraries and define certain constants: + +```{r, lib-load, warning=FALSE, message=FALSE} +###TODO: Should we just load tidyverse? All the core are used +library(tidyverse) +#library(dplyr) +#library(tidyr) +#library(readr) +#library(tibble) +#library(purrr) +#library(stringr) +#library(forcats) +#library(ggplot2) +library(broom) + + +# Magrittr pipe +`%>%` <- dplyr::`%>%` + +# Significance testing universal threshold +SIGNIFICANCE_THRESHOLD <- 0.01 + +# Are we testing? +if( !(params$is_ci %in% c(0,1)) & !(params$is_ci %in% c(TRUE, FALSE)) ){ + stop("\n\nERROR: The parameter `is_ci` should be 0/1 (or FALSE/TRUE).") +} +# Assigning params$is_ci to running_in_ci avoids a locked binding error +running_in_ci <- params$is_ci +if (running_in_ci == 0) running_in_ci <- FALSE +if (running_in_ci == 1) running_in_ci <- TRUE +``` + + +
+Next, define directories and load data files: +```{r, data-load, message=FALSE, warning=FALSE} +### Define directories +data_dir <- file.path("..", "..", "data") +results_dir <- "results" + +######### Define input files +## Metadata file (histologies/clinical data) +metadata_file <- file.path(data_dir, "pbta-histologies.tsv") + +## GSEA scores +scores_stranded_file <- file.path(results_dir, "gsva_scores_stranded.tsv") +scores_polya_file <- file.path(results_dir, "gsva_scores_polya.tsv") + +######## Define output files +#PENDING + + +######## Load input files +metadata <- readr::read_tsv(metadata_file) +scores_stranded <- readr::read_tsv(scores_stranded_file) +scores_polya <- readr::read_tsv(scores_polya_file) +``` + + + +### Begin modeling +```{r aov-tukey-gsea-hallmark} + +### Merge histology metadata with each set of gsea scores +scores_stranded <- scores_stranded %>% mutate(data_type = "stranded") +scores_polya <- scores_polya %>% mutate(data_type = "polya") +all_scores <- bind_rows(scores_stranded, scores_polya) %>% + mutate(data_type = factor(data_type), + hallmark_name = factor(hallmark_name)) + +metadata_with_gsva <- metadata %>% + filter(experimental_strategy == "RNA-Seq") %>% + inner_join(all_scores, by = "Kids_First_Biospecimen_ID" ) + +### If we are running in CI, we need to ensure enough levels for AOV. +### PolyA data generally does NOT have enough samples, so this should do it. +if (running_in_ci) +{ + metadata_with_gsva %>% + filter(data_type == "stranded") -> metadata_with_gsva +} + + + +### Defines a function for performing Anova/Tukey for identifying significant pathways +## TODO: Increase flexibility so that predictor does NOT NEED TO BE short_histology +gsva_histology_anova_tukey <- function(df) +{ + aov_fit <- aov(gsea_score ~ short_histology, data = df) + TukeyHSD(aov_fit) %>% + broom::tidy() %>% + dplyr::select(comparison, estimate, adj.p.value) %>% + dplyr::rename(pathway_score_difference = estimate, + tukey_p_value = adj.p.value) +} + + +## Conduct series of ANOVA and posthoc Tukey tests for assessing which short histology groups show differences within EACH pathway (hallmark) +number_of_tests <- metadata_with_gsva %>% + dplyr::select(hallmark_name, data_type) %>% + distinct() %>% + nrow() + +pathway_diffs <- metadata_with_gsva %>% + group_by(hallmark_name, data_type) %>% + nest() %>% + mutate(anova_tukey = map(data, gsva_histology_anova_tukey)) %>% + dplyr::select(-data) %>% + unnest(anova_tukey) %>% + ungroup() %>% + mutate(bonferroni_pvalue = tukey_p_value * number_of_tests, + bonferroni_pvalue = ifelse(bonferroni_pvalue >= 1, 1, bonferroni_pvalue), + significant_tukey = tukey_p_value <= SIGNIFICANCE_THRESHOLD, + significant_tukey_bonf = bonferroni_pvalue <= SIGNIFICANCE_THRESHOLD) ## TODO: Is this multiple correction reasonable? + + + +``` + +### Exploratory analysis + +> NOTE: #1-2 below are possibly/likely confounded by the wildly different sample sizes in polyA vs stranded expression data. +

+ +**1. Are there any pathway comparisons which are significant under stranded expression but not polyA expression?** +_Yes, 57 pathways are differently-significant between stranded and polyA expression data._ + +```{r, explore-data-types-sig, warning=FALSE} +if(!(running_in_ci)){ + # How many pathway comparisons DIFFER IN SIGNIFICANCE between expression data types? + pathway_diffs %>% + dplyr::select(hallmark_name, data_type, comparison, significant_tukey_bonf) %>% + spread(data_type, significant_tukey_bonf) %>% + filter(polya != stranded) %>% + ungroup()-> diff_sig_stranded_polya + head(diff_sig_stranded_polya) +} +``` + +The scores of pathways whose significant differs are about same order of magnitude, with a couple wild exceptions for __Medulloblastoma-HGAT comparsion__. Also reveals some differences in sign of score. + +```{r, explore-data-types-sig-2, warning=FALSE} +if(!(running_in_ci)){ + diff_sig_stranded_polya %>% + inner_join(pathway_diffs) %>% + dplyr::select(hallmark_name, comparison, data_type, pathway_score_difference) %>% + spread(data_type, pathway_score_difference) %>% + mutate(stranded_polya_score_ratio = stranded/polya) %>% + filter(abs(stranded_polya_score_ratio) > 4) +} +``` + +

+ +**2. Are there any significant pathway comparisons whose SIGN differs between stranded expression and polyA expression??** + +There are (as of v12 data release) **39** pathway comparisons which differ in sign between polyA and stranded expression, largely for comparisons between samples MB-CNS embryonal tumor but there is no single hallmark that stands out. +```{r, explore-data-types-sign, warning=FALSE} +if(!(running_in_ci)){ + # How many pathway comparisons DIFFER IN SIGNIFICANCE between data types? + pathway_diffs %>% + dplyr::select(hallmark_name, data_type, comparison, pathway_score_difference) %>% + mutate(score_sign = sign(pathway_score_difference)) %>% + dplyr::select(-pathway_score_difference) %>% + spread(data_type, score_sign) %>% + filter(polya != stranded) %>% + ungroup() -> diff_signs_data_type + + diff_signs_data_type %>% + count(comparison) + + diff_signs_data_type %>% + count(hallmark_name) +} +``` + +

+ +**3. Template visualizing your favorite pathway with your favorite short histology** + +Below is a proposed example visualization for seeing scores and significance for a particular histological type. + +```{r, viz-selected-pathway, fig.width=8, fig.height=6 } + +## TODO: Functionalize with pathway and histology as arguments +## For now as an example, visualize all the MB score differences for adipogenesis pathway. +pathway <- "HALLMARK_ADIPOGENESIS" # for example +histology <- "Medulloblastoma" +pathway_diffs %>% + filter(hallmark_name == pathway, + str_detect(comparison, histology)) %>% + mutate(change_sign = str_detect(comparison, paste0("-", histology)), ##### If histology is the SECOND in comparison, we'll need to change its sign for consistency. We don't want to be using separate here in case there are dashes in the histologies themselves. + pathway_score_difference = ifelse(change_sign, -1 * change_sign, pathway_score_difference), + compared_to = str_replace(comparison, paste0(histology,"-"), ""), ## Remove histology of interest + compared_to = str_replace(compared_to, paste0("-", histology), ""),) %>% ## either side removal + dplyr::select(data_type, pathway_score_difference, significant_tukey_bonf, compared_to) %>% + mutate(significant_tukey_bonf = factor(significant_tukey_bonf, levels=c(TRUE, FALSE))) %>% ## Better ordered with T first + ggplot(aes(x = fct_reorder(compared_to, pathway_score_difference), y = pathway_score_difference)) + + geom_col(aes(fill = significant_tukey_bonf)) + + facet_wrap(~data_type, ncol=2) + + geom_hline(yintercept=0) + + labs( + x = paste(histology, "Comparisons"), + y = "GSVA score difference", + fill = "Significant difference", + title = paste("GSVA score comparisons for", histology, "samples") + ) + + coord_flip() +``` + + + diff --git a/analyses/gene-set-enrichment-analysis/02-exploratory-gsea.html b/analyses/gene-set-enrichment-analysis/02-exploratory-gsea.html new file mode 100644 index 0000000000..58a72db606 --- /dev/null +++ b/analyses/gene-set-enrichment-analysis/02-exploratory-gsea.html @@ -0,0 +1,1900 @@ + + + + + + + + + + + + + + + +GSVA Score Modeling and Exploratory Analysis + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+ +
+ +
+

Purpose

+

Models and explores GSVA scores. This code is NOT completed and should be regarded as a work in progress.

+
+
+

Usage

+

To run this from the command line, use:

+
Rscript -e "rmarkdown::render('analyses/gene-set-enrichment-analysis/02-exploratory-gsea.Rmd', clean = TRUE)" 
+

This assumes you are in the top directory of the repository.

+
+
+

Setup

+

Load libraries and define certain constants:

+
###TODO: Should we just load tidyverse? All the core are used
+library(tidyverse)
+#library(dplyr)
+#library(tidyr)
+#library(readr)
+#library(tibble)
+#library(purrr)
+#library(stringr)
+#library(forcats)
+#library(ggplot2)
+library(broom)
+
+
+# Magrittr pipe
+`%>%` <- dplyr::`%>%`
+
+# Significance testing universal threshold
+SIGNIFICANCE_THRESHOLD   <- 0.01
+
+# Are we testing?
+if( !(params$is_ci %in% c(0,1)) & !(params$is_ci %in% c(TRUE, FALSE)) ){
+  stop("\n\nERROR: The parameter `is_ci` should be 0/1 (or FALSE/TRUE).")
+}
+# Does this get around the assignment problem?
+running_in_ci <- params$is_ci
+if (running_in_ci == 0) running_in_ci <- FALSE
+if (running_in_ci == 1) running_in_ci <- TRUE
+


Next, define directories and load data files:

+
### Define directories
+data_dir    <- file.path("..", "..", "data") 
+results_dir <- "results"
+
+######### Define input files
+## Metadata file (histologies/clinical data)
+metadata_file <- file.path(data_dir, "pbta-histologies.tsv")
+
+## GSEA scores
+scores_stranded_file <- file.path(results_dir, "gsva_scores_stranded.tsv")
+scores_polya_file <- file.path(results_dir, "gsva_scores_polya.tsv")
+
+######## Define output files
+#PENDING
+
+
+######## Load input files
+metadata        <- readr::read_tsv(metadata_file)
+scores_stranded <- readr::read_tsv(scores_stranded_file)
+scores_polya    <- readr::read_tsv(scores_polya_file)
+
+
+

Begin modeling

+
### Merge histology metadata with each set of gsea scores
+scores_stranded  <- scores_stranded %>% mutate(data_type = "stranded")
+scores_polya     <- scores_polya %>% mutate(data_type = "polya")
+all_scores      <- bind_rows(scores_stranded, scores_polya) %>%
+                        mutate(data_type    = factor(data_type),
+                        hallmark_name = factor(hallmark_name))
+
+metadata_with_gsva <- metadata %>%
+                        filter(experimental_strategy == "RNA-Seq") %>%
+                        inner_join(all_scores, by = "Kids_First_Biospecimen_ID" )
+
+### If we are running in CI, we need to ensure enough levels for AOV.
+### PolyA data generally does NOT have enough samples, so this should do it.
+if (running_in_ci)
+{
+  metadata_with_gsva %>% 
+    filter(data_type == "stranded") -> metadata_with_gsva
+}
+
+
+
+### Defines a function for performing Anova/Tukey for identifying significant pathways
+## TODO: Increase flexibility so that predictor does NOT NEED TO BE short_histology
+gsva_histology_anova_tukey <- function(df)
+{
+  aov_fit <- aov(gsea_score ~ short_histology, data = df)
+  TukeyHSD(aov_fit) %>%  
+    broom::tidy() %>%
+    dplyr::select(comparison, estimate, adj.p.value) %>%
+    dplyr::rename(pathway_score_difference = estimate,
+                  tukey_p_value            = adj.p.value) 
+}
+
+
+## Conduct series of ANOVA and posthoc Tukey tests for assessing which short histology groups show differences within EACH pathway (hallmark)
+number_of_tests <- metadata_with_gsva %>% 
+                    dplyr::select(hallmark_name, data_type) %>%
+                    distinct() %>% 
+                    nrow()
+
+pathway_diffs <- metadata_with_gsva %>%
+                    group_by(hallmark_name, data_type) %>%
+                    nest() %>%
+                    mutate(anova_tukey = map(data, gsva_histology_anova_tukey)) %>%
+                    dplyr::select(-data) %>%
+                    unnest(anova_tukey) %>%
+                    ungroup() %>%
+                    mutate(bonferroni_pvalue = tukey_p_value * number_of_tests,
+                           bonferroni_pvalue = ifelse(bonferroni_pvalue >= 1, 1, bonferroni_pvalue), 
+                           significant_tukey = tukey_p_value <= SIGNIFICANCE_THRESHOLD,
+                           significant_tukey_bonf = bonferroni_pvalue <= SIGNIFICANCE_THRESHOLD) ## TODO: Is this multiple correction reasonable? 
+
+
+

Exploratory analysis

+
+

NOTE: #1-2 below are possibly/likely confounded by the wildly different sample sizes in polyA vs stranded expression data.

+
+

1. Are there any pathway comparisons which are significant under stranded expression but not polyA expression? Yes, 57 pathways are differently-significant between stranded and polyA expression data.

+
if(!(running_in_ci)){
+  # How many pathway comparisons DIFFER IN SIGNIFICANCE between expression data types?
+  pathway_diffs %>% 
+    dplyr::select(hallmark_name, data_type, comparison, significant_tukey_bonf) %>%
+    spread(data_type, significant_tukey_bonf) %>% 
+    filter(polya != stranded) %>%
+    ungroup()-> diff_sig_stranded_polya
+  head(diff_sig_stranded_polya)
+}
+
+ +
+

The scores of pathways whose significant differs are about same order of magnitude, with a couple wild exceptions for Medulloblastoma-HGAT comparsion. Also reveals some differences in sign of score.

+
if(!(running_in_ci)){
+  diff_sig_stranded_polya %>%
+    inner_join(pathway_diffs) %>%
+    dplyr::select(hallmark_name, comparison, data_type, pathway_score_difference) %>%
+    spread(data_type, pathway_score_difference) %>%
+    mutate(stranded_polya_score_ratio = stranded/polya) %>% 
+    filter(abs(stranded_polya_score_ratio) > 4)
+}
+
## Joining, by = c("hallmark_name", "comparison")
+
+ +
+



+

2. Are there any significant pathway comparisons whose SIGN differs between stranded expression and polyA expression??

+

There are (as of v12 data release) 39 pathway comparisons which differ in sign between polyA and stranded expression, largely for comparisons between samples MB-CNS embryonal tumor but there is no single hallmark that stands out.

+
if(!(running_in_ci)){
+  # How many pathway comparisons DIFFER IN SIGNIFICANCE between data types?
+  pathway_diffs %>% 
+    dplyr::select(hallmark_name, data_type, comparison, pathway_score_difference) %>%
+    mutate(score_sign = sign(pathway_score_difference)) %>%
+    dplyr::select(-pathway_score_difference) %>%
+    spread(data_type, score_sign) %>% 
+    filter(polya != stranded) %>%
+    ungroup() -> diff_signs_data_type
+  
+  diff_signs_data_type %>% 
+    count(comparison) 
+  
+  diff_signs_data_type %>% 
+    count(hallmark_name)
+}
+
+ +
+



+

3. Template visualizing your favorite pathway with your favorite short histology

+

Below is a proposed example visualization for seeing scores and significance for a particular histological type.

+
## TODO: Functionalize with pathway and histology as arguments
+## For now as an example, visualize all the MB score differences for adipogenesis pathway.
+pathway <- "HALLMARK_ADIPOGENESIS" # for example
+histology <- "Medulloblastoma"
+pathway_diffs %>%
+  filter(hallmark_name == pathway, 
+          str_detect(comparison, histology)) %>%
+  mutate(change_sign = str_detect(comparison, paste0("-", histology)),   ##### If histology is the SECOND in comparison, we'll need to change its sign for consistency. We don't want to be using separate here in case there are dashes in the histologies themselves.
+         pathway_score_difference = ifelse(change_sign, -1 * change_sign, pathway_score_difference),
+         compared_to = str_replace(comparison, paste0(histology,"-"), ""), ## Remove histology of interest
+         compared_to = str_replace(compared_to, paste0("-", histology), ""),) %>% ## either side removal
+  dplyr::select(data_type, pathway_score_difference, significant_tukey_bonf, compared_to) %>%
+  mutate(significant_tukey_bonf = factor(significant_tukey_bonf, levels=c(TRUE, FALSE))) %>% ## Better ordered with T first
+  ggplot(aes(x = fct_reorder(compared_to, pathway_score_difference), y = pathway_score_difference)) + 
+    geom_col(aes(fill = significant_tukey_bonf)) + 
+    facet_wrap(~data_type, ncol=2) + 
+    geom_hline(yintercept=0) + 
+    labs(
+      x = paste(histology, "Comparisons"),
+      y = "GSVA score difference", 
+      fill = "Significant difference",
+      title = paste("GSVA score comparisons for", histology, "samples")
+    ) + 
+  coord_flip()
+

+ +
+ + + + +
+ + + + + + + + diff --git a/analyses/gene-set-enrichment-analysis/README.md b/analyses/gene-set-enrichment-analysis/README.md index 31b9b2e07f..4b07dad0be 100644 --- a/analyses/gene-set-enrichment-analysis/README.md +++ b/analyses/gene-set-enrichment-analysis/README.md @@ -8,7 +8,9 @@ Written by Stephanie J. Spielman to supercede previous analyses in [`ssgsea-hall ## Folder Content -+ `01-conduct-gsea-analysis.Rmd` performs the GSVA analysis using RSEM FPKM expression data. Results are saved the TSV file `results/gsea_scores.tsv`. ++ `01-conduct-gsea-analysis.R` performs the GSVA analysis using RSEM FPKM expression data for both stranded and polyA data. Results are saved in `results/` TSV files. + ++ `02-gsea-explore.Rmd` performs some initial exploratory analyses on GSVA scores including modeling and visualization. + `results/gsva_scores_stranded.tsv` represents GSVA scores calculated from `pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds` (data release v12) + File created with: `Rscript --vanilla 01-conduct-gsea-analysis.R --input pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds --output gsva_scores_stranded.tsv` diff --git a/analyses/gene-set-enrichment-analysis/run-gsea.sh b/analyses/gene-set-enrichment-analysis/run-gsea.sh index 3465c6a582..304519857c 100644 --- a/analyses/gene-set-enrichment-analysis/run-gsea.sh +++ b/analyses/gene-set-enrichment-analysis/run-gsea.sh @@ -1,9 +1,14 @@ ######################################################################### # Stephanie J. Spielman for ALSF CCDL 2020 # -# Run the GSEA pipeline, currently just `01-conduct-gsea-analysis.R` +# Run the GSEA pipeline: +## 1. `01-conduct-gsea-analysis.R` to calculate scores +## 2. `02-exploratory-gsea.Rmd` to explore the scores, lightly for now # -# Usage: bash run-gsea.sh +# Usage: bash run-gsea.sh +# +# Takes one environment variable, `OPENPBTA_TESTING`, which is 1 for running +# samples in CI for testing, 0 for running the full dataset (Default) # ######################################################################### @@ -12,6 +17,8 @@ set -e set -o pipefail +IS_CI=${OPENPBTA_TESTING:-0} + # This script should always run as if it were being called from # the directory it lives in. script_directory="$(perl -e 'use File::Basename; @@ -30,3 +37,7 @@ Rscript --vanilla 01-conduct-gsea-analysis.R --input ${INPUT_FILE} --output ${OU INPUT_FILE="pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds" OUTPUT_FILE="gsva_scores_stranded.tsv" Rscript --vanilla 01-conduct-gsea-analysis.R --input ${INPUT_FILE} --output ${OUTPUT_FILE} + + +######## Exploratory analysis of GSVA scores ############ +Rscript -e "rmarkdown::render('02-exploratory-gsea.Rmd', clean = TRUE, params=list(is_ci = ${IS_CI}))" diff --git a/analyses/sex-prediction-from-RNASeq/03-evaluate_model.R b/analyses/sex-prediction-from-RNASeq/03-evaluate_model.R index f9341c3650..923a8a4cd1 100644 --- a/analyses/sex-prediction-from-RNASeq/03-evaluate_model.R +++ b/analyses/sex-prediction-from-RNASeq/03-evaluate_model.R @@ -178,7 +178,7 @@ saveRDS(cm, cm_file) # The underlying cause should be investigated more thoroughly, as we would # like the module code to run reproducibly, e.g., get the same labels given # the same seed. -if (length(levels(cm_set$obs)) > 1) { +if (length(levels(cm_set$pred)) > 1) { two_class_summary <- twoClassSummary(cm_set, lev = levels(cm_set$obs)) cat("\n\n") two_class_summary