Merge branch 'master' into hgg-molecular-subtyping-data-prep

AlexsLemonade · Jan 4, 2020 · e5e4cf2 · e5e4cf2
2 parents 8bdd4c6 + 4a3bb7e
commit e5e4cf2
Show file tree

Hide file tree

Showing 8 changed files with 2,510 additions and 9 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -122,7 +122,7 @@ jobs:
 
       - run:
           name: Gene set enrichment analysis to generate GSVA scores
-          command: ./scripts/run_in_ci.sh bash "analyses/gene-set-enrichment-analysis/run-gsea.sh" 
+          command: OPENPBTA_TESTING=1 ./scripts/run_in_ci.sh bash "analyses/gene-set-enrichment-analysis/run-gsea.sh" 
 
 
          ################################

diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile
@@ -1,13 +1,14 @@
 ## Define the ending file(s) that we want
-OUTPUT= expand("../../scratch/interim/{sample}.{caller}.{dupdel}.filtered3.bed",
+OUTPUT= expand("../../scratch/interim/{sample}.{combined_caller}.{dupdel}.bed",
                sample=config["samples"], 
-               caller=["freec", "cnvkit", "manta"],
-               dupdel=["dup", "del"])
+               combined_caller=["manta_cnvkit", "manta_freec", "cnvkit_freec"],
+               dupdel=["dup", "del"]),
 
 ## Define the wildcards to use in the file names. 
 wildcard_constraints:
   caller = "cnvkit|freec|manta",
-  dupdel = "dup|del"
+  dupdel = "dup|del",
+  combined_caller = "manta_cnvkit|manta_freec|cnvkit_freec"
 
 ## Define the first rule of the Snakefile. This rule determines what the final file is and which steps to be taken.
 rule all:
@@ -136,3 +137,28 @@ rule restructure_column:
     threads: 1
     shell:
         "python3 {input.script} --file {input.merged_bed} > {output.restructured_bed}"
+
+
+rule compare_cnv_methods:
+    input:
+        ## Define the location of the input file and take the path/extension from the config file
+        script=os.path.join(config["scripts"], "compare_variant_calling_updated.py"),
+        cnvkit="../../scratch/interim/{sample}.cnvkit.{dupdel}.filtered3.bed",
+        freec="../../scratch/interim/{sample}.freec.{dupdel}.filtered3.bed",
+        manta="../../scratch/interim/{sample}.manta.{dupdel}.filtered3.bed"
+
+    output:
+        ## Define the output files' names
+        manta_cnvkit="../../scratch/interim/{sample}.manta_cnvkit.{dupdel}.bed",
+        manta_freec="../../scratch/interim/{sample}.manta_freec.{dupdel}.bed",
+        cnvkit_freec="../../scratch/interim/{sample}.cnvkit_freec.{dupdel}.bed"
+    threads: 1
+    params:
+        sample_name="{sample}"
+    shell:
+        "python3 {input.script} --manta {input.manta} --cnvkit {input.cnvkit} --freec {input.freec} "
+        "--manta_cnvkit {output.manta_cnvkit} --manta_freec {output.manta_freec} --cnvkit_freec {output.cnvkit_freec} "
+        "--sample {params.sample_name}"
+
+
+
diff --git a/analyses/copy_number_consensus_call/src/scripts/compare_variant_calling_updated.py b/analyses/copy_number_consensus_call/src/scripts/compare_variant_calling_updated.py
@@ -0,0 +1,319 @@
+################## PURPOSE #################
+
+# This script is to do PAIR comparisons using the three 3 caller CNVs files
+# This script takes in the 3 caller CNVs files and put their content in to a nested dictionary.
+# There are 3 dictionaries within the bigger dictionary , each dictionary is the content of a singular caller method.
+
+# For-loops are used to loop through these 3 files to do pair comparison of the 3 files.
+# We compare manta - cnvkit
+# then  manta - freec
+# and finally cnvkit - freec
+# We'll end up with 3 output files.
+
+# The principle for finding consensus CNVs is that if any two CNVs overlap at least 50% reciprocally, we will
+# take the common section of those overlapping CNVs as the new consensus CNV.
+
+################# Output format ###############
+
+#   column1    column2     column3     column4                     column5                     column6                  column7     column8     column9
+#
+#   chr#       consensus   consensus   raw manta coordinates       raw cnvkit coordinates      raw freec coordinates    cnv type    sample      filename
+#              start_pos   end_pos     that made up this CNV       that made up this CNV       that made up this CNV                name
+
+################# ASSUMPTION ###############
+
+# None of the files have overlapping segments within its own file.
+
+############################################
+
+
+
+
+# Imports in the pep8 order https://www.python.org/dev/peps/pep-0008/#imports
+# Standard library
+import argparse
+import itertools
+import os
+import re
+import sys
+
+
+# Related third party
+import numpy as np
+import pandas as pd
+
+
+
+########################## DEFINE FUNCTIONS ###############################
+
+
+def read_input_file(input_file):
+    """ Function to read content of input files and output to a dictionary"""
+
+    ## Check to see if the CNVs bed file is empty, if so, make an empty variable
+    if os.stat(input_file).st_size == 0:
+        content_dict = {}
+
+    ## If the file has content, then load in the content
+    else:
+        with open(input_file) as file:
+
+            ## Strip the trailing spaces for each line
+            stripped_content = [line.rstrip() for line in file.readlines()]
+
+            ## Split each line up by any white space between columns
+            fin_input_content = [re.split('\s+',line) for line in stripped_content]
+
+            ## Create a list of unique keys/chromosome
+            dict_key = []
+            [dict_key.append(i[0]) for i in fin_input_content]
+            dict_key_unique = np.unique(dict_key)
+
+            ## Make the dictionary
+            content_dict = {el:[] for el in dict_key_unique}
+
+            ## Populate the dictionary
+            {content_dict[line[0]].append(line[1:]) for line in fin_input_content}
+
+    return content_dict
+
+
+def generate_consensus(list1_name,list1,list2_name,list2):
+    """ Function to take 2 lists of CNVs as inputs, create consensus CNVs,
+    and output a list of consensus CNVs
+
+    Keyword arguments:
+    list1_name -- Name of the 1st CNVs file
+    list1      -- The actual 1st CNVs file
+    list2_name -- Name of the 2nd CNVs file
+    list2      -- The actual 2nd CNVs file
+    """
+
+
+    ## Define a variable to store the list of consensus CNVs
+    consensus_list = []
+
+    ## For every CNVs of list1, we look through all CNVs in list2 that overlaps and perform actions
+    ## The use of dictionary is to speed this process up.\
+    ## Loop through all chromosomes of list1
+    for chr_list1 in list1:
+
+        ## For every CNVs in that chromosome,
+        for m in list1[chr_list1]:
+
+            ## Assign the start_pos, end_pos, and copy number of list1 to variables
+            start_list1 = int(m[0])
+            end_list1 = int(m[1])
+            cnv_list1 = m[2]
+
+            ## Make sure end > start
+            if start_list1 > end_list1:
+                start_list1, end_list1 = end_list1, start_list1
+
+            ## Assign variable for the CULMULATIVE coverage of the current CNV from list1
+            coverage_list1 = 0
+
+            ## Assign variables to store information for list2
+            list2_overlap_len = 0
+            list2_total_len = 0
+            list2_start_list = []
+            list2_end_list = []
+            list2_chr_str_end = ''
+
+
+            ## For every of the CNV in list one, we look over ALL CNVs in list 2 and make comparisons
+            if chr_list1 in list2:
+                for n in list2[chr_list1]:
+
+                    ## Assign start_pos, end_pos, and copy number of list2 to variables
+                    start_list2 = int(n[0])
+                    end_list2 = int(n[1])
+                    cnv_list2 = n[2]
+
+                    ## Make sure end > start
+                    if start_list2 > end_list2:
+                        start_list2, end_list2 = end_list2, start_list2
+
+                    ## For the current CNV in list1, if it overlaps with any CNV from list2
+                    if start_list1 <= end_list2 and end_list1 >= start_list2:
+
+                        ## Take the common region and store them in variables
+                        start = max(start_list1,start_list2)
+                        end = min(end_list1,end_list2)
+
+                        ## For list1's CNV,
+                        ## if there are any overlaps at all - immediately add the coverage into the overall coverage for that specific CNV of list1
+                        coverage_list1 += (end - start + 1) / (end_list1 - start_list1 + 1)
+
+                        ## For list2's CNV
+                        ## If the overlapping covers >= 50% of its length,
+                        ## then we add in the start, end coordinate, total overlap length, and total len to different lists
+                        ## This is done to account for 1 CNV from list1 overlapping with MULTIPLE CNVs from list2
+                        if (end - start +1) / (end_list2 - start_list2 + 1) >= 0.5:
+
+                            ## Add the overlapping length to a list of overlap length
+                            list2_overlap_len += (end - start + 1)
+
+                            ## Add the total length to a list of total length
+                            list2_total_len += (end_list2 - start_list2 + 1)
+
+                            ## Add start_pos and end_pos to a list to choose from later on
+                            list2_start_list += [start_list2]
+                            list2_end_list += [end_list2]
+
+                            ## Add the raw coordinates to a growing list that gets output later on to the final consensus file
+                            list2_chr_str_end += str(cnv_list2)
+
+
+
+
+                ## The coverage of list1 is already calculated as the "coverage_list1" variable
+                ## ASSUMING there is no overlaping segments within 1 method
+                ## then it is safe to add/accumulate the coverage_list1 value. Refer to assumptions (line 23)
+
+                ## Next we calculate the coverage of list2 since there might be more than 1 overlapping CNV from list2
+                if list2_total_len > 0:
+                    coverage_list2 = list2_overlap_len / list2_total_len
+                else:
+                    coverage_list2 = 0
+
+                ## Check to see if the overlap is >= 50% RECIPROCALLY
+                if coverage_list1 >= 0.5 and coverage_list2 >= 0.5:
+                    ## if it is 50% reciprocally, we chose from the list2 starts and ends
+                    ## we take out the smallest start and biggest end
+                    ## This maximize the CNV length from list2
+                    fin_start_list2 = min(list2_start_list)
+                    fin_end_list2 = max(list2_end_list)
+
+                    ## Taking only the common region as the final segment.
+                    ## Compare between list2 vs list1 start and end - take the common segment
+                    chrom_start = max(fin_start_list2,start_list1)
+                    chrom_end = min(fin_end_list2,end_list1)
+
+                    ## Format the output consensus CNV
+                    ## Column 4 is manta's raw coordinates
+                    ## Column 5 is cnvkit's raw coordinates
+                    ## Column 6 is freec's raw coordinates
+
+                    ## if the files input are MANTA and CNVKIT, put info in column 4 and 5, 6th column is null
+                    if (list1_name == 'manta' and list2_name == 'cnvkit') or (list1_name == 'cnvkit' and list2_name == 'manta'):
+                        overlap_chrom = [chr_list1,str(chrom_start),str(chrom_end),str(cnv_list1).strip(','),list2_chr_str_end.strip(','),'NULL',m[-1]]
+
+                    ## if the files input are MANTA and FREEC, put info in column 4 and 6, 5th column is null
+                    elif (list1_name == 'manta' and list2_name == 'freec') or (list1_name == 'freec' and list2_name == 'manta'):
+                        overlap_chrom = [chr_list1,str(chrom_start),str(chrom_end),str(cnv_list1).strip(','),'NULL',list2_chr_str_end.strip(','),m[-1]]
+
+                    ## if the files input are CNVKIT and FREEC, put info in column 5 and 6, 4th column is null
+                    elif (list1_name == 'cnvkit' and list2_name == 'freec') or (list1_name == 'freec' and list2_name == 'cnvkit'):
+                        overlap_chrom = [chr_list1,str(chrom_start),str(chrom_end),'NULL',str(cnv_list1).strip(','),list2_chr_str_end.strip(','),m[-1]]
+
+                    ## Add the formatted consensus CNV into a growing list of consensus CNVs
+                    consensus_list = consensus_list + [overlap_chrom]
+
+    return consensus_list
+
+
+
+def save_to_file(output_file_content, output_path, sample_name):
+    """ Function that takes in content, output file path, and sample name to save the content to a file1
+
+    Keyword arguments:
+    output_file_content -- content of consensus
+    output_path         -- path and name of the output file
+    sample_name         -- sample name of the consensus CNVs.
+    """
+
+    ## Open up the file to write in it, the 'w' option overwrites the file if the file exists.
+    with open( output_path, 'w') as file:
+
+        ## Add the sample name and file name to each line
+        single_name = os.path.basename(output_path)
+
+        ## Loop through the output file and print line by line
+        for k in output_file_content:
+
+            ## Join the sample_name and single_name(file name) to the CNV info
+            k.extend([sample_name, single_name,'\n'])
+            file.write('\t'.join(k))
+
+    sys.stderr.write('$$$ Write to file ' + str(output_path) + ' was sucessful\n')
+
+
+
+################### SCRIPT BODY #################################
+
+
+
+## Define parser for the input file
+parser = argparse.ArgumentParser(description="""This script takes in 3 bed files, each from one of the 3 callers
+                                                 and find common CNVs between two files at a time.""")
+parser.add_argument('--manta', required=True,
+                    help='path to the manta file')
+parser.add_argument('--cnvkit', required=True,
+                    help='path to the cnvkit file')
+parser.add_argument('--freec', required=True,
+                    help='path to the freec file')
+parser.add_argument('--manta_cnvkit', required=True,
+                    help='path of the output consensus between manta and cnvkit')
+parser.add_argument('--manta_freec', required=True,
+                    help='path of the output consensus between manta and freec')
+parser.add_argument('--cnvkit_freec', required=True,
+                    help='path of the output consensus between cnvkit and freec')
+parser.add_argument('--sample', default='no_sample',
+                    help='sample name to use in the file')
+
+args = parser.parse_args()
+
+
+## Define a list of input callers
+
+input_callers = ['manta', 'cnvkit', 'freec']
+input_content = dict()
+for caller in input_callers:
+    ## Read in the input files as dictionaries
+    input_content[caller] = read_input_file(getattr(args, caller))
+
+
+
+
+
+## Put the output file paths into their own lists that is to be iterated over
+## This order is important
+## Make a list for pairs of callers to make consensus for
+caller_pairs = list(itertools.combinations(input_callers,2))
+# generate output list in the same order
+output_files = [getattr(args, "_".join(callers)) for callers in caller_pairs]
+
+## Make a list for pairs of callers to make consensus for
+
+
+## We have 3 items in input_content. Each item is the content of an input file
+## Make a list to store the final files
+fin_list =[]
+
+## Loop through list_index
+for caller1, caller2 in caller_pairs:
+
+    ## Compare 1st and 2nd file from list_of_input_contents
+    ## Then compare the 1st and 3rd
+    ## Then compare the 2nd and 3rd
+    list1 = input_content[caller1]
+    list2 = input_content[caller2]
+
+
+    ## Call the "generate_consensus" function to get consensus CNVs
+    consensus_list = generate_consensus(caller1,list1,caller2,list2)
+
+
+
+    ## Add the consensus list into a final list that hold the
+    ## content of 3 files - manta_cnvkit , manta_freec, and cnvkit_freec
+    fin_list.append(consensus_list)
+
+
+
+## Print the output into files
+for content, outfile in zip(fin_list, output_files):
+
+    ## Call the "save_to_file" function to print each consensus content to a file.
+    save_to_file(content, outfile, args.sample)