From 3f55855590fd7f1afc352335143acba001683a29 Mon Sep 17 00:00:00 2001 From: Duong Date: Wed, 18 Dec 2019 03:51:55 -0500 Subject: [PATCH 1/6] add to Snakefile --- analyses/copy_number_consensus_call/Snakefile | 39 ++++++++++- .../src/scripts/restructure_column.py | 67 +++++++++++++++++++ 2 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 analyses/copy_number_consensus_call/src/scripts/restructure_column.py diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile index de1eda7075..d19fee374b 100644 --- a/analyses/copy_number_consensus_call/Snakefile +++ b/analyses/copy_number_consensus_call/Snakefile @@ -1,6 +1,3 @@ -# Nhat Duong -- -# Updated Dec 5, 2019 - ## Define the ending file(s) that we want OUTPUT= expand("../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed", sample=config["samples"], @@ -102,6 +99,7 @@ rule filter_telomeres: ## Define the output files' names filtered_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed" wildcard_constraints: + ## Define the wildcards to use in the file names. caller = "cnvkit|freec|manta", dupdel = "del|dup" threads: 1 @@ -109,3 +107,38 @@ rule filter_telomeres: ## Invoke the python3 script and pass in the reference and CNVs files. Direct the stdout to a new file. "python3 {input.script} --reference {input.bad_list} --file {input.bedfile} > {output.filtered_bed}" + +rule first_merge: + input: + ## Define the location of the input file and take the path/extension from the config file + filtered_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed" + output: + ## Define the output files' names + merged_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered2.bed" + wildcard_constraints: + ## Define the wildcards to use in the file names. + caller = "cnvkit|freec|manta", + dupdel = "del|dup" + threads: 1 + shell: + ## Call on bedtools to merge any overlapping segment. 
Merging done for any segments within a single file. + ## It considers any segments within 10,000 bp to be the same CNV. + ## Merge but retain info from columns 2 (start pos), 3(end pos), 5(copy numbers), 7(CNV type) + "bedtools merge -i {input.filtered_bed} -d 10000 -c 2,3,5,7 -o collapse,collapse,collapse,distinct > {output.merged_bed}" + + +rule restructure_column: + input: + ## Define the location of the input file and take the path/extension from the config file + script=os.path.join(config["scripts"], "restructure_column.py"), + merged_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered2.bed" + output: + ## Define the output files' names + restructured_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered3.bed" + wildcard_constraints: + ## Define the wildcards to use in the file names. + caller = "cnvkit|freec|manta", + dupdel = "del|dup" + threads: 1 + shell: + "python3 {input.script} --file {input.merged_bed} > {output.restructured_bed}" diff --git a/analyses/copy_number_consensus_call/src/scripts/restructure_column.py b/analyses/copy_number_consensus_call/src/scripts/restructure_column.py new file mode 100644 index 0000000000..b598f0b143 --- /dev/null +++ b/analyses/copy_number_consensus_call/src/scripts/restructure_column.py @@ -0,0 +1,67 @@ +################## PURPOSE ################# # This script is to restructure the 4th(start pos),5th(end pos), and 6th(copy number) columns of the input file # Because these columns hold the "raw" coordinates of the CNV after filtering out for IGLL, telomeric, centromeric # and seg-dup regions, we want to retain this info for any analysis further down stream. 
+# Thus this script format columns 4, 5, and 6 of a CNV into --- start_pos:end_pos:copy_number ---- +# So if a CNV is part of a FINAL consensus CNV, its (start_pos:end_pos:copy_number) information will +# be carried through and contained in the FINAL FILE + +# For example +# column 4 column 5 column 6 +# 2777350,6035598,6807639 6035597,6807638,7659545 3,3,3 + +# Will become 2777350:6035597:3,6035598:6807638:3,6807639:7659545:3, +################# ASSUMPTION ############### + +# The bed file has no header + +############################################ + +# Imports in the pep8 order https://www.python.org/dev/peps/pep-0008/#imports +# Standard library +import argparse +import sys +import os + +# Related third party +import numpy as np +import pandas as pd + +## Define parser for the input file +parser = argparse.ArgumentParser(description="""This script restructure the 4th, 5th, and 6th columns""") +parser.add_argument('--file', required=True, + help='path to the bed file that needs to be restructred') + +args = parser.parse_args() + +## Use pandas to read in the bed file. 
+## Since Manta doesn't have copy numbers and "NA" is used for Manta's copy numbers, we need +## the "keep_default_na" so that the "NA"s don't get converted into NaN by pandas +file = pd.read_csv(args.file, delimiter='\t', header=None, keep_default_na=False) + +## Make a new column to store info in the new format (start_pos:end_pos:copy_number) +new_column = file.iloc[:,(0)].copy() + +## For every CNV in the file, reformatt columns 4, 5, and 6 +for i_index,i in enumerate(file.itertuples()): + ## Since bed file uses a comma as default for collapsing information, we split the information using ',' + list_start = i._4.split(',') + list_end = i._5.split(',') + list_cn = str(i._6).split(',') + + ## Initialize a variable to store the new restructred information + growing_list = '' + + ## Rearange the split information into the new format + for j in range(0,len(list_start)): + growing_list += '{}:{}:{},'.format(list_start[j],list_end[j],list_cn[j]) + + ## Add the list to the new column + new_column.iloc[i_index] = growing_list + + +## Make a new file by combining columns 1,2,3, the new column, and the copy number type column. 
+new_mat = (pd.concat([file.iloc[:,0:3].copy(),new_column,file.iloc[:,-1].copy()], axis=1)) + +## Output restructred file to stdout +new_mat.to_csv(sys.stdout, sep='\t', index=False, header=False) From 3a20aa0690c9fdf60c316037aba0d7b416123ba4 Mon Sep 17 00:00:00 2001 From: Duong Date: Sat, 4 Jan 2020 13:04:44 -0500 Subject: [PATCH 2/6] updating fork --- .../src/scripts/restructure_column.py | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/analyses/copy_number_consensus_call/src/scripts/restructure_column.py b/analyses/copy_number_consensus_call/src/scripts/restructure_column.py index b598f0b143..f848168315 100644 --- a/analyses/copy_number_consensus_call/src/scripts/restructure_column.py +++ b/analyses/copy_number_consensus_call/src/scripts/restructure_column.py @@ -34,34 +34,37 @@ args = parser.parse_args() + +## Check to see if the bed file is empty, if so, make an empty dataframe +if os.stat(args.file).st_size == 0: + new_mat = pd.DataFrame(columns=['a','b']) + ## Use pandas to read in the bed file. 
## Since Manta doesn't have copy numbers and "NA" is used for Manta's copy numbers, we need ## the "keep_default_na" so that the "NA"s don't get converted into NaN by pandas -file = pd.read_csv(args.file, delimiter='\t', header=None, keep_default_na=False) +else: + file = pd.read_csv(args.file, delimiter='\t', header=None, keep_default_na=False) -## Make a new column to store info in the new format (start_pos:end_pos:copy_number) -new_column = file.iloc[:,(0)].copy() + ## Make a new column to store info in the new format (start_pos:end_pos:copy_number) + new_column = file.iloc[:,(0)].copy() -## For every CNV in the file, reformatt columns 4, 5, and 6 -for i_index,i in enumerate(file.itertuples()): - ## Since bed file uses a comma as default for collapsing information, we split the information using ',' - list_start = i._4.split(',') - list_end = i._5.split(',') - list_cn = str(i._6).split(',') + ## For every CNV in the file, reformat columns 4, 5, and 6 + for i_index,i in enumerate(file.itertuples()): + ## Since bed file uses a comma as default for collapsing information, we split the information using ',' + list_start = str(i._4).split(',') + list_end = str(i._5).split(',') + list_cn = str(i._6).split(',') - ## Initialize a variable to store the new restructred information - growing_list = '' + ## Rearange the split information into the new format + cnv_list = ''.join(['{}:{}:{},'.format(*cnv) for cnv in zip(list_start, list_end, list_cn)]) - ## Rearange the split information into the new format - for j in range(0,len(list_start)): - growing_list += '{}:{}:{},'.format(list_start[j],list_end[j],list_cn[j]) - ## Add the list to the new column - new_column.iloc[i_index] = growing_list + ## Add the list to the new column + new_column.iloc[i_index] = cnv_list -## Make a new file by combining columns 1,2,3, the new column, and the copy number type column. 
-new_mat = (pd.concat([file.iloc[:,0:3].copy(),new_column,file.iloc[:,-1].copy()], axis=1)) + ## Make a new file by combining columns 1,2,3, the new column, and the copy number type column. + new_mat = (pd.concat([file.iloc[:,0:3].copy(),new_column,file.iloc[:,-1].copy()], axis=1)) ## Output restructred file to stdout new_mat.to_csv(sys.stdout, sep='\t', index=False, header=False) From 305bbbf3f428a538816e0e4744f6b34faff9cc35 Mon Sep 17 00:00:00 2001 From: Duong Date: Mon, 6 Jan 2020 10:01:10 -0500 Subject: [PATCH 3/6] changed output path and name --- analyses/copy_number_consensus_call/Snakefile | 52 +++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile index f19d088683..1aea22cd0a 100644 --- a/analyses/copy_number_consensus_call/Snakefile +++ b/analyses/copy_number_consensus_call/Snakefile @@ -1,8 +1,6 @@ ## Define the ending file(s) that we want -OUTPUT= expand("../../scratch/interim/{sample}.{combined_caller}.{dupdel}.bed", - sample=config["samples"], - combined_caller=["manta_cnvkit", "manta_freec", "cnvkit_freec"], - dupdel=["dup", "del"]), +OUTPUT= expand("results/cnv_consensus/{sample}.cnv_consensus.csv", + sample=config["samples"]), ## Define the wildcards to use in the file names. 
wildcard_constraints: @@ -161,4 +159,50 @@ rule compare_cnv_methods: "--sample {params.sample_name}" +rule combine_files: + input: + ## Define the location of the input file + manta_cnvkit="../../scratch/interim/{sample}.manta_cnvkit.{dupdel}.bed", + manta_freec="../../scratch/interim/{sample}.manta_freec.{dupdel}.bed", + cnvkit_freec="../../scratch/interim/{sample}.cnvkit_freec.{dupdel}.bed" + output: + ## Define the output files' names + dup_del_file="../../scratch/interim/{sample}.combined.{dupdel}.sorted.bed" + threads: 1 + shell: + ## Combine the input file, sort and output to one file + "cat {input.manta_cnvkit} {input.manta_freec} {input.cnvkit_freec} | sort -k1,1 -k2,2n > {output.dup_del_file}" + + +rule merge: + input: + ## Define the location of the input file + delete="../../scratch/interim/{sample}.combined.del.sorted.bed", + dup="../../scratch/interim/{sample}.combined.dup.sorted.bed" + output: + ## Define the output files' names + delete="../../scratch/interim/{sample}.del.merged.bed", + dup="../../scratch/interim/{sample}.dup.merged.bed", + combined_dup_del="../../scratch/endpoints/{sample}.del_dup.merged.bed" + threads: 1 + shell: + ## Merge the consensus files together + ## Put the dup CNVs calls and del CNVs calls together into one file + "bedtools merge -i {input.delete} -c 4,5,6,7,8,9 -o collapse,collapse,collapse,distinct,distinct,collapse > {output.delete} && \ + bedtools merge -i {input.dup} -c 4,5,6,7,8,9 -o collapse,collapse,collapse,distinct,distinct,collapse > {output.dup} && \ + cat {output.delete} {output.dup} > {output.combined_dup_del}" + + +rule add_column_names: + input: + ## Define the location of the input file and take the path/extension from the config file + script=os.path.join(config["scripts"], "give_column_names.py"), + combined_dup_del="../../scratch/endpoints/{sample}.del_dup.merged.bed" + output: + ## Define the output files' names + final_dup_del="results/cnv_consensus/{sample}.cnv_consensus.csv" + threads: 1 + shell: + 
## Name the headers + "python3 {input.script} --file {input.combined_dup_del} > {output.final_dup_del}" From 8f3e136ee320626f78b2cf61cc24d81a9eed6e26 Mon Sep 17 00:00:00 2001 From: Duong Date: Mon, 13 Jan 2020 12:10:54 -0500 Subject: [PATCH 4/6] add bedtools subtract and new result file --- analyses/copy_number_consensus_call/Snakefile | 7 +++---- .../copy_number_consensus_call/results/cnv_consensus.tsv | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile index a50eb0c06b..e642557846 100644 --- a/analyses/copy_number_consensus_call/Snakefile +++ b/analyses/copy_number_consensus_call/Snakefile @@ -94,7 +94,6 @@ rule manta_filter: rule filter_telomeres: input: ## Define the location of the input file and take the path/extension from the config file - script=os.path.join(config["scripts"], "get_rid_bad_segments.py"), bad_list=os.path.join(config["scripts"], "bad_chromosomal_seg_updated_merged.bed"), bedfile="../../scratch/interim/{sample}.{caller}.{dupdel}.bed" output: @@ -102,8 +101,8 @@ rule filter_telomeres: filtered_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed" threads: 1 shell: - ## Invoke the python3 script and pass in the reference and CNVs files. Direct the stdout to a new file. - "python3 {input.script} --reference {input.bad_list} --file {input.bedfile} > {output.filtered_bed}" + ## Invoke the bedtools subtract pass in the reference and CNVs files. Direct the stdout to a new file. 
+ "bedtools subtract -N -a {input.bedfile} -b {input.bad_list} -f 0.5 > {output.filtered_bed}" rule first_merge: @@ -203,4 +202,4 @@ rule clean_output: output: "results/cnv_consensus.tsv" shell: - "python3 {input.script} --file {input.cnv_file} > {output}" + "python3 {input.script} --file {input.cnv_file} > {output}" \ No newline at end of file diff --git a/analyses/copy_number_consensus_call/results/cnv_consensus.tsv b/analyses/copy_number_consensus_call/results/cnv_consensus.tsv index f46a941b3c..8f82c6f4df 100644 --- a/analyses/copy_number_consensus_call/results/cnv_consensus.tsv +++ b/analyses/copy_number_consensus_call/results/cnv_consensus.tsv @@ -3167,7 +3167,7 @@ chr3 84952630 87955725 NULL 84950794:86911163:3,86911164:88297478:3 84952630:861 chr3 93649137 162808158 NULL 93649137:144671157:3,144671158:148925621:3,148925622:162808158:3 93512945:94627725:3,94627725:94631200:3,94631200:94770200:3,94770895:96420825:3,96421520:96702995:3,96702995:96706470:3,96706470:98244505:3,98244505:98249370:3,98249370:99072250:3,99072250:99227235:3,99227235:99343995:3,99343995:99347470:3,99347470:100430280:3,100430280:100435145:3,100435145:100795850:3,100795850:100827820:3,100827820:102084380:3,102085075:102991355:3,102991355:103832305:3,103833000:103955320:3,103955320:103981730:3,103981730:104394560:3,104395950:104569005:3,104569005:105774830:3,105775525:106378090:3,106378785:108032885:3,108034970:108318530:3,108319225:109156005:3,109156005:109263730:3,109266510:110058810:3,110058810:110062980:3,110062980:110371560:3,110371560:110794815:3,110796900:111147875:3,111147875:111152740:3,111152740:111361240:3,111361240:111872760:3,111872760:112468375:3,112471155:112716490:3,112717880:114292055:3,114292055:115048910:3,115048910:115053775:3,115053775:116300605:3,116301995:116589030:3,116589725:118575340:3,118575340:118686540:3,118689320:119395440:3,119395440:119533745:3,119535135:120095305:3,120095305:120916795:3,120917490:121556195:3,121556195:121561060:3,121561060:123653705:3,1236
53705:123659265:3,123659265:124059585:3,124061670:124968645:3,124968645:125040230:3,125040925:125204250:3,125204250:125414835:3,125414835:125418310:3,125418310:125664340:3,125871450:126530310:3,126530310:126677650:3,126677650:126696415:3,126696415:126969550:3,126970245:127350410:3,127350410:129039260:3,129041345:129350620:3,129357570:129583445:3,129584140:130044925:3,130079675:130387560:3,130388950:134391455:3,134391455:134415085:3,134415085:135905165:3,135905165:136818395:3,136818395:136947665:3,136947665:136952530:3,136952530:137153385:3,137153385:137185355:3,137185355:137791395:3,137792090:138124995:3,138125690:138409945:3,138411335:139671370:3,139672065:139769365:3,139769365:141656290:3,141656290:142283875:3,142283875:142694620:3,142694620:142698790:3,142698790:144671895:3,144671895:145747755:3,145750535:146042435:3,146042435:146230780:3,146232170:146367695:3,146367695:148630615:3,148630615:148637565:3,148638260:148773090:3,148773090:153836165:3,153836860:154109995:3,154116945:156622420:3,156622420:157090155:3,157092935:158107635:3,158108330:158594830:3,158594830:159205040:3,159205040:159214075:3,159214075:159437865:3,159437865:159444815:3,159444815:162351305:3,162352000:162447910:3,162447910:162512545:3,162512545:164365415:3,164365415:164475920:3,164477310:164801875:3,164803265:165132695:3,165132695:166291955:3,166294735:166587330:3,166587330:166592890:3,166592890:167084255:3,167084255:167358085:3,167359475:171863770:3,171863770:171867245:3 DUP BS_WY5KYSHJ chr3 171969410 195966153 NULL 162827473:195616354:3,195616355:195730092:5,195730093:195966153:3 
171969410:172147330:3,172252970:172281465:3,172281465:173296860:3,173297555:173729150:3,173729150:175556305:3,175557695:175720325:3,175723105:176335400:3,176344435:184803280:3,184805365:185514960:3,185514960:186474060:3,186474755:186773605:3,186773605:188897525:3,188903085:189042780:3,189043475:189265180:3,189270045:189441710:3,189442405:190349380:3,190352160:190477260:3,190477260:190483515:3,190483515:191922165:3,191922860:192849295:3,192850685:193417110:3,193418500:194543705:3,194544400:195148355:3,195151135:195247045:3,195247045:196596040:3,196596735:198156315:3 DUP BS_WY5KYSHJ chr1 18423247 18601943 18422714:18428084:NA,18427821:18601943:NA 18423247:18602345:3 NULL DUP BS_X1DSETVE -chr1 149743647 207527447 147699786:224374617:NA,151192918:224378395:NA,154279508:156053291:NA,154283619:156050650:NA,154283619:156053365:NA,156051660:156078863:NA,186958658:189414983:NA,223721204:223740662:NA 149743647:151178440:3,151178441:151392366:3,151392367:152147573:3,152147574:152217223:3,152217224:152351549:3,152351550:152361499:3,152361500:153302771:3,154767417:155000246:3,155079848:155216162:3,155662919:169254659:3,169277545:206163298:3,206409065:207527447:3 NULL DUP BS_X1DSETVE +chr1 149743647 207527447 147699786:224374617:NA,151192918:224378395:NA,154279508:156053291:NA,154283619:156053365:NA,154283619:156050650:NA,156051660:156078863:NA,186958658:189414983:NA,223721204:223740662:NA 149743647:151178440:3,151178441:151392366:3,151392367:152147573:3,152147574:152217223:3,152217224:152351549:3,152351550:152361499:3,152361500:153302771:3,154767417:155000246:3,155079848:155216162:3,155662919:169254659:3,169277545:206163298:3,206409065:207527447:3 NULL DUP BS_X1DSETVE chr2 162345760 162382530 162345760:162382854:NA 162345716:162382530:3 NULL DUP BS_X1DSETVE chr20 15789213 15802607 15789213:15803019:NA 15788678:15802607:3 NULL DUP BS_X1DSETVE chr20 38794600 38805566 38794600:38805943:NA 38791637:38805566:3 NULL DUP BS_X1DSETVE From e4c19d8d1da998be9f76159c265729e8fcd937e6 Mon 
Sep 17 00:00:00 2001 From: Nhat Duong Date: Mon, 13 Jan 2020 16:00:04 -0500 Subject: [PATCH 5/6] Update analyses/copy_number_consensus_call/Snakefile Co-Authored-By: jashapiro --- analyses/copy_number_consensus_call/Snakefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile index e642557846..902b49c1ac 100644 --- a/analyses/copy_number_consensus_call/Snakefile +++ b/analyses/copy_number_consensus_call/Snakefile @@ -202,4 +202,5 @@ rule clean_output: output: "results/cnv_consensus.tsv" shell: - "python3 {input.script} --file {input.cnv_file} > {output}" \ No newline at end of file + "python3 {input.script} --file {input.cnv_file} > {output}" + From 20f32783e4b1b1d151924c05da31ec9c9a99f1f3 Mon Sep 17 00:00:00 2001 From: Duong Date: Mon, 13 Jan 2020 16:02:22 -0500 Subject: [PATCH 6/6] removed get_rid_badsegment.py --- .../src/scripts/get_rid_bad_segments.py | 126 ------------------ 1 file changed, 126 deletions(-) delete mode 100644 analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py diff --git a/analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py b/analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py deleted file mode 100644 index f6f77aecb0..0000000000 --- a/analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py +++ /dev/null @@ -1,126 +0,0 @@ - -################## PURPOSE ################# -# This script is to get rid of unwanted/trouble regions like telomeric, centromeric, and known seg-dups regions -# This script is to take in 2 files -# 1st: A reference/list of bad segments that contains the bad segments and their start/end positions -# 2nd: A list of CNVs that need the bad segments to be filtered out - -################# ASSUMPTION ############### -# It is assumed that the reference file DO NOT have overlapping telomeric/centromeric/segments -# The provided reference file 
"bad_chromosomal_seg_updated_merged.txt" DOES NOT have -# overlaping segments - thus the assumption is not violated. -# If the use of a different reference file is desired, make sure to merge the file first before using this script. - -############################################ - -# Imports in the pep8 order https://www.python.org/dev/peps/pep-0008/#imports -# Standard library -import argparse -import subprocess -import sys -import os -import re - -parser = argparse.ArgumentParser(description="""This script filters out telomeric, - centromeric, and seg-dup regions""") -parser.add_argument('--reference', required=True, - help='path to the reference list of bad segments to be filtered out for') -parser.add_argument('--file', required=True, - help='path to the CNVs file that needs to be filtered') - -args = parser.parse_args() - -#querie_file = sys.argv[2] - -#db_file = sys.argv[1] #bad segment list - - -## Check to see if the CNVs file is empty, if so, make an empty variable -if os.stat(args.file).st_size == 0: - final_content = [] - -## If the file has content, then load in the content -else: - with open (args.file) as file: - - ## Strip the trailing spaces for each line - stripped_content = [i.rstrip() for i in file.readlines()] - - ## Split each line up by any white space between columns - final_content = [re.split('\s+',i) for i in stripped_content] - - ## If the 1st column has '1' instead of 'chr1' for the chromosome number, add in 'chr' - if final_content[0][0].find('chr') == -1: - for c,chromo in enumerate(final_content): - final_content[c][0] = 'chr' + final_content[c][0] - - - -## Works with the assumption that the segments in the file on the next line DO NOT OVERLAP -## Read in the reference file -with open(args.reference) as file: - - ## Strip the trailing spaces for each line - reference_stripped = [i.rstrip() for i in file.readlines()] - - ## Split each line up by any white space between columns - fin_reference_file = [re.split('\s+',i) for i in 
reference_stripped] - - ## If the 1st column has '1' instead of 'chr1' for the chromosome number, add in 'chr' - if fin_reference_file[0][0].find('chr') == -1: - for c,chromo in enumerate(fin_reference_file): - fin_reference_file[c][0] = 'chr' + fin_reference_file[c][0] - - -## Define a variable to hold the final results -fin_list = [] - -## Loop through the CNVs file. -## If any segment in the reference file overlap >50%, then get rid of that CNV -for i in final_content: - - ## Initiate a variable for the total overlapping - overlap = 0 - - ## Get the start/end coordinates of the cnv - start_cnv = int(i[1]) - end_cnv = int(i[2]) - - ## Make sure end > start (end downstream from start) - if start_cnv > end_cnv: - start_cnv, end_cnv = end_cnv, start_cnv - - ## For each CNV, loop through the entire reference file - for j in fin_reference_file: - start_ref = int(j[1]) - end_ref = int(j[2]) - - ## Make sure end > start - if start_ref > end_ref: - start_ref, end_ref = end_ref, start_ref - - ## If the chromosome number matches and the CNV segment overlaps with the reference segment - if i[0] == j[0] and start_cnv <= end_ref and end_cnv >= start_ref: - - ## Get the overlapping coordinate - start = max(start_cnv,start_ref) - end = min(end_cnv,end_ref) - - ## Calculate the overlapping ratio - ## If the CNV overlaps with many segments in the reference file - ## We add the overlappting ratios -> This works due to the assumption on line 8 - overlap += (end - start + 1 )/(end_cnv-start_cnv + 1) - - - ## If the overall overlapping ratio is smaller than 50%, we add the CNVs to a final list - if overlap <= 0.5: - fin_list = fin_list + [i] - - -## Print the list to STDOUT -> Python print() function prints to STDOUT by default. -try: - for k in fin_list: - print('\t'.join(k)) - sys.stderr.write('$$$ Filtering was successful\n') -except: - sys.stderr.write('!!! Filtering was not successful\n')