From 3f55855590fd7f1afc352335143acba001683a29 Mon Sep 17 00:00:00 2001 From: Duong Date: Wed, 18 Dec 2019 03:51:55 -0500 Subject: [PATCH 1/6] add to Snakefile --- analyses/copy_number_consensus_call/Snakefile | 39 ++++++++++- .../src/scripts/restructure_column.py | 67 +++++++++++++++++++ 2 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 analyses/copy_number_consensus_call/src/scripts/restructure_column.py diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile index de1eda7075..d19fee374b 100644 --- a/analyses/copy_number_consensus_call/Snakefile +++ b/analyses/copy_number_consensus_call/Snakefile @@ -1,6 +1,3 @@ -# Nhat Duong -- -# Updated Dec 5, 2019 - ## Define the ending file(s) that we want OUTPUT= expand("../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed", sample=config["samples"], @@ -102,6 +99,7 @@ rule filter_telomeres: ## Define the output files' names filtered_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed" wildcard_constraints: + ## Define the wildcards to use in the file names. caller = "cnvkit|freec|manta", dupdel = "del|dup" threads: 1 @@ -109,3 +107,38 @@ rule filter_telomeres: ## Invoke the python3 script and pass in the reference and CNVs files. Direct the stdout to a new file. "python3 {input.script} --reference {input.bad_list} --file {input.bedfile} > {output.filtered_bed}" + +rule first_merge: + input: + ## Define the location of the input file and take the path/extension from the config file + filtered_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed" + output: + ## Define the output files' names + merged_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered2.bed" + wildcard_constraints: + ## Define the wildcards to use in the file names. + caller = "cnvkit|freec|manta", + dupdel = "del|dup" + threads: 1 + shell: + ## Call on bedtools to merge any overlapping segment. 
Merging done for any segments within a single file. + ## It considers any segments within 10,000 bp to be the same CNV. + ## Merge but retain info from columns 2 (start pos), 3(end pos), 5(copy numbers), 7(CNV type) + "bedtools merge -i {input.filtered_bed} -d 10000 -c 2,3,5,7 -o collapse,collapse,collapse,distinct > {output.merged_bed}" + + +rule restructure_column: + input: + ## Define the location of the input file and take the path/extension from the config file + script=os.path.join(config["scripts"], "restructure_column.py"), + merged_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered2.bed" + output: + ## Define the output files' names + restructured_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered3.bed" + wildcard_constraints: + ## Define the wildcards to use in the file names. + caller = "cnvkit|freec|manta", + dupdel = "del|dup" + threads: 1 + shell: + "python3 {input.script} --file {input.merged_bed} > {output.restructured_bed}" diff --git a/analyses/copy_number_consensus_call/src/scripts/restructure_column.py b/analyses/copy_number_consensus_call/src/scripts/restructure_column.py new file mode 100644 index 0000000000..b598f0b143 --- /dev/null +++ b/analyses/copy_number_consensus_call/src/scripts/restructure_column.py @@ -0,0 +1,67 @@ +################## PURPOSE ################# # This script is to restructure the 4th(start pos),5th(end pos), and 6th(copy number) columns of the input file # Because these columns hold the "raw" coordinates of the CNV after filtering out for IGLL, telomeric, centromeric # and seg-dup regions, we want to retain this info for any analysis further down stream. 
+# Thus this script format columns 4, 5, and 6 of a CNV into --- start_pos:end_pos:copy_number ---- +# So if a CNV is part of a FINAL consensus CNV, its (start_pos:end_pos:copy_number) information will +# be carried through and contained in the FINAL FILE + +# For example +# column 4 column 5 column 6 +# 2777350,6035598,6807639 6035597,6807638,7659545 3,3,3 + +# Will become 2777350:6035597:3,6035598:6807638:3,6807639:7659545:3, +################# ASSUMPTION ############### + +# The bed file has no header + +############################################ + +# Imports in the pep8 order https://www.python.org/dev/peps/pep-0008/#imports +# Standard library +import argparse +import sys +import os + +# Related third party +import numpy as np +import pandas as pd + +## Define parser for the input file +parser = argparse.ArgumentParser(description="""This script restructure the 4th, 5th, and 6th columns""") +parser.add_argument('--file', required=True, + help='path to the bed file that needs to be restructred') + +args = parser.parse_args() + +## Use pandas to read in the bed file. 
+## Since Manta doesn't have copy numbers and "NA" is used for Manta's copy numbers, we need +## the "keep_default_na" so that the "NA"s don't get converted into NaN by pandas +file = pd.read_csv(args.file, delimiter='\t', header=None, keep_default_na=False) + +## Make a new column to store info in the new format (start_pos:end_pos:copy_number) +new_column = file.iloc[:,(0)].copy() + +## For every CNV in the file, reformatt columns 4, 5, and 6 +for i_index,i in enumerate(file.itertuples()): + ## Since bed file uses a comma as default for collapsing information, we split the information using ',' + list_start = i._4.split(',') + list_end = i._5.split(',') + list_cn = str(i._6).split(',') + + ## Initialize a variable to store the new restructred information + growing_list = '' + + ## Rearange the split information into the new format + for j in range(0,len(list_start)): + growing_list += '{}:{}:{},'.format(list_start[j],list_end[j],list_cn[j]) + + ## Add the list to the new column + new_column.iloc[i_index] = growing_list + + +## Make a new file by combining columns 1,2,3, the new column, and the copy number type column. 
+new_mat = (pd.concat([file.iloc[:,0:3].copy(),new_column,file.iloc[:,-1].copy()], axis=1)) + +## Output restructred file to stdout +new_mat.to_csv(sys.stdout, sep='\t', index=False, header=False) From 3a20aa0690c9fdf60c316037aba0d7b416123ba4 Mon Sep 17 00:00:00 2001 From: Duong Date: Sat, 4 Jan 2020 13:04:44 -0500 Subject: [PATCH 2/6] updating fork --- .../src/scripts/restructure_column.py | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/analyses/copy_number_consensus_call/src/scripts/restructure_column.py b/analyses/copy_number_consensus_call/src/scripts/restructure_column.py index b598f0b143..f848168315 100644 --- a/analyses/copy_number_consensus_call/src/scripts/restructure_column.py +++ b/analyses/copy_number_consensus_call/src/scripts/restructure_column.py @@ -34,34 +34,37 @@ args = parser.parse_args() + +## Check to see if the bed file is empty, if so, make an empty dataframe +if os.stat(args.file).st_size == 0: + new_mat = pd.DataFrame(columns=['a','b']) + ## Use pandas to read in the bed file. 
## Since Manta doesn't have copy numbers and "NA" is used for Manta's copy numbers, we need ## the "keep_default_na" so that the "NA"s don't get converted into NaN by pandas -file = pd.read_csv(args.file, delimiter='\t', header=None, keep_default_na=False) +else: + file = pd.read_csv(args.file, delimiter='\t', header=None, keep_default_na=False) -## Make a new column to store info in the new format (start_pos:end_pos:copy_number) -new_column = file.iloc[:,(0)].copy() + ## Make a new column to store info in the new format (start_pos:end_pos:copy_number) + new_column = file.iloc[:,(0)].copy() -## For every CNV in the file, reformatt columns 4, 5, and 6 -for i_index,i in enumerate(file.itertuples()): - ## Since bed file uses a comma as default for collapsing information, we split the information using ',' - list_start = i._4.split(',') - list_end = i._5.split(',') - list_cn = str(i._6).split(',') + ## For every CNV in the file, reformat columns 4, 5, and 6 + for i_index,i in enumerate(file.itertuples()): + ## Since bed file uses a comma as default for collapsing information, we split the information using ',' + list_start = str(i._4).split(',') + list_end = str(i._5).split(',') + list_cn = str(i._6).split(',') - ## Initialize a variable to store the new restructred information - growing_list = '' + ## Rearange the split information into the new format + cnv_list = ''.join(['{}:{}:{},'.format(*cnv) for cnv in zip(list_start, list_end, list_cn)]) - ## Rearange the split information into the new format - for j in range(0,len(list_start)): - growing_list += '{}:{}:{},'.format(list_start[j],list_end[j],list_cn[j]) - ## Add the list to the new column - new_column.iloc[i_index] = growing_list + ## Add the list to the new column + new_column.iloc[i_index] = cnv_list -## Make a new file by combining columns 1,2,3, the new column, and the copy number type column. 
-new_mat = (pd.concat([file.iloc[:,0:3].copy(),new_column,file.iloc[:,-1].copy()], axis=1)) + ## Make a new file by combining columns 1,2,3, the new column, and the copy number type column. + new_mat = (pd.concat([file.iloc[:,0:3].copy(),new_column,file.iloc[:,-1].copy()], axis=1)) ## Output restructred file to stdout new_mat.to_csv(sys.stdout, sep='\t', index=False, header=False) From 305bbbf3f428a538816e0e4744f6b34faff9cc35 Mon Sep 17 00:00:00 2001 From: Duong Date: Mon, 6 Jan 2020 10:01:10 -0500 Subject: [PATCH 3/6] changed output path and name --- analyses/copy_number_consensus_call/Snakefile | 52 +++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile index f19d088683..1aea22cd0a 100644 --- a/analyses/copy_number_consensus_call/Snakefile +++ b/analyses/copy_number_consensus_call/Snakefile @@ -1,8 +1,6 @@ ## Define the ending file(s) that we want -OUTPUT= expand("../../scratch/interim/{sample}.{combined_caller}.{dupdel}.bed", - sample=config["samples"], - combined_caller=["manta_cnvkit", "manta_freec", "cnvkit_freec"], - dupdel=["dup", "del"]), +OUTPUT= expand("results/cnv_consensus/{sample}.cnv_consensus.csv", + sample=config["samples"]), ## Define the wildcards to use in the file names. 
wildcard_constraints: @@ -161,4 +159,50 @@ rule compare_cnv_methods: "--sample {params.sample_name}" +rule combine_files: + input: + ## Define the location of the input file + manta_cnvkit="../../scratch/interim/{sample}.manta_cnvkit.{dupdel}.bed", + manta_freec="../../scratch/interim/{sample}.manta_freec.{dupdel}.bed", + cnvkit_freec="../../scratch/interim/{sample}.cnvkit_freec.{dupdel}.bed" + output: + ## Define the output files' names + dup_del_file="../../scratch/interim/{sample}.combined.{dupdel}.sorted.bed" + threads: 1 + shell: + ## Combine the input file, sort and output to one file + "cat {input.manta_cnvkit} {input.manta_freec} {input.cnvkit_freec} | sort -k1,1 -k2,2n > {output.dup_del_file}" + + +rule merge: + input: + ## Define the location of the input file + delete="../../scratch/interim/{sample}.combined.del.sorted.bed", + dup="../../scratch/interim/{sample}.combined.dup.sorted.bed" + output: + ## Define the output files' names + delete="../../scratch/interim/{sample}.del.merged.bed", + dup="../../scratch/interim/{sample}.dup.merged.bed", + combined_dup_del="../../scratch/endpoints/{sample}.del_dup.merged.bed" + threads: 1 + shell: + ## Merge the consensus files together + ## Put the dup CNVs calls and del CNVs calls together into one file + "bedtools merge -i {input.delete} -c 4,5,6,7,8,9 -o collapse,collapse,collapse,distinct,distinct,collapse > {output.delete} && \ + bedtools merge -i {input.dup} -c 4,5,6,7,8,9 -o collapse,collapse,collapse,distinct,distinct,collapse > {output.dup} && \ + cat {output.delete} {output.dup} > {output.combined_dup_del}" + + +rule add_column_names: + input: + ## Define the location of the input file and take the path/extension from the config file + script=os.path.join(config["scripts"], "give_column_names.py"), + combined_dup_del="../../scratch/endpoints/{sample}.del_dup.merged.bed" + output: + ## Define the output files' names + final_dup_del="results/cnv_consensus/{sample}.cnv_consensus.csv" + threads: 1 + shell: + 
## Name the headers + "python3 {input.script} --file {input.combined_dup_del} > {output.final_dup_del}" From 8f3e136ee320626f78b2cf61cc24d81a9eed6e26 Mon Sep 17 00:00:00 2001 From: Duong Date: Mon, 13 Jan 2020 12:10:54 -0500 Subject: [PATCH 4/6] add bedtools subtract and new result file --- analyses/copy_number_consensus_call/Snakefile | 7 +++---- .../copy_number_consensus_call/results/cnv_consensus.tsv | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile index a50eb0c06b..e642557846 100644 --- a/analyses/copy_number_consensus_call/Snakefile +++ b/analyses/copy_number_consensus_call/Snakefile @@ -94,7 +94,6 @@ rule manta_filter: rule filter_telomeres: input: ## Define the location of the input file and take the path/extension from the config file - script=os.path.join(config["scripts"], "get_rid_bad_segments.py"), bad_list=os.path.join(config["scripts"], "bad_chromosomal_seg_updated_merged.bed"), bedfile="../../scratch/interim/{sample}.{caller}.{dupdel}.bed" output: @@ -102,8 +101,8 @@ rule filter_telomeres: filtered_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed" threads: 1 shell: - ## Invoke the python3 script and pass in the reference and CNVs files. Direct the stdout to a new file. - "python3 {input.script} --reference {input.bad_list} --file {input.bedfile} > {output.filtered_bed}" + ## Invoke the bedtools subtract pass in the reference and CNVs files. Direct the stdout to a new file. 
+ "bedtools subtract -N -a {input.bedfile} -b {input.bad_list} -f 0.5 > {output.filtered_bed}" rule first_merge: @@ -203,4 +202,4 @@ rule clean_output: output: "results/cnv_consensus.tsv" shell: - "python3 {input.script} --file {input.cnv_file} > {output}" + "python3 {input.script} --file {input.cnv_file} > {output}" \ No newline at end of file diff --git a/analyses/copy_number_consensus_call/results/cnv_consensus.tsv b/analyses/copy_number_consensus_call/results/cnv_consensus.tsv index f46a941b3c..8f82c6f4df 100644 --- a/analyses/copy_number_consensus_call/results/cnv_consensus.tsv +++ b/analyses/copy_number_consensus_call/results/cnv_consensus.tsv @@ -3167,7 +3167,7 @@ chr3 84952630 87955725 NULL 84950794:86911163:3,86911164:88297478:3 84952630:861 chr3 93649137 162808158 NULL 93649137:144671157:3,144671158:148925621:3,148925622:162808158:3 93512945:94627725:3,94627725:94631200:3,94631200:94770200:3,94770895:96420825:3,96421520:96702995:3,96702995:96706470:3,96706470:98244505:3,98244505:98249370:3,98249370:99072250:3,99072250:99227235:3,99227235:99343995:3,99343995:99347470:3,99347470:100430280:3,100430280:100435145:3,100435145:100795850:3,100795850:100827820:3,100827820:102084380:3,102085075:102991355:3,102991355:103832305:3,103833000:103955320:3,103955320:103981730:3,103981730:104394560:3,104395950:104569005:3,104569005:105774830:3,105775525:106378090:3,106378785:108032885:3,108034970:108318530:3,108319225:109156005:3,109156005:109263730:3,109266510:110058810:3,110058810:110062980:3,110062980:110371560:3,110371560:110794815:3,110796900:111147875:3,111147875:111152740:3,111152740:111361240:3,111361240:111872760:3,111872760:112468375:3,112471155:112716490:3,112717880:114292055:3,114292055:115048910:3,115048910:115053775:3,115053775:116300605:3,116301995:116589030:3,116589725:118575340:3,118575340:118686540:3,118689320:119395440:3,119395440:119533745:3,119535135:120095305:3,120095305:120916795:3,120917490:121556195:3,121556195:121561060:3,121561060:123653705:3,1236
53705:123659265:3,123659265:124059585:3,124061670:124968645:3,124968645:125040230:3,125040925:125204250:3,125204250:125414835:3,125414835:125418310:3,125418310:125664340:3,125871450:126530310:3,126530310:126677650:3,126677650:126696415:3,126696415:126969550:3,126970245:127350410:3,127350410:129039260:3,129041345:129350620:3,129357570:129583445:3,129584140:130044925:3,130079675:130387560:3,130388950:134391455:3,134391455:134415085:3,134415085:135905165:3,135905165:136818395:3,136818395:136947665:3,136947665:136952530:3,136952530:137153385:3,137153385:137185355:3,137185355:137791395:3,137792090:138124995:3,138125690:138409945:3,138411335:139671370:3,139672065:139769365:3,139769365:141656290:3,141656290:142283875:3,142283875:142694620:3,142694620:142698790:3,142698790:144671895:3,144671895:145747755:3,145750535:146042435:3,146042435:146230780:3,146232170:146367695:3,146367695:148630615:3,148630615:148637565:3,148638260:148773090:3,148773090:153836165:3,153836860:154109995:3,154116945:156622420:3,156622420:157090155:3,157092935:158107635:3,158108330:158594830:3,158594830:159205040:3,159205040:159214075:3,159214075:159437865:3,159437865:159444815:3,159444815:162351305:3,162352000:162447910:3,162447910:162512545:3,162512545:164365415:3,164365415:164475920:3,164477310:164801875:3,164803265:165132695:3,165132695:166291955:3,166294735:166587330:3,166587330:166592890:3,166592890:167084255:3,167084255:167358085:3,167359475:171863770:3,171863770:171867245:3 DUP BS_WY5KYSHJ chr3 171969410 195966153 NULL 162827473:195616354:3,195616355:195730092:5,195730093:195966153:3 
171969410:172147330:3,172252970:172281465:3,172281465:173296860:3,173297555:173729150:3,173729150:175556305:3,175557695:175720325:3,175723105:176335400:3,176344435:184803280:3,184805365:185514960:3,185514960:186474060:3,186474755:186773605:3,186773605:188897525:3,188903085:189042780:3,189043475:189265180:3,189270045:189441710:3,189442405:190349380:3,190352160:190477260:3,190477260:190483515:3,190483515:191922165:3,191922860:192849295:3,192850685:193417110:3,193418500:194543705:3,194544400:195148355:3,195151135:195247045:3,195247045:196596040:3,196596735:198156315:3 DUP BS_WY5KYSHJ chr1 18423247 18601943 18422714:18428084:NA,18427821:18601943:NA 18423247:18602345:3 NULL DUP BS_X1DSETVE -chr1 149743647 207527447 147699786:224374617:NA,151192918:224378395:NA,154279508:156053291:NA,154283619:156050650:NA,154283619:156053365:NA,156051660:156078863:NA,186958658:189414983:NA,223721204:223740662:NA 149743647:151178440:3,151178441:151392366:3,151392367:152147573:3,152147574:152217223:3,152217224:152351549:3,152351550:152361499:3,152361500:153302771:3,154767417:155000246:3,155079848:155216162:3,155662919:169254659:3,169277545:206163298:3,206409065:207527447:3 NULL DUP BS_X1DSETVE +chr1 149743647 207527447 147699786:224374617:NA,151192918:224378395:NA,154279508:156053291:NA,154283619:156053365:NA,154283619:156050650:NA,156051660:156078863:NA,186958658:189414983:NA,223721204:223740662:NA 149743647:151178440:3,151178441:151392366:3,151392367:152147573:3,152147574:152217223:3,152217224:152351549:3,152351550:152361499:3,152361500:153302771:3,154767417:155000246:3,155079848:155216162:3,155662919:169254659:3,169277545:206163298:3,206409065:207527447:3 NULL DUP BS_X1DSETVE chr2 162345760 162382530 162345760:162382854:NA 162345716:162382530:3 NULL DUP BS_X1DSETVE chr20 15789213 15802607 15789213:15803019:NA 15788678:15802607:3 NULL DUP BS_X1DSETVE chr20 38794600 38805566 38794600:38805943:NA 38791637:38805566:3 NULL DUP BS_X1DSETVE From e4c19d8d1da998be9f76159c265729e8fcd937e6 Mon 
Sep 17 00:00:00 2001 From: Nhat Duong Date: Mon, 13 Jan 2020 16:00:04 -0500 Subject: [PATCH 5/6] Update analyses/copy_number_consensus_call/Snakefile Co-Authored-By: jashapiro --- analyses/copy_number_consensus_call/Snakefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile index e642557846..902b49c1ac 100644 --- a/analyses/copy_number_consensus_call/Snakefile +++ b/analyses/copy_number_consensus_call/Snakefile @@ -202,4 +202,5 @@ rule clean_output: output: "results/cnv_consensus.tsv" shell: - "python3 {input.script} --file {input.cnv_file} > {output}" \ No newline at end of file + "python3 {input.script} --file {input.cnv_file} > {output}" + From 20f32783e4b1b1d151924c05da31ec9c9a99f1f3 Mon Sep 17 00:00:00 2001 From: Duong Date: Mon, 13 Jan 2020 16:02:22 -0500 Subject: [PATCH 6/6] removed get_rid_badsegment.py --- .../src/scripts/get_rid_bad_segments.py | 126 ------------------ 1 file changed, 126 deletions(-) delete mode 100644 analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py diff --git a/analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py b/analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py deleted file mode 100644 index f6f77aecb0..0000000000 --- a/analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py +++ /dev/null @@ -1,126 +0,0 @@ - -################## PURPOSE ################# -# This script is to get rid of unwanted/trouble regions like telomeric, centromeric, and known seg-dups regions -# This script is to take in 2 files -# 1st: A reference/list of bad segments that contains the bad segments and their start/end positions -# 2nd: A list of CNVs that need the bad segments to be filtered out - -################# ASSUMPTION ############### -# It is assumed that the reference file DO NOT have overlapping telomeric/centromeric/segments -# The provided reference file 
"bad_chromosomal_seg_updated_merged.txt" DOES NOT have -# overlaping segments - thus the assumption is not violated. -# If the use of a different reference file is desired, make sure to merge the file first before using this script. - -############################################ - -# Imports in the pep8 order https://www.python.org/dev/peps/pep-0008/#imports -# Standard library -import argparse -import subprocess -import sys -import os -import re - -parser = argparse.ArgumentParser(description="""This script filters out telomeric, - centromeric, and seg-dup regions""") -parser.add_argument('--reference', required=True, - help='path to the reference list of bad segments to be filtered out for') -parser.add_argument('--file', required=True, - help='path to the CNVs file that needs to be filtered') - -args = parser.parse_args() - -#querie_file = sys.argv[2] - -#db_file = sys.argv[1] #bad segment list - - -## Check to see if the CNVs file is empty, if so, make an empty variable -if os.stat(args.file).st_size == 0: - final_content = [] - -## If the file has content, then load in the content -else: - with open (args.file) as file: - - ## Strip the trailing spaces for each line - stripped_content = [i.rstrip() for i in file.readlines()] - - ## Split each line up by any white space between columns - final_content = [re.split('\s+',i) for i in stripped_content] - - ## If the 1st column has '1' instead of 'chr1' for the chromosome number, add in 'chr' - if final_content[0][0].find('chr') == -1: - for c,chromo in enumerate(final_content): - final_content[c][0] = 'chr' + final_content[c][0] - - - -## Works with the assumption that the segments in the file on the next line DO NOT OVERLAP -## Read in the reference file -with open(args.reference) as file: - - ## Strip the trailing spaces for each line - reference_stripped = [i.rstrip() for i in file.readlines()] - - ## Split each line up by any white space between columns - fin_reference_file = [re.split('\s+',i) for i in 
reference_stripped] - - ## If the 1st column has '1' instead of 'chr1' for the chromosome number, add in 'chr' - if fin_reference_file[0][0].find('chr') == -1: - for c,chromo in enumerate(fin_reference_file): - fin_reference_file[c][0] = 'chr' + fin_reference_file[c][0] - - -## Define a variable to hold the final results -fin_list = [] - -## Loop through the CNVs file. -## If any segment in the reference file overlap >50%, then get rid of that CNV -for i in final_content: - - ## Initiate a variable for the total overlapping - overlap = 0 - - ## Get the start/end coordinates of the cnv - start_cnv = int(i[1]) - end_cnv = int(i[2]) - - ## Make sure end > start (end downstream from start) - if start_cnv > end_cnv: - start_cnv, end_cnv = end_cnv, start_cnv - - ## For each CNV, loop through the entire reference file - for j in fin_reference_file: - start_ref = int(j[1]) - end_ref = int(j[2]) - - ## Make sure end > start - if start_ref > end_ref: - start_ref, end_ref = end_ref, start_ref - - ## If the chromosome number matches and the CNV segment overlaps with the reference segment - if i[0] == j[0] and start_cnv <= end_ref and end_cnv >= start_ref: - - ## Get the overlapping coordinate - start = max(start_cnv,start_ref) - end = min(end_cnv,end_ref) - - ## Calculate the overlapping ratio - ## If the CNV overlaps with many segments in the reference file - ## We add the overlappting ratios -> This works due to the assumption on line 8 - overlap += (end - start + 1 )/(end_cnv-start_cnv + 1) - - - ## If the overall overlapping ratio is smaller than 50%, we add the CNVs to a final list - if overlap <= 0.5: - fin_list = fin_list + [i] - - -## Print the list to STDOUT -> Python print() function prints to STDOUT by default. -try: - for k in fin_list: - print('\t'.join(k)) - sys.stderr.write('$$$ Filtering was successful\n') -except: - sys.stderr.write('!!! Filtering was not successful\n')