diff --git a/analyses/copy_number_consensus_call/Snakefile b/analyses/copy_number_consensus_call/Snakefile index a50eb0c06b..902b49c1ac 100644 --- a/analyses/copy_number_consensus_call/Snakefile +++ b/analyses/copy_number_consensus_call/Snakefile @@ -94,7 +94,6 @@ rule manta_filter: rule filter_telomeres: input: ## Define the location of the input file and take the path/extension from the config file - script=os.path.join(config["scripts"], "get_rid_bad_segments.py"), bad_list=os.path.join(config["scripts"], "bad_chromosomal_seg_updated_merged.bed"), bedfile="../../scratch/interim/{sample}.{caller}.{dupdel}.bed" output: @@ -102,8 +101,8 @@ rule filter_telomeres: filtered_bed="../../scratch/interim/{sample}.{caller}.{dupdel}.filtered.bed" threads: 1 shell: - ## Invoke the python3 script and pass in the reference and CNVs files. Direct the stdout to a new file. - "python3 {input.script} --reference {input.bad_list} --file {input.bedfile} > {output.filtered_bed}" + ## Invoke the bedtools subtract pass in the reference and CNVs files. Direct the stdout to a new file. + "bedtools subtract -N -a {input.bedfile} -b {input.bad_list} -f 0.5 > {output.filtered_bed}" rule first_merge: @@ -204,3 +203,4 @@ rule clean_output: "results/cnv_consensus.tsv" shell: "python3 {input.script} --file {input.cnv_file} > {output}" + diff --git a/analyses/copy_number_consensus_call/results/cnv_consensus.tsv b/analyses/copy_number_consensus_call/results/cnv_consensus.tsv index f46a941b3c..8f82c6f4df 100644 --- a/analyses/copy_number_consensus_call/results/cnv_consensus.tsv +++ b/analyses/copy_number_consensus_call/results/cnv_consensus.tsv @@ -3167,7 +3167,7 @@ chr3 84952630 87955725 NULL 84950794:86911163:3,86911164:88297478:3 84952630:861 chr3 93649137 162808158 NULL 93649137:144671157:3,144671158:148925621:3,148925622:162808158:3 93512945:94627725:3,94627725:94631200:3,94631200:94770200:3,94770895:96420825:3,96421520:96702995:3,96702995:96706470:3,96706470:98244505:3,98244505:98249370:3,98249370:99072250:3,99072250:99227235:3,99227235:99343995:3,99343995:99347470:3,99347470:100430280:3,100430280:100435145:3,100435145:100795850:3,100795850:100827820:3,100827820:102084380:3,102085075:102991355:3,102991355:103832305:3,103833000:103955320:3,103955320:103981730:3,103981730:104394560:3,104395950:104569005:3,104569005:105774830:3,105775525:106378090:3,106378785:108032885:3,108034970:108318530:3,108319225:109156005:3,109156005:109263730:3,109266510:110058810:3,110058810:110062980:3,110062980:110371560:3,110371560:110794815:3,110796900:111147875:3,111147875:111152740:3,111152740:111361240:3,111361240:111872760:3,111872760:112468375:3,112471155:112716490:3,112717880:114292055:3,114292055:115048910:3,115048910:115053775:3,115053775:116300605:3,116301995:116589030:3,116589725:118575340:3,118575340:118686540:3,118689320:119395440:3,119395440:119533745:3,119535135:120095305:3,120095305:120916795:3,120917490:121556195:3,121556195:121561060:3,121561060:123653705:3,123653705:123659265:3,123659265:124059585:3,124061670:124968645:3,124968645:125040230:3,125040925:125204250:3,125204250:125414835:3,125414835:125418310:3,125418310:125664340:3,125871450:126530310:3,126530310:126677650:3,126677650:126696415:3,126696415:126969550:3,126970245:127350410:3,127350410:129039260:3,129041345:129350620:3,129357570:129583445:3,129584140:130044925:3,130079675:130387560:3,130388950:134391455:3,134391455:134415085:3,134415085:135905165:3,135905165:136818395:3,136818395:136947665:3,136947665:136952530:3,136952530:137153385:3,137153385:137185355:3,137185355:137791395:3,137792090:138124995:3,138125690:138409945:3,138411335:139671370:3,139672065:139769365:3,139769365:141656290:3,141656290:142283875:3,142283875:142694620:3,142694620:142698790:3,142698790:144671895:3,144671895:145747755:3,145750535:146042435:3,146042435:146230780:3,146232170:146367695:3,146367695:148630615:3,148630615:148637565:3,148638260:148773090:3,148773090:153836165:3,153836860:154109995:3,154116945:156622420:3,156622420:157090155:3,157092935:158107635:3,158108330:158594830:3,158594830:159205040:3,159205040:159214075:3,159214075:159437865:3,159437865:159444815:3,159444815:162351305:3,162352000:162447910:3,162447910:162512545:3,162512545:164365415:3,164365415:164475920:3,164477310:164801875:3,164803265:165132695:3,165132695:166291955:3,166294735:166587330:3,166587330:166592890:3,166592890:167084255:3,167084255:167358085:3,167359475:171863770:3,171863770:171867245:3 DUP BS_WY5KYSHJ chr3 171969410 195966153 NULL 162827473:195616354:3,195616355:195730092:5,195730093:195966153:3 171969410:172147330:3,172252970:172281465:3,172281465:173296860:3,173297555:173729150:3,173729150:175556305:3,175557695:175720325:3,175723105:176335400:3,176344435:184803280:3,184805365:185514960:3,185514960:186474060:3,186474755:186773605:3,186773605:188897525:3,188903085:189042780:3,189043475:189265180:3,189270045:189441710:3,189442405:190349380:3,190352160:190477260:3,190477260:190483515:3,190483515:191922165:3,191922860:192849295:3,192850685:193417110:3,193418500:194543705:3,194544400:195148355:3,195151135:195247045:3,195247045:196596040:3,196596735:198156315:3 DUP BS_WY5KYSHJ chr1 18423247 18601943 18422714:18428084:NA,18427821:18601943:NA 18423247:18602345:3 NULL DUP BS_X1DSETVE -chr1 149743647 207527447 147699786:224374617:NA,151192918:224378395:NA,154279508:156053291:NA,154283619:156050650:NA,154283619:156053365:NA,156051660:156078863:NA,186958658:189414983:NA,223721204:223740662:NA 149743647:151178440:3,151178441:151392366:3,151392367:152147573:3,152147574:152217223:3,152217224:152351549:3,152351550:152361499:3,152361500:153302771:3,154767417:155000246:3,155079848:155216162:3,155662919:169254659:3,169277545:206163298:3,206409065:207527447:3 NULL DUP BS_X1DSETVE +chr1 149743647 207527447 147699786:224374617:NA,151192918:224378395:NA,154279508:156053291:NA,154283619:156053365:NA,154283619:156050650:NA,156051660:156078863:NA,186958658:189414983:NA,223721204:223740662:NA 149743647:151178440:3,151178441:151392366:3,151392367:152147573:3,152147574:152217223:3,152217224:152351549:3,152351550:152361499:3,152361500:153302771:3,154767417:155000246:3,155079848:155216162:3,155662919:169254659:3,169277545:206163298:3,206409065:207527447:3 NULL DUP BS_X1DSETVE chr2 162345760 162382530 162345760:162382854:NA 162345716:162382530:3 NULL DUP BS_X1DSETVE chr20 15789213 15802607 15789213:15803019:NA 15788678:15802607:3 NULL DUP BS_X1DSETVE chr20 38794600 38805566 38794600:38805943:NA 38791637:38805566:3 NULL DUP BS_X1DSETVE diff --git a/analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py b/analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py deleted file mode 100644 index f6f77aecb0..0000000000 --- a/analyses/copy_number_consensus_call/src/scripts/get_rid_bad_segments.py +++ /dev/null @@ -1,126 +0,0 @@ - -################## PURPOSE ################# -# This script is to get rid of unwanted/trouble regions like telomeric, centromeric, and known seg-dups regions -# This script is to take in 2 files -# 1st: A reference/list of bad segments that contains the bad segments and their start/end positions -# 2nd: A list of CNVs that need the bad segments to be filtered out - -################# ASSUMPTION ############### -# It is assumed that the reference file DO NOT have overlapping telomeric/centromeric/segments -# The provided reference file "bad_chromosomal_seg_updated_merged.txt" DOES NOT have -# overlaping segments - thus the assumption is not violated. -# If the use of a different reference file is desired, make sure to merge the file first before using this script. - -############################################ - -# Imports in the pep8 order https://www.python.org/dev/peps/pep-0008/#imports -# Standard library -import argparse -import subprocess -import sys -import os -import re - -parser = argparse.ArgumentParser(description="""This script filters out telomeric, - centromeric, and seg-dup regions""") -parser.add_argument('--reference', required=True, - help='path to the reference list of bad segments to be filtered out for') -parser.add_argument('--file', required=True, - help='path to the CNVs file that needs to be filtered') - -args = parser.parse_args() - -#querie_file = sys.argv[2] - -#db_file = sys.argv[1] #bad segment list - - -## Check to see if the CNVs file is empty, if so, make an empty variable -if os.stat(args.file).st_size == 0: - final_content = [] - -## If the file has content, then load in the content -else: - with open (args.file) as file: - - ## Strip the trailing spaces for each line - stripped_content = [i.rstrip() for i in file.readlines()] - - ## Split each line up by any white space between columns - final_content = [re.split('\s+',i) for i in stripped_content] - - ## If the 1st column has '1' instead of 'chr1' for the chromosome number, add in 'chr' - if final_content[0][0].find('chr') == -1: - for c,chromo in enumerate(final_content): - final_content[c][0] = 'chr' + final_content[c][0] - - - -## Works with the assumption that the segments in the file on the next line DO NOT OVERLAP -## Read in the reference file -with open(args.reference) as file: - - ## Strip the trailing spaces for each line - reference_stripped = [i.rstrip() for i in file.readlines()] - - ## Split each line up by any white space between columns - fin_reference_file = [re.split('\s+',i) for i in reference_stripped] - - ## If the 1st column has '1' instead of 'chr1' for the chromosome number, add in 'chr' - if fin_reference_file[0][0].find('chr') == -1: - for c,chromo in enumerate(fin_reference_file): - fin_reference_file[c][0] = 'chr' + fin_reference_file[c][0] - - -## Define a variable to hold the final results -fin_list = [] - -## Loop through the CNVs file. -## If any segment in the reference file overlap >50%, then get rid of that CNV -for i in final_content: - - ## Initiate a variable for the total overlapping - overlap = 0 - - ## Get the start/end coordinates of the cnv - start_cnv = int(i[1]) - end_cnv = int(i[2]) - - ## Make sure end > start (end downstream from start) - if start_cnv > end_cnv: - start_cnv, end_cnv = end_cnv, start_cnv - - ## For each CNV, loop through the entire reference file - for j in fin_reference_file: - start_ref = int(j[1]) - end_ref = int(j[2]) - - ## Make sure end > start - if start_ref > end_ref: - start_ref, end_ref = end_ref, start_ref - - ## If the chromosome number matches and the CNV segment overlaps with the reference segment - if i[0] == j[0] and start_cnv <= end_ref and end_cnv >= start_ref: - - ## Get the overlapping coordinate - start = max(start_cnv,start_ref) - end = min(end_cnv,end_ref) - - ## Calculate the overlapping ratio - ## If the CNV overlaps with many segments in the reference file - ## We add the overlappting ratios -> This works due to the assumption on line 8 - overlap += (end - start + 1 )/(end_cnv-start_cnv + 1) - - - ## If the overall overlapping ratio is smaller than 50%, we add the CNVs to a final list - if overlap <= 0.5: - fin_list = fin_list + [i] - - -## Print the list to STDOUT -> Python print() function prints to STDOUT by default. -try: - for k in fin_list: - print('\t'.join(k)) - sys.stderr.write('$$$ Filtering was successful\n') -except: - sys.stderr.write('!!! Filtering was not successful\n')