forked from bioinform/somaticseq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscore_Somatic.Variants.py
executable file
·771 lines (543 loc) · 37.5 KB
/
score_Somatic.Variants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
#!/usr/bin/env python3
# Add support for single-sample mode
# Extract more COSMIC features
# AD field now assumes Broad's convention
# 8/30/2015
# Li Tai Fang
import sys, os, argparse, gzip, math, itertools
import regex as re
import genomic_file_handlers as genome
# argparse Stuff
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-indications', '--cancer-indications', type=str, help='Output VCF file', nargs='*', required=False, default=None)
parser.add_argument('-mincaller', '--minimum-callers', type=int, help='Minimum number of callers identify it as a SOMATIC for the site to be emitted', required=False, default=0)
parser.add_argument('-infile', '--input-vcf', type=str, help='Input VCF file', required=True, default=None)
parser.add_argument('-outfile', '--output-vcf', type=str, help='Output VCF file', required=True, default=None)
parser.add_argument('-genes', '--cancer_gene_census', type=str, help='A file where the first column contain list of cancer census genes', required=False)
parser.add_argument('-cntlo' , '--low-threshold-CNT', type=int, help='Lowest threshold for COSMIC +2 scoring', default=3)
parser.add_argument('-cntmed' , '--medium-threshold-CNT', type=int, help='Medium threshold for COSMIC +3 scoring', default=5)
parser.add_argument('-cnthi' , '--high-threshold-CNT', type=int, help='Highest threshold for COSMIC +5 scoring', default=10)
parser.add_argument('-cntex' , '--extreme-threshold-CNT', type=int, help='Highest threshold for COSMIC +6 scoring', default=100)
parser.add_argument('-cntri' , '--ridiculous-threshold-CNT', type=int, help='Highest threshold for COSMIC +7 scoring', default=1000)
parser.add_argument('-caf' , '--high-threshold-CAF', type=float, help='Lowest threshold for common SNP -3 penalty', default=0.01)
parser.add_argument('-MSC', '--minor-strand-count', type=int, help='Minor Strand Count, i.e,. filtering strand bias', default=1)
parser.add_argument('-DP' , '--depth-of-coverage', type=int, help='Total Depth of Coverage', default=15)
parser.add_argument('-ALF', '--alternative-allele-freq', type=float, help='Alternative Allele Frequency', default=0.10)
parser.add_argument('-NAF', '--normal-alternative-freq', type=float, help='Normal Alternative Frequency (Upper Bound)', default=0.05)
parser.add_argument('-N', '--normal-sample-name', type=str, help='Normal Sample Name', required=False, default='NORMAL')
parser.add_argument('-T', '--tumor-sample-name', type=str, help='Tumor Sample Name', required=False, default='TUMOR')
parser.add_argument('-pass_score', '--oncoscore-threshold-pass', type=int, help='Minimum OncoScore that generates a PASS filter', default=5)
parser.add_argument('-lowconf_score', '--oncoscore-threshold-lowconfidence', type=int, help='Minimum OncoScore that generates a PASS filter', default=3)
parser.add_argument('-tools', '--individual-mutation-tools', type=str, help='A list of all tools: have to match the annotated tool name in the input vcf files', nargs='*', required=False, default=('CGA', 'VarScan2', 'JointSNVMix2', 'SomaticSniper', 'VarDict', 'MuSE'))
args = parser.parse_args()
# Rename input arguments:
tools = args.individual_mutation_tools
num_tools = len(tools)
min_tools = args.minimum_callers
assert args.oncoscore_threshold_pass > args.oncoscore_threshold_lowconfidence
# The regular expression pattern for "chrXX 1234567" in both VarScan2 Output and VCF files:
pattern_chr_position = re.compile(r'^(?:chr)?(?:[0-9]+|[XYM]|MT)\t[0-9]+\b')
pattern_COSM = re.compile(r'COS[MN][0-9]+')
pattern_rsID = re.compile(r'rs[0-9]+')
pattern_CAF = re.compile(r'\[[0-9.,]+\]')
# snpEFF's fields
# EFF= Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_Length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] ),
# An example:
#EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cGt/cAt|R229H|246|SLC35E2||CODING|NM_001199787.1|6|1),NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|cGt/cAt|R229H|266|SLC35E2||CODING|NM_182838.2|5|1)'
pattern_EFF = re.compile(r'(EFF=|ANN=)?(?<Effect>[^(]+)\((?<Effect_Impact>[^|]*)\|(?<Functional_Class>[^|]*)\|(?<Codon_Change>[^|]*)\|(?<Amino_Acid_Change>[^|]*)\|(?<Amino_Acid_Length>[^|]*)\|(?<Gene_Name>[^|]*)\|(?<Transcript_BioType>[^|]*)\|(?<Gene_Coding>[^|]*)\|(?<Transcript_ID>[^|]*)\|(?<Exon_Rank>[^|]*)\|(?<Genotype_Number>[^|]*)?.*\)' )
if not args.cancer_indications:
print('ALERT: Cancer indications were empty. None will be scored for it!!!', file=sys.stderr)
# Obtain the cancer gene census set:
cancer_gene_set = {}
if args.cancer_gene_census:
with open(args.cancer_gene_census) as cancer_gene_file:
line_i = cancer_gene_file.readline().rstrip() # Skip first line
line_i = cancer_gene_file.readline().rstrip()
while line_i:
item_i = line_i.split('\t')
# 0th column is gene symbol. 5th column is indications:
cancer_gene_set[ item_i[0] ] = item_i[5]
line_i = cancer_gene_file.readline().rstrip()
# Maximum scores for subEvidence and subKnowledge
max_subEvidence = 9
max_subKnowledge = 9
max_scale = 10
# All possible combination of tools:
MVJS_combinations = {}
idx_combo_total, idx_combo_dbsnp, idx_combo_common, idx_combo_cosmic = 0, 1, 2, 3
for combo_i in itertools.product( (True, False), repeat = len(tools) ):
# The four zeros represent [Total, dbsnp, COMMON, COSMIC]
MVJS_combinations[combo_i] = [0, 0, 0, 0]
# Keeping a tab on all those scores
bina_score_tally = {}
subscore_evidence_tally = {}
bonus_knowledge_tally = {}
penalty_knowledge_tally = {}
num_methods_tally = {}
with genome.open_textfile(args.input_vcf) as vcf, open(args.output_vcf, 'w') as vcf_out:
line_i = vcf.readline().rstrip()
while line_i.startswith('#'):
# Read thru the headers and metadata:
if line_i.startswith('#CHROM'):
header_item = line_i.split('\t')
if len(header_item) == 11:
paired_mode = True
idxN, idxT = 0,1
elif len(header_item) == 10:
paired_mode = False
idxT = 0
else:
raise Exception('The number of samples is neither 1 nor 2')
line_i = vcf.readline().rstrip()
# Write the headers and meta-data into the output vcf file:
vcf_out.write('##fileformat=VCFv4.1\n')
vcf_out.write('##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation with ONCOSCORE at least {}">\n'.format(args.oncoscore_threshold_pass) )
vcf_out.write('##FILTER=<ID=LowQual,Description="Lower confident somatic mutation with ONCOSCORE at least between {} and {}">\n'.format(args.oncoscore_threshold_lowconfidence ,args.oncoscore_threshold_pass-1) )
vcf_out.write('##FILTER=<ID=REJECT,Description="Rejected as a confident somatic mutation with ONCOSCORE below {}">\n'.format(args.oncoscore_threshold_lowconfidence) )
vcf_out.write('##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Passes as a SOMATIC Variant">\n')
vcf_out.write('##INFO=<ID=COMMON,Number=1,Type=Integer,Description="RS is a common SNP. A common SNP is one that has at least one 1000Genomes population with a minor allele of frequency >= 1% and for which 2 or more founders contribute to that minor allele frequency.">\n')
vcf_out.write('##INFO=<ID=CAF,Number=.,Type=String,Description="An ordered, comma delimited list of allele frequencies based on 1000Genomes, starting with the reference allele followed by alternate alleles as ordered in the ALT column. Where a 1000Genomes alternate allele is not in the dbSNPs alternate allele set, the allele is added to the ALT column. The minor allele is the second largest value in the list, and was previuosly reported in VCF as the GMAF. This is the GMAF reported on the RefSNP and EntrezSNP pages and VariationReporter">\n')
vcf_out.write('##INFO=<ID=G5,Number=0,Type=Flag,Description=">5% minor allele frequency in 1+ populations">\n')
vcf_out.write('##INFO=<ID=G5A,Number=0,Type=Flag,Description=">5% minor allele frequency in each and all populations">\n')
vcf_out.write('##INFO=<ID=AA,Number=1,Type=String,Description="Peptide annotation">\n')
vcf_out.write('##INFO=<ID=CDS,Number=1,Type=String,Description="CDS annotation">\n')
vcf_out.write('##INFO=<ID=GENE,Number=1,Type=String,Description="Gene name">\n')
vcf_out.write('##INFO=<ID=MUT,Number=0,Type=Flag,Description="Is mutation (journal citation, explicit fact): a low frequency variation that is cited in journal and other reputable sources">\n')
vcf_out.write('##INFO=<ID=CNT,Number=1,Type=Integer,Description="How many samples have this mutation">\n')
vcf_out.write('##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: \'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype_Number [ | ERRORS | WARNINGS ] )\'">\n')
vcf_out.write('##INFO=<ID=subEvidence,Number=1,Type=Float,Description="subScore based on number of somatic mutation callers">\n')
vcf_out.write('##INFO=<ID=subKnowledge,Number=2,Type=Float,Description="First number is Bonus subScore based on COSMIC and Cancer Gene Census membership plus SIFT prediction. Second number is penalty based on dbSNP membership and COMMON annotation.">\n')
vcf_out.write('##INFO=<ID=SOURCES,Number=.,Type=String,Description="The Somatic Mutation Callers that called it">\n')
vcf_out.write('##INFO=<ID=NUM_SMMETHODS,Number=1,Type=Integer,Description="Identified by number of Somatic Mutation Callers">\n')
vcf_out.write('##INFO=<ID=OncoRK,Number=1,Type=Float,Description="BINA Somatic Mutation Score">\n')
vcf_out.write('##INFO=<ID=LSEQ,Number=1,Type=String,Description="5\' flanking seq">\n')
vcf_out.write('##INFO=<ID=RSEQ,Number=1,Type=String,Description="3\' flanking seq">\n')
vcf_out.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
vcf_out.write('##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">\n')
vcf_out.write('##FORMAT=<ID=SC,Number=4,Type=Integer,Description="Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)">\n')
vcf_out.write('##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">\n')
vcf_out.write('##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">\n')
vcf_out.write('##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)">\n')
vcf_out.write('##FORMAT=<ID=AMQ,Number=.,Type=Integer,Description="Average mapping quality for each allele present in the genotype">\n')
vcf_out.write('##FORMAT=<ID=BQ,Number=.,Type=Float,Description="Average base quality for reads supporting alleles">\n')
vcf_out.write('##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Average mapping quality across all reads">\n')
vcf_out.write('##FORMAT=<ID=VAQ,Number=1,Type=Integer,Description="Variant allele quality">\n')
vcf_out.write('##FORMAT=<ID=SHIFT3,Number=1,Type=Integer,Description="No. of bases to be shifted to 3 prime for deletions due to alternative alignment">\n')
vcf_out.write('##FORMAT=<ID=MSI,Number=1,Type=Float,Description="MicroSattelite. > 1 indicates MSI">\n')
vcf_out.write('##FORMAT=<ID=MSILEN,Number=1,Type=Float,Description="MSI unit repeat length in bp">\n')
vcf_out.write('##FORMAT=<ID=NM,Number=1,Type=Float,Description="Mean mismatches in reads">\n')
vcf_out.write('##FORMAT=<ID=PMEAN,Number=1,Type=Float,Description="Mean position in reads">\n')
vcf_out.write('##FORMAT=<ID=PSTD,Number=1,Type=String,Description="Position STD in reads. Use string because can be NA.">\n')
vcf_out.write('##FORMAT=<ID=QSTD,Number=1,Type=String,Description="Quality score STD in reads. Use string because it can be NA.">\n')
vcf_out.write('##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="Mean quality score in reads">\n')
vcf_out.write('##FORMAT=<ID=SOR,Number=1,Type=String,Description="Odds ratio, could be Inf">\n')
vcf_out.write('##FORMAT=<ID=SSSC,Number=1,Type=Integer,Description="SomaticSniper reported Somatic Score">\n')
vcf_out.write('##FORMAT=<ID=VSSC,Number=1,Type=Integer,Description="VarScan reported Somatic Score">\n')
vcf_out.write('##FORMAT=<ID=JSSC,Number=1,Type=Float,Description="JointSNVMix reported somatic probability">\n')
vcf_out.write('##FORMAT=<ID=DSSF,Number=1,Type=Float,Description="VarDict reported variant p-value">\n')
# Main Header:
try:
if paired_mode:
vcf_out.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\t{}\n'.format(args.normal_sample_name, args.tumor_sample_name) )
elif not paired_mode:
vcf_out.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n'.format(args.tumor_sample_name) )
except NameError:
vcf_out.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n' )
# Start calculating scores:
while line_i:
vcf_i = genome.Vcf_line( line_i )
# Initialize stuff:
bina_score = 0
subscore_evidence = 0
bonus_knowledge = 0
penalty_knowledge = 0
snpEff_scores = [0,]
# Apply scoring "matrix" here:
info_item = vcf_i.get_info_items()
### Process the callers (score between 0 to +4):
combo_i = [ vcf_i.get_info_value( tool_i ) for tool_i in tools ]
combo_i = tuple(combo_i)
MVJS_combinations[combo_i][idx_combo_total] += 1
num_smmethods = sum(combo_i)
# Evidence-based score: number of tools:
# To make indel callers and snp caller the same scale:scored.snp2.vcf
subscore_callers = num_smmethods*(5/num_tools)
# Proceed only if there is AT LEAST "min_tools" CALLER calling it SOMATIC:
if num_smmethods >= min_tools:
subscore_filter = 0
# DP4 (SC for Indellocator) has strand info and counts info, so highest precedence. Grab those if available.
got_dp4, got_sc = 'DP4' in vcf_i.get_sample_variable(), 'SC' in vcf_i.get_sample_variable()
if (got_dp4 or got_sc):
probe_dp4 = 'DP4' if got_dp4 else 'SC'
tumor_ref_for,tumor_ref_rev,tumor_alt_for,tumor_alt_rev = vcf_i.get_sample_value(probe_dp4, idxT).split(',')
tumor_ref_for = int( tumor_ref_for )
tumor_ref_rev = int( tumor_ref_rev )
tumor_alt_for = int( tumor_alt_for )
tumor_alt_rev = int( tumor_alt_rev )
tumor_alt_total = tumor_alt_for + tumor_alt_rev
tumor_coverage_depth = sum( (tumor_ref_for, tumor_ref_rev, tumor_alt_for, tumor_alt_rev) )
# For subScoring:
try:
tumor_alt_allele_freq = tumor_alt_total/tumor_coverage_depth
except ZeroDivisionError:
tumor_alt_allele_freq = 0
minor_strand_count = min(tumor_alt_for, tumor_alt_rev)
if paired_mode:
normal_ref_for,normal_ref_rev,normal_alt_for,normal_alt_rev = vcf_i.get_sample_value(probe_dp4, idxN).split(',')
normal_ref_for = int( normal_ref_for )
normal_ref_rev = int( normal_ref_rev )
normal_alt_for = int( normal_alt_for )
normal_alt_rev = int( normal_alt_rev )
normal_alt_total = normal_alt_for + normal_alt_rev
normal_coverage_depth = sum( (normal_ref_for, normal_ref_rev, normal_alt_for, normal_alt_rev) )
try:
normal_alt_allele_freq = normal_alt_total/normal_coverage_depth
except ZeroDivisionError:
normal_alt_allele_freq = 1
# MuTect and JointSNVMix2 has no strand info, so grab those if DP4/SC doesn't exist.
# Following Broad's convention, the first number in AD is refernece, and the followings are alternative. We'll take the 2nd number as the alternative read counts.
elif 'AD' in vcf_i.get_sample_variable():
try:
tumor_alt_total = int( vcf_i.get_sample_value('AD', idxT).split(',')[1] )
tumor_ref_total = int( vcf_i.get_sample_value('AD', idxT).split(',')[0] )
# For subScoring:
tumor_coverage_depth = tumor_alt_total + tumor_ref_total
try:
tumor_alt_allele_freq = tumor_alt_total/tumor_coverage_depth
except ZeroDivisionError:
tumor_alt_allele_freq = 0
except ValueError:
tumor_alt_allele_freq = 0
if paired_mode:
try:
normal_alt_total = int( vcf_i.get_sample_value('AD', idxN).split(',')[1] )
normal_ref_total = int( vcf_i.get_sample_value('AD', idxN).split(',')[0] )
try:
normal_alt_allele_freq = normal_alt_total/(normal_alt_total + normal_ref_total)
except ZeroDivisionError:
normal_alt_allele_freq = 0
except ValueError:
tumor_alt_allele_freq = 0
# Well then, grab DP and see if either RD or AD exists (not both, otherwise it wouldn't get there).
elif 'DP' in vcf_i.get_sample_variable():
tumor_coverage_depth = int( vcf_i.get_sample_value('DP', idxT) )
if paired_mode:
normal_coverage_depth = int( vcf_i.get_sample_value('DP', idxN) )
if 'RD' in vcf_i.get_sample_variable():
tumor_ref_total = int( vcf_i.get_sample_value('RD', idxT) )
tumor_alt_total = tumor_coverage_depth - tumor_ref_total
try:
tumor_alt_allele_freq = tumor_alt_total/tumor_coverage_depth
except ZeroDivisionError:
tumor_alt_allele_freq = 0
if paired_mode:
normal_ref_total = int( vcf_i.get_sample_value('RD', idxN) )
normal_alt_total = normal_coverage_depth - normal_ref_total
try:
normal_alt_allele_freq = normal_alt_total/normal_coverage_depth
except ZeroDivisionError:
normal_alt_allele_freq = 0
else:
tumor_alt_allele_freq = 0
normal_alt_allele_freq = 0
### myFilter Sub-Scoring
### Because some metrics in Filter are not always run, so make sure to delete them after each iteration.
try:
if tumor_alt_allele_freq >= 2*args.alternative_allele_freq:
subscore_filter = subscore_filter + 1
elif tumor_alt_allele_freq >= args.alternative_allele_freq:
subscore_filter = subscore_filter + .5
del tumor_alt_allele_freq
except NameError:
pass
#
try:
if minor_strand_count >= 2*args.minor_strand_count:
subscore_filter = subscore_filter + 1
elif minor_strand_count >= args.minor_strand_count:
subscore_filter = subscore_filter + .5
elif minor_strand_count == 0:
subscore_filter = subscore_filter - 1
del minor_strand_count
except NameError:
pass
#
try:
if tumor_coverage_depth >= 2*args.depth_of_coverage:
subscore_filter = subscore_filter + 1
elif tumor_coverage_depth >= args.depth_of_coverage:
subscore_filter = subscore_filter + .5
del tumor_coverage_depth
except NameError:
pass
#
if paired_mode:
try:
if normal_alt_allele_freq <= 0.5*args.normal_alternative_freq:
subscore_filter = subscore_filter + 1
elif normal_alt_allele_freq <= args.normal_alternative_freq:
subscore_filter = subscore_filter + .5
del normal_alt_allele_freq
except NameError:
pass
### Evidence based score, scale it to 10, and then round to the nearest 0.5.
subscore_evidence = subscore_callers + subscore_filter
subscore_evidence = subscore_evidence * (max_scale / max_subEvidence) # Scale it to 10
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###
### Based on knowledge, i.e., COSMIC, and Cancer Gene Census:
## Cosmic Test (score between 0 to +6):
bonus_knowledge = 0
cosm = re.search( pattern_COSM, vcf_i.identifier )
if cosm:
bonus_knowledge = bonus_knowledge + 1
MVJS_combinations[combo_i][idx_combo_cosmic] += 1
# Additional score based on number of samples
cnt = vcf_i.get_info_value('CNT')
if cnt:
try:
cnt = int( cnt )
except ValueError:
cnt = 0
if cnt >= args.extreme_threshold_CNT:
bonus_knowledge = bonus_knowledge + 5
elif cnt >= args.high_threshold_CNT:
bonus_knowledge = bonus_knowledge + 4
elif cnt >= args.medium_threshold_CNT:
bonus_knowledge = bonus_knowledge + 2
elif cnt >= args.low_threshold_CNT:
bonus_knowledge = bonus_knowledge + 1
### Cancer Gene Census Test (score either 0 or 1):
# Look for Gene Name after snpEff in the EFF field.
snpEff = vcf_i.get_info_value('EFF')
if snpEff:
# Once you find where the EFF field is
# There might be multiple annotations, one for each splice variant or whatever:
snpEFFs = snpEff.split(',')
snpEff_scores = []
for IFF_i in snpEFFs:
snp_score_i = 0
snpeff_info = re.match(pattern_EFF, IFF_i)
# Following are annotated by snpEff:
gene = snpeff_info.groupdict()['Gene_Name']
effect_impact = snpeff_info.groupdict()['Effect_Impact']
if gene in cancer_gene_set.keys():
snp_score_i = snp_score_i + 1
# Only care about the SIFT prediction effect if the gene is in gene cancer census
if effect_impact.upper() == 'MODERATE' or effect_impact.upper() == 'HIGH':
snp_score_i = snp_score_i + 1
if args.cancer_indications:
for indi_i in args.cancer_indications:
if re.search(indi_i, cancer_gene_set[gene], re.I):
snp_score_i = snp_score_i + 1
break
snpEff_scores.append( snp_score_i )
# Break from the loop after done with EFF field:
break
### Knowledge based bonus, scale it to 10, and then round to the nearest 0.5.
bonus_knowledge = bonus_knowledge + max(snpEff_scores)
bonus_knowledge = bonus_knowledge * (max_scale / max_subKnowledge) # Scale it to 10
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###
## dbSNP Test (score between -3 to 0):
penalty_knowledge = 0
dbsnp = re.search(pattern_rsID, vcf_i.identifier)
if dbsnp:
MVJS_combinations[combo_i][idx_combo_dbsnp] += 1
if vcf_i.get_info_value('COMMON') == '1' or vcf_i.get_info_value('G5') or vcf_i.get_info_value('G5A'):
penalty_knowledge = penalty_knowledge - 1
MVJS_combinations[combo_i][idx_combo_common] += 1
caf = vcf_i.get_info_value('CAF')
if caf:
caf_match = re.sub(r'\.([^0-9])', r'0\g<1>', caf )
try:
caf = list( eval(caf_match) )
caf.sort()
maf = sum(caf[0:-1]) # Minor Allele Frequency
if maf >= args.high_threshold_CAF:
penalty_knowledge = penalty_knowledge - 2
except SyntaxError:
pass
# Ignore this metric when ambiguity arises from different rsID being assigned to the same position.
except TypeError:
pass
# Calculate a "BINA_ONCOSCORE" only if at least one of the callers called it a somatic event:
bina_score = 0.6 * subscore_evidence + 0.4 * bonus_knowledge + penalty_knowledge
bina_score = 0.5 * math.ceil( 2 * bina_score ) # Round to nearest 0.5
subscore_evidence = 0.5 * round( 2 * subscore_evidence ) # Round to nearest 0.5
bonus_knowledge = 0.5 * round( 2 * bonus_knowledge ) # Round to nearest 0.5
if bina_score < 0: bina_score = 0
### FILTER Field String Determinant:
if bina_score:
if bina_score >= args.oncoscore_threshold_pass:
vcf_i.filters = 'PASS'
elif bina_score >= args.oncoscore_threshold_lowconfidence:
vcf_i.filters = 'LowQual'
else:
vcf_i.filters = 'REJECT'
else:
vcf_i.filters = 'REJECT'
# Sort the output lines:
## RE-FORMAT INFO and SAMPLE (FORMAT) columns:
new_info_field = []
# Expected to be flags:
for info_item in ('COMMON', 'CAF', 'G5', 'G5A', 'MUT'):
info_value = vcf_i.get_info_value( info_item )
if info_value == 'true' or info_value == '.':
new_info_field.append( info_item )
elif info_value:
new_info_field.append( '{}={}'.format(info_item, info_value) )
# Expected to have values:
for info_item in ('GENE', 'AA', 'CDS', 'CNT', 'EFF', 'LSEQ', 'RSEQ'):
info_value = vcf_i.get_info_value( info_item )
if info_value == 'true':
new_info_field.append( info_item )
elif info_value == '.':
new_info_field.append( '{}={}'.format(info_item, '0') )
elif info_value:
new_info_field.append( '{}={}'.format(info_item, info_value) )
##
# Add entries:
new_info_field.append( 'NUM_SMMETHODS={}'.format( num_smmethods) )
if num_smmethods >= 1:
sources = []
for call_i, tool_i in zip(combo_i, tools):
if call_i:
sources.append( tool_i )
new_info_field.append( 'SOURCES=' + ','.join(sources) )
new_info_field.append( 'subEvidence=' + str(subscore_evidence) )
new_info_field.append( 'subKnowledge={},{}'.format(bonus_knowledge, penalty_knowledge) )
if bina_score:
new_info_field.append( 'OncoRK=' + str(bina_score) )
### Replacing info field with this one:
new_info_column = ';'.join(new_info_field) # Modified INFO FIELD
### Format the sample info:
new_format, new_normal_sample, new_tumor_sample = [], [], []
# Genotype first, as always
if 'GT' in vcf_i.get_sample_item().keys():
new_format.append( 'GT' )
new_tumor_sample.append( vcf_i.get_sample_value('GT', idxT) )
if paired_mode: new_normal_sample.append( vcf_i.get_sample_value('GT', idxN) )
# Followed by DP4, but only if it exists:
if 'DP4' in vcf_i.get_sample_item().keys():
new_format.append( 'DP4' )
new_tumor_sample.append( vcf_i.get_sample_value('DP4', idxT) )
if paired_mode: new_normal_sample.append( vcf_i.get_sample_value('DP4', idxN) )
elif 'SC' in vcf_i.get_sample_item().keys():
new_format.append( 'SC' )
new_tumor_sample.append( vcf_i.get_sample_value('SC', idxT) )
if paired_mode: new_normal_sample.append( vcf_i.get_sample_value('SC', idxN) )
# If no DP4, get the depth info without strand info:
else:
if 'DP' in vcf_i.get_sample_item().keys():
new_format.append( 'DP' )
new_tumor_sample.append( vcf_i.get_sample_value('DP', idxT) )
if paired_mode: new_normal_sample.append( vcf_i.get_sample_value('DP', idxN) )
if 'AD' in vcf_i.get_sample_item().keys():
new_format.append( 'AD' )
new_tumor_sample.append( vcf_i.get_sample_value('AD', idxT) )
if paired_mode: new_normal_sample.append( vcf_i.get_sample_value('AD', idxN) )
if 'RD' in vcf_i.get_sample_item().keys():
new_format.append( 'RD' )
new_tumor_sample.append( vcf_i.get_sample_value('RD', idxT) )
if paired_mode: new_normal_sample.append( vcf_i.get_sample_value('RD', idxN) )
# Remaining stuff
remaining_variables = ('AMQ', 'BQ', 'MQ', 'VAQ', 'NM', 'PMEAN', 'QSTD', 'PSTD', 'QUAL')
for remaining_i in remaining_variables:
if remaining_i in vcf_i.get_sample_item().keys():
new_format.append( remaining_i )
new_tumor_sample.append( vcf_i.get_sample_value(remaining_i, idxT) )
if paired_mode: new_normal_sample.append( vcf_i.get_sample_value(remaining_i, idxN) )
## Move INFO's information to Sample Info:
info2sample_variables = ('SHIFT3', 'MSI', 'MSILEN', 'SOR')
for var_i in info2sample_variables:
value_i = vcf_i.get_info_value( var_i )
if value_i:
new_format.append( var_i )
new_tumor_sample.append( value_i )
if paired_mode: new_normal_sample.append( '.' )
# Get the Somatic Scores to sample information:
# SomaticSniper:
if 'SSC' in vcf_i.get_sample_item().keys():
new_format.append( 'SSSC' )
new_tumor_sample.append( vcf_i.get_sample_value('SSC', idxT) )
if paired_mode: new_normal_sample.append( vcf_i.get_sample_value('SSC', idxN) )
# VarScan2:
vssc = vcf_i.get_info_value('SSC')
if vssc:
new_format.append( 'VSSC' )
new_tumor_sample.append(vssc)
if paired_mode: new_normal_sample.append('.')
# JointSNVMix2:
aaab = vcf_i.get_info_value('AAAB')
aabb = vcf_i.get_info_value('AABB')
if aaab and aabb:
aaab = float(aaab)
aabb = float(aabb)
jssc = str( aaab + aabb )
new_format.append( 'JSSC' )
new_tumor_sample.append(jssc)
if paired_mode: new_normal_sample.append('.')
# VarDict:
dssf = vcf_i.get_info_value('SSF')
if dssf:
new_format.append( 'DSSF' )
new_tumor_sample.append(dssf)
if paired_mode: new_normal_sample.append('.')
# Put together:
new_format_string = ':'.join(new_format)
new_tumor_string = ':'.join(new_tumor_sample)
if paired_mode: new_normal_string = ':'.join(new_normal_sample)
### Write:
if paired_mode:
modified_line = '\t'.join((vcf_i.chromosome, str(vcf_i.position), vcf_i.identifier, vcf_i.refbase, vcf_i.altbase, vcf_i.qual, vcf_i.filters, new_info_column, new_format_string, new_normal_string, new_tumor_string )) + '\n'
else:
modified_line = '\t'.join((vcf_i.chromosome, str(vcf_i.position), vcf_i.identifier, vcf_i.refbase, vcf_i.altbase, vcf_i.qual, vcf_i.filters, new_info_column, new_format_string, new_tumor_string )) + '\n'
vcf_out.write( modified_line )
##### ##### For making histogram ##### #####
try:
bina_score_tally[bina_score] = bina_score_tally[bina_score] + 1
except KeyError:
bina_score_tally[bina_score] = 1
try:
subscore_evidence_tally[subscore_evidence] = subscore_evidence_tally[subscore_evidence] + 1
except KeyError:
subscore_evidence_tally[subscore_evidence] = 1
try:
bonus_knowledge_tally[bonus_knowledge] = bonus_knowledge_tally[bonus_knowledge] + 1
except KeyError:
bonus_knowledge_tally[bonus_knowledge] = 1
try:
penalty_knowledge_tally[penalty_knowledge] = penalty_knowledge_tally[penalty_knowledge] + 1
except KeyError:
penalty_knowledge_tally[penalty_knowledge] = 1
try:
num_methods_tally[num_smmethods] = num_methods_tally[num_smmethods] + 1
except KeyError:
num_methods_tally[num_smmethods] = 1
## Continue:
line_i = vcf.readline().rstrip()
# Print the histogram onto stderr:
print('BINA_OncoRK:')
for score_i in sorted( bina_score_tally.keys() ):
print(score_i, bina_score_tally[score_i], sep='\t')
print('\nNUM_SMMETHODS:')
for score_i in sorted( num_methods_tally.keys() ):
print(score_i, num_methods_tally[score_i], sep='\t')
print('\nsubEvidence:')
for score_i in sorted( subscore_evidence_tally.keys() ):
print(score_i, subscore_evidence_tally[score_i], sep='\t')
print('\nbonusKnowledge:')
for score_i in sorted( bonus_knowledge_tally.keys() ):
print(score_i, bonus_knowledge_tally[score_i], sep='\t')
print('\npenaltyKnowledge:')
for score_i in sorted( penalty_knowledge_tally.keys() ):
print(score_i, penalty_knowledge_tally[score_i], sep='\t')
# Print combinations:
print('')
print('\t'.join(tools), 'Total', 'dbsnp', 'common', 'COSMIC', sep='\t' )
for mvjs_i in sorted( MVJS_combinations.keys() ):
stats_i = MVJS_combinations[mvjs_i]
for i in mvjs_i:
print(i, end='\t')
for i in stats_i:
print(i, end='\t')
print('')