-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcnv_vcf_AF.py
74 lines (63 loc) · 2.87 KB
/
cnv_vcf_AF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import vcf
import os
class cnv_AF:
def __init__(self,cnv_directory):
self.cnvD = cnv_directory
self.outputFolder = "/home/gnks/Desktop/sevda/allele_frequency"
#self.cnvs = self.get_cnvs()
#self.commandMaker()
#self.count_frequency()
self.to_bed()
def get_cnvs(self):
cnvs_list=[] # creating an empty list to get cnv vcfs
for i in os.listdir(self.cnvD): # iterate through the cnv_directory
#gets list of cnvs:
if i.endswith('cnv.vcf.gz'):
#id = os.path.basename(i)[:-11]
path = os.path.join(self.cnvD, i)
cnvs_list.append(path)
return cnvs_list
def commandMaker(self):
mergeCommand = "bcftools merge " + " ".join(self.cnvs) + " > output_new.txt"
os.system(mergeCommand)
def count_frequency(self):
vcf_reader = vcf.Reader(open('output_new.txt', 'r'))
num_samples = len(vcf_reader.samples)
#print(num_samples)
# Create a dictionary to store the counts of mutations for each variant
#variant_counts = {}
# Create a new VCF writer
vcf_writer = vcf.Writer(open('output_with_info_FINAL.txt', 'w'), vcf_reader)
print(vcf_writer)
# Iterate through each record in the VCF file
for record in vcf_reader:
# Get the variant key (chromosome and position)
#variant_key = f"{record.CHROM}:{record.POS}"
# Initialize the count for this variant to 0
#variant_counts[variant_key] = 0
count = 0
# Iterate through each sample in the record
for sample in record.samples:
# Get the GT value for the sample
gt_value = sample['GT']
# If the GT value is not "./." or "." (i.e. not a reference genotype or missing), increment the variant count
if gt_value != './.' and gt_value != '.':
count += 1
# Add the new INFO fields to the record
record.add_info("TM", count)
record.add_info("SN", num_samples)
record.add_info("AF", round(count / num_samples, 6))
# Write the updated record to the new VCF file
vcf_writer.write_record(record)
vcf_writer.close()
def to_bed(self):
with open('output_with_info_FINAL.txt', 'r') as vcf_file, open('cnv.bed', 'w') as bed_file:
for line in vcf_file:
if line.startswith("#"):
continue
fields = line.strip().split("\t")
chrom, start = fields[0], fields[1]
af = fields[7].split("AF=")[1].split(";")[0]
end = fields[7].split("END=")[1].split(";")[0]
bed_file.write(f"{chrom}\t{start}\t{end}\t{af}\n")
cnv_AF("/home/gnks/Desktop/sevda/allele_frequency")