-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSNP_Locator.py
87 lines (60 loc) · 1.91 KB
/
SNP_Locator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
#This code takes a .FASTA that lists both the query and hits outputs from
#Tassel 3.0, locates the bp where the SNP is located and returns
#this information and the # of bp either side of the SNP
with open('HapMap.fas.txt') as f:
SNPList = [line.rstrip() for line in f]
#This separates the input into the sequence data and the fasta names
#relies on the fact that they come one after the other in the input file
Names = SNPList[0:len(SNPList):2]
Sequence = SNPList[1:len(SNPList):2]
#Merge the two lists into a single dictionary, thereby pairing sequence data with its ID
FASTApair = dict(zip(Names, Sequence))
#Empty dict to start
query = {}
hit = {}
for key in list(FASTApair.keys()):
if key[-9:-2] == '_query_':
name = key[:-9]
seq = FASTApair[key]
query[name] = seq
elif key[-7:-2] == '_hit_':
name = key[:-7]
seq = FASTApair[key]
hit[name] = seq
else:
print('bad SNP name up in this joint:', key)
# Now have two dictionaries, one with hits and one with queries
# can compare the values in the two to see where the SNP difference is
SNP_locations = []
for key in list(query.keys()):
SNP = key
query_seq = query[key]
hit_seq = hit[key]
pos = 1
while query_seq != '':
if query_seq[0] != hit_seq[0]:
SNP_locations.append([SNP,pos,query_seq[0],hit_seq[0]])
break
elif query_seq[0] == hit_seq[0]:
query_seq = query_seq[1:]
hit_seq = hit_seq[1:]
pos += 1
continue
#cleanup, turn into a series of strings, and adding \n
SNP_locations.sort()
Final_List = []
for x in SNP_locations:
name = x[0]
position = str(x[1])
allele1 = x[2]
allele2 = x[3]
z = '%s,%s,%s,%s' % (name, position, allele1, allele2)
Final_List.append(z)
header ='SNP,bp_location,query_allele,hit_allele\n'
output = '\n'.join([str(x) for x in Final_List])
#Print the output with a header
file = open('List_SNP_locations.txt','w')
file.write(header)
file.write(output)
file.close()