-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgamma-delta.py
executable file
·360 lines (318 loc) · 15 KB
/
gamma-delta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
# gamma-delta.py
# coding=utf-8
#
# Implementation of the gamma-delta algorithm
# This file is part of the https://github.com/LidiaGS/g-d_algorithm.
# Copyright (c) 2019 Universitat Autònoma de Barcelona
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import csv
import argparse
import os
from os.path import isfile, join
import operator
from decimal import Decimal
import datetime
## ARGUMENTS DEFINITION
parser = argparse.ArgumentParser(prog='ARGUMENTS', usage='%(prog)s [options]')
parser.add_argument("-r", "--R", type=str, help="Single-end read sample (FASTQ/FASTA)")
parser.add_argument("-r1", "--R1", type=str, help="Separeted paired-end reads sample, R1 (forward) file (FASTA/FASTQ format)")
parser.add_argument("-r2", "--R2", type=str, help="Separeted paired-end reads sample, R2 (reverse) file (FASTA/FASTQ format)")
parser.add_argument("-m", "--SAMfolder", type=str, help="Folder containing the mapped reads files in SAM format (e.g: mapped_reads.sam).")
parser.add_argument("-g", "--gamma", type=str, help="Gamma threshold (per one unit) [1-0]")
parser.add_argument("-d", "--delta", type=str, help="Delta threshold (per one unit) [1-0]")
parser.add_argument("-o", "--output", type=str, help="Name of the output file (CSV format)")
parser.add_argument("-O", "--outInfo", type=str, help="Output the assignment of each read (CSV format)")
args = parser.parse_args()
## ABREVIATIONS
# A1: Highest mapping ratio
# A2: Second highest mapping ratio
## FUNCTIONS
## Obtaining the mapping ratio (A) read against a reference by using CIGAR and NM from SAM file
def mapping_ratio(read_CIGAR, read_NM):
num_cigar = 0
match_mismatch = 0
insertion = 0
deletion = 0
soft_clipped = 0
hard_clipped = 0
padding = 0
skipped = 0
mismatch = 0
match = 0
NM_mismatch = 0
total_cigar = 0
map_ratio = 0
## Trim CIGAR into parts and join same values:
for cigar in read_CIGAR: # Looping through elements of CIGAR
if cigar.isdigit(): # Save numeric values from CIGAR
num_cigar = num_cigar * 10
num_cigar = int(cigar) + num_cigar
else: # Save numeric values from CIGAR to its cathegory (i.e.: Matching/mismatchig, indels, ...)
if cigar == 'M': ## M: matches & mismatches
match_mismatch = num_cigar + match_mismatch
num_cigar = 0
elif cigar == 'I': ## I: insertion
insertion = num_cigar + insertion
num_cigar = 0
elif cigar == 'D': ## D: deletion
deletion = num_cigar + deletion
num_cigar = 0
elif cigar == 'S': ## S: soft-clipped
soft_clipped = num_cigar + soft_clipped
num_cigar = 0
elif cigar == 'H': ## H: hard-clipped
hard_clipped = num_cigar + hard_clipped
num_cigar = 0
elif cigar == 'P': ## P: Padding
padding = num_cigar + padding
num_cigar = 0
elif cigar == 'N': ## N: Skipped
skipped = num_cigar + skipped
num_cigar = 0
elif cigar == 'X': ## X: Mismatch
mismatch = num_cigar + mismatch
num_cigar = 0
elif cigar == '=': ## =: Match
match = num_cigar + match
num_cigar = 0
elif cigar == '*':
num_cigar = 0
## Length of CIGAR (also reads length)
total_cigar = match_mismatch + insertion + deletion + soft_clipped + hard_clipped + padding + skipped + mismatch + match
## Mapping ratio
map_ratio = (((total_cigar + deletion + skipped) - float(read_NM) - soft_clipped - hard_clipped - mismatch) / (total_cigar + deletion + skipped))
return map_ratio
## Looking for NM information thought columns of SAM rows
def findNM(samLine):
NM = str()
for elem in samLine:
if str(elem[:2])=='NM':
return elem[5:] ## Keeping NM tag value without "NM:i:"
return 0
## When working with paried-end reads, add "/1" and "/2" at the end of the R1 and R2 names, respectively
def rename_paired_end(readName, flag):
if ((int(flag) & 1) == 1): # # Check paired-end read; if true, this are paired-end reads
if ((int(flag) & 128) == 128): # Check second in pair; if true, return R2
return(readName+str("/2"))
else: # R1
return(readName+str("/1"))
else: # Single-end read
return(readName+str(""))
## Check that A1 is above gamma and A2 below delta
def check_distance(ReadDict):
for ReadId in ReadDict:
if (float(ReadDict[ReadId][5]) >= float(args.delta)):
ReadDict[ReadId][9] = "A2-above-delta"
elif ((float(ReadDict[ReadId][0]) < float(args.gamma))):
ReadDict[ReadId][9] = "A1-below-gamma"
else:
ReadDict[ReadId][9] = "Assigned"
return ReadDict
## Saving the mapping information for a given read and reference into the dictionary
def save_map_info(mapRow, mapRef, ReadDict):
mapRow = mapRow.strip('\n')
row = mapRow.split("\t")
ReadId = str(rename_paired_end(row[0],row[1])) # RNAME
# Create read in the dictionary if the key does not exist yet
if (ReadId not in ReadDict):
ReadDict[ReadId] = list([0, '0', 'NA', 'NA', 'NA', 0, 'NA', 'NA', '0', 'NA']) # Create and empty tuple, so A1 and A2 are zero.
# Obtain the mapping ratio of the read in the current row
newRatio = mapping_ratio(row[5], findNM(row)) # CIGAR, NM are picked
# New mapping ratio is higher or equal to A1
if (newRatio >= ReadDict[ReadId][0]):
# Changing A2
ReadDict[ReadId][5] = ReadDict[ReadId][0] # A2 (A2 to old A1)
ReadDict[ReadId][6] = ReadDict[ReadId][1] # Reference id of A2 (A2 to old A1)
ReadDict[ReadId][7] = ReadDict[ReadId][2] # RNAME A2 (A2 to old A1)
ReadDict[ReadId][8] = ReadDict[ReadId][3] # POS A2 (A2 to old A1)
# Changing A1
ReadDict[ReadId][0] = newRatio # A1 (old A1 to new A1)
ReadDict[ReadId][1] = mapRef # Reference id of A1 (old A1 to new A1)
ReadDict[ReadId][2] = row[2] # RNAME A1 (old A1 to new A1)
ReadDict[ReadId][3] = row[3] # POS A1 (old A1 to new A1)
ReadDict[ReadId][4] = row[1] # FLAG A1 (old A1 to new A1)
# New mapping ratio is higher than A2
elif (newRatio > ReadDict[ReadId][5]):
ReadDict[ReadId][5] = newRatio # A2 (old A2 to new A1)
ReadDict[ReadId][6] = mapRef # Reference id of A2 (old A2 to new A1)
ReadDict[ReadId][7] = row[2] # RNAME A2 (old A2 to new A1)
ReadDict[ReadId][8] = row[3] # POS A2 (old A2 to new A1)
return ReadDict
## Read and load SAM files. Notice that SAM header is ignored.
def load_sam_data(samDocs):
ReadDict = dict()
print "Loading mapping information..."
for samfile in samDocs: # Loop through files in folder
refName = samfile.split(".")[0] # The ref code is at the file name
sam = open(samfile, "r")
for row in sam: # Loop through lines in sam file
if row[0][0] != '@': # Avoiding headers
ReadDict = save_map_info(row, refName, ReadDict)
sam.close()
return ReadDict
## Create a dictionary for storing the number of reads assigned to each reference
def summary_dict(ReadDict):
SumDic = dict()
SumDic["A1-below-gamma"] = 0
SumDic["A2-above-delta"] = 0
for ReadId in ReadDict:
## Check if the read has been assigned
if ReadDict[ReadId][9] == "Assigned":
if ReadDict[ReadId][1] in SumDic: # Reference already exists
SumDic[ReadDict[ReadId][1]] += 1
else: # Reference doesn't exist yet
newkey = ReadDict[ReadId][1]
SumDic[newkey] = 1
## If the read has not been assigned, save the reason
elif ReadDict[ReadId][9] == "A1-below-gamma":
SumDic["A1-below-gamma"] += 1
elif ReadDict[ReadId][9] == "A2-above-delta":
SumDic["A2-above-delta"] += 1
return SumDic
# Pop value "valueKey" from dictionary
def popValue(valueKey, ReadDict):
try:
pop_elem = ReadDict.pop(valueKey)
return (pop_elem, ReadDict) # Return the value associated to the key and the Dictionary without the element
except KeyError:
return (0, ReadDict)
# Obtain header column name
def sample_header():
if (args.R is not None):
return os.path.basename(args.R)
elif ((args.R1 is not None) and (args.R2 is not None)):
return str(os.path.basename(args.R1))+" + "+str(os.path.basename(args.R2))
elif (args.SAMfolder is not None):
return args.SAMfolder
elif (args.SAMfile is not None):
return args.SAMfile
else:
return str("Sample:")+str(datetime.datetime.now().time())
# Count the number of reads in file (FASTQ/FASTA formats)
def count_reads_in_file(fileName):
if(fileName[-1:]=="a"): # FASTA file
j = 2
else: # FASTQ file
j = 4
return(len(open(fileName).readlines( ))/j)
# Return the number of NOT mapped reads
def count_notMappedReads(mappedReads):
if (args.R is not None):
totalReads = count_reads_in_file(args.R)
print ("> Initial reads: "+str(totalReads))
return(totalReads -int(mappedReads))
elif ((args.R1 is not None) and (args.R2 is not None)):
totalReads = count_reads_in_file(args.R1)*2
print ("> Initial reads: "+str(totalReads))
return(totalReads -int(mappedReads))
return(0)
## Save on a CSV file the summary information of the detected species together with the total number of reads assigned and their relative species abundance
def save_map_info_csv(OldDict):
totalMapp = sum(OldDict.values())
totalNotMapp = count_notMappedReads(totalMapp)
totalReads = totalMapp + totalNotMapp
print ("> Mapped reads: "+str(totalMapp))
# Pop top two elements from column
rm_by_A1, OldDict = popValue('A1-below-gamma', OldDict)
rm_by_A2, OldDict = popValue('A2-above-delta', OldDict)
totalAssig = sum(OldDict.values())
print ("> "+str(totalAssig)+" reads have been assigned")
## Sort Dictionary by VALUES
sorted_Tupla = sorted(OldDict.items(), key=operator.itemgetter(1), reverse=True)
print sorted_Tupla
## Checking if output file already exists, if so, adding a column to the existing file
exists = os.path.isfile(args.output)
if exists:
# Store configuration file values
csvinput = open(args.output,'r')
reader = csv.reader(csvinput)
lines = csvinput.readlines()
csvoutput = open(os.path.splitext(args.output)[0]+"AUXoutput.csv", 'w')
writer = csv.writer(csvoutput, delimiter=";")
# Write header
outRow = list(lines[0].rstrip("\r\n").split(";"))
outRow.append((str(sample_header())))
writer.writerow(outRow)
# Write Not-mapped, A1-below-gamma and A2-above-delta
outRow = list(lines[1].rstrip("\r\n").split(";"))
outRow.append(("Not-Map"+" ("+str(totalNotMapp)+")"))
writer.writerow(outRow)
outRow = list(lines[2].rstrip("\r\n").split(";"))
outRow.append(('A1-below-gamma'+" ("+str(rm_by_A1)+")"))
writer.writerow(outRow)
outRow = list(lines[3].rstrip("\r\n").split(";"))
outRow.append(('A2-above-delta'+" ("+str(rm_by_A2)+")"))
writer.writerow(outRow)
# Write assignments
for i in range(max(len(OldDict),len(lines)-4)): # Loop through lines in max(file/dictionary)
# Add data from the file
try:
outRow = list(lines[i+4].rstrip("\r\n").split(";")) # +4 for avoiding the first 4 lines
except IndexError: # If the new data has more lines than the old data
outRow = list([0]*len(lines[0].rstrip("\r\n").split(";")))
# Add new column into an existing file
try:
outRow.append(sorted_Tupla[i][0]+" ("+str(sorted_Tupla[i][1])+"|"+str(round(Decimal(float(sorted_Tupla[i][1])/float(totalAssig)),5))+")")
except IndexError: # New data has less lines than old data
outRow.append(0)
writer.writerow(outRow)
os.rename(os.path.splitext(args.output)[0]+"AUXoutput.csv", args.output)
# If output file doesn't exist yet, create it and save the mapping information
else:
csvoutput = open(args.output, 'w')
writer = csv.writer(csvoutput)
writer.writerow(list([str(sample_header())])) # Write sample name
writer.writerow([str("Not-mapping-reads"+" ("+str(totalNotMapp)+")")]) # Write Not-Map
writer.writerow([str('A1-below-gamma'+" ("+str(rm_by_A1)+")")]) # Write 'A1-below-gamma'
writer.writerow([str('A2-above-delta'+" ("+str(rm_by_A2)+")")]) # Write 'A2-above-delta'
for item in sorted_Tupla:
writer.writerow([str(item[0]+" ("+str(item[1])+"|"+str(round(Decimal(float(item[1])/float(totalAssig)),5))+")")])
csvoutput.close()
return
# Save on a CSV file the assignment information of each single read
def save_dic(ReadDict):
csv_columns = ["RNAME","A1","REF_A1","SCAF_A1","POS_A1","FLAG_A1","A2","ref_A2","SCAF_A2","POS_A2","ASSIGNMENT"]
with open(args.outInfo, 'w') as csvoutput:
writer = csv.writer(csvoutput, delimiter=";")
writer.writerow(csv_columns)
for key in ReadDict.keys():
o = ReadDict[key]
o.insert(0,key)
writer.writerow(o)
csvoutput.close()
## MAIN PROGRAMME
def main():
# Saving mapping information (SAM files)
wd = os.getcwd() # Get working directory
if (args.SAMfolder is not None): # Getting all the SAM files from folder
os.chdir(args.SAMfolder)
samfiles = [f for f in os.listdir(args.SAMfolder) if f.endswith(".sam")]
else:
samfol = raw_input("Where are the SAM files?:")
os.chdir(samfol)
samfiles = [f for f in os.listdir(samfol) if f.endswith(".sam")]
ReadDict = load_sam_data(samfiles)
## Check that the A1 and A2 fulfill the gamma and delta thresholds requirements (e.g. A1 above gamma and A2 below delta)
ReadDict = check_distance(ReadDict)
print("Summarizing data...")
os.chdir(wd) # Turn back to the original working directory
summary = summary_dict(ReadDict)
save_map_info_csv(summary)
## Save each read assingment information
if (args.outInfo is not None):
save_dic(ReadDict)
return
## MAIN
main()