# ConsensusClusters.py
# Authors: L. Penter, L. Hansmann
# Hematology, Oncology, and Tumor Immunology, Charité - Universitätsmedizin Berlin
import os
import random
import subprocess
import tempfile
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Align.Applications import MuscleCommandline
import CdrExtractionOptions
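# CdrExtractionOptions is a project-local configuration module; this script
# expects it to provide CD_HIT_EST (path to the cd-hit-est binary),
# CD_HIT_EST_OPTIONS (additional command-line arguments), and
# MAX_READS_FOR_CONSENSUS (subsampling limit), as used below.
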
# return consensus sequences for each cluster in file
def ConsensusClusters(filename):
    clusters = []
    cluster_counter = -1

    # read in sequencing reads from file
    sequences = ReadSequences(filename)

    # generate temporary file
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.close()

    # calculate clusters of sequencing reads using cd-hit-est
    subprocess.check_output(
        [CdrExtractionOptions.CD_HIT_EST, '-i', str(filename), '-o', str(tmp_file.name)]
        + CdrExtractionOptions.CD_HIT_EST_OPTIONS
    )
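
    # cd-hit-est writes its cluster report to '<output>.clstr': each cluster
    # starts with a '>Cluster N' header line, and each member line contains the
    # read's FASTA identifier between '>' and '...', which the loop below extracts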
    # process cluster information
    f = open(tmp_file.name + '.clstr', 'rU')
    for line in f:
        # start of new cluster
        if line[0] == '>':
            clusters.append([])
            cluster_counter += 1
        else:
            sequence_number = line[line.find('>')+1:line.find('...')]
            clusters[cluster_counter].append(sequences[sequence_number])
    f.close()

    # delete temporary files
    os.unlink(tmp_file.name)
    os.unlink(tmp_file.name + '.clstr')

    return ConsensusSequences(clusters)

# generate consensus sequences for groups of reads
def ConsensusSequences(clusters):
    reads = []

    # go through each cluster of sequencing reads provided
    for g in clusters:
        # temporary files
        tmp_file = tempfile.NamedTemporaryFile(delete=False)
        tmp_file2 = tempfile.NamedTemporaryFile(delete=False)
        tmp_file2.close()
        # if group is larger than MAX_READS_FOR_CONSENSUS reads, select random sample of MAX_READS_FOR_CONSENSUS
        if len(g) > CdrExtractionOptions.MAX_READS_FOR_CONSENSUS:
            subsample = random.sample(g, CdrExtractionOptions.MAX_READS_FOR_CONSENSUS)
        else:
            subsample = g  # use all available reads

        # store cluster in temporary file
        for i in range(0, len(subsample)):
            tmp_file.write('>' + str(i) + '\n')
            tmp_file.write(subsample[i] + '\n')
        tmp_file.close()
        # generate consensus read using muscle
        muscle_cline = MuscleCommandline(input=tmp_file.name, out=tmp_file2.name, maxiters=2)
        muscle_cline()
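        # calling the command-line wrapper runs the external MUSCLE binary,
        # which therefore has to be installed and discoverable on PATH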
        align = AlignIO.read(tmp_file2.name, 'fasta')
        summary_align = AlignInfo.SummaryInfo(align)
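        # gap_consensus() takes a column-wise majority over the alignment
        # (default threshold 0.7); with ambiguous='' columns without a clear
        # majority are simply dropped from the consensus string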
        # store consensus read in list
        reads.append([summary_align.gap_consensus(ambiguous=''), len(g)])

        # remove tmp files
        os.unlink(tmp_file.name)
        os.unlink(tmp_file2.name)

    return reads

# read all sequences from fasta file
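# note: assumes one sequence line per FASTA record; for wrapped (multi-line)
# records only the last line would be kept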
def ReadSequences(filename):
    sequences = {}
    identifier = ''
    f = open(filename, 'rU')
    for line in f:
        if line[0] != '>':
            sequences[identifier] = line.strip()
        else:
            identifier = line.strip()[1:]
    f.close()
    return sequences
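
# Minimal usage sketch (an assumption, not part of the original module):
# 'reads.fasta' stands in for a FASTA file of sequencing reads, and
# CdrExtractionOptions must provide the settings noted at the top of this file.
if __name__ == '__main__':
    for consensus, cluster_size in ConsensusClusters('reads.fasta'):
        print('cluster of %d reads -> %s' % (cluster_size, consensus))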