
Commit

Merge branch 'develop'
Kristoffer Sahlin committed Feb 28, 2019
2 parents 7c92537 + 19e80e3 commit d55cabc
Showing 9 changed files with 773 additions and 886 deletions.
File renamed without changes.
135 changes: 135 additions & 0 deletions cemetary/old_functions.py
@@ -0,0 +1,135 @@
def homopolymer_compress(seq):
    # keep a base only where the next base differs, i.e. the last base of every homopolymer run
    corr = [ n1 for n1,n2 in zip(seq[:-1], seq[1:]) if n1 != n2 ]
    # last base corner case: the final base always closes the last run, so always keep it
    corr.append(seq[-1])
    return "".join(corr)



# def batch_wrt_total_nucl_length(lst, nr_cores=1):
# tot_length = sum([len(seq) for i, b_i, acc, seq, qual, score in lst] )
# nt_chunk_size = int(tot_length/nr_cores) + 1

# batch = []
# curr_size = 0
# for info in lst:
# curr_size += len(info[3])
# batch.append(info)
# if curr_size >= nt_chunk_size:
# yield batch
# batch = []
# curr_size = 0
# yield batch

# l = len(lst)
# for ndx in range(0, l, n):
# yield lst[ndx:min(ndx + n, l)]



#### TMP MEMORY CHECK
import pickle
import sys
# from collections import OrderedDict
# calling code:
# print("Size:{0}Mb".format( [round( get_pickled_memory(d)/float(1000000), 2) for d in data ] ))
def get_pickled_memory(data):
    # total size in bytes of the pickled representation of each element in data
    return sum(sys.getsizeof(pickle.dumps(d)) for d in data)
# print("pickled size:", sum(sys.getsizeof(pickle.dumps(d)) for d in data))
# numprocs = 10
# a = ['a' for i in range(1000000)]
# b = [a+[] for i in range(100)]

# data1 = [b+[] for i in range(numprocs)]
# data = [data1+[]] + ['1' for i in range(numprocs-1)]
# data3 = [['1'] for i in range(numprocs)]
# sizes = OrderedDict()
# for idx, data in enumerate((data1, data, data3)):
# sizes['data{}'.format(idx+1)] = sum(sys.getsizeof(pickle.dumps(d))
# for d in data)

# for k, v in sizes.items():
# print("{}: {}".format(k, v))


# code for writing statistics for the paper, called in reads_to_clusters function
# print("PASS")
# print("Total number of reads iterated through:{0}".format(i+1))

# print("Passed mapping criteria:{0}".format(mapped_passed_criteria))
# print("Passed alignment criteria in this process:{0}".format(aln_passed_criteria))
# print("Total calls to alignment mudule in this process:{0}".format(aln_called))

# print("Percent passed mapping criteria:{0}".format( round(100*mapped_passed_criteria/float(i+1), 2) ))
# print("Percent passed alignment criteria total:{0}".format( round(100*aln_passed_criteria/float(i+1), 2) ))
# if aln_called > 0:
# print("Percent passed alignment criteria out of number of calls to the alignment module:{0}".format(round(100*aln_passed_criteria/float(aln_called), 2) ))
# print("PIckled:", get_pickled_memory((Cluster, cluster_seq_origin, minimizer_database, new_batch_index)))
# print("PIckled2:", get_pickled_memory({ new_batch_index : (Cluster, cluster_seq_origin, minimizer_database, new_batch_index)}))




def next_power_of_2(x):
    # smallest power of two >= x (returns 1 for x == 0)
    return 1 if x == 0 else 2**(x - 1).bit_length()


def merge_two_dicts(x, y):
    z = x.copy()    # start with x's keys and values
    z.update(y)     # modifies z with y's keys and values & returns None
    return z



#### FROM GET_SORTED_FASTQ_FOR_CLUSTER

# from collections import OrderedDict


# def get_kmer_quals(qual, k):
# return [ qual[i : i + k] for i in range(len(qual) - k +1)]

# def expected_number_of_erroneous_kmers(kmer_quals):
# sum_of_expectations = 0
# for kmer in kmer_quals:
# p_not_error = 1.0
# for char_ in set(kmer):
# p_not_error *= (1 - 10**( - (ord(char_) - 33)/10.0 ))**kmer.count(char_)
# sum_of_expectations += p_not_error

# return len(kmer_quals) - sum_of_expectations

# def get_p_no_error_in_kmers_approximate(qual_string, k):
# poisson_mean = sum([ qual_string.count(char_) * 10**( - (ord(char_) - 33)/10.0 ) for char_ in set(qual_string)])
# error_rate = poisson_mean/float(len(qual_string))
# return (1.0 - error_rate)**k #1.0 - min(error_rate * k, 1.0)

# def get_p_error_in_kmer(qual_string, k):
# poisson_mean = sum([ qual_string.count(char_) * 10**( - (ord(char_) - 33)/10.0 ) for char_ in set(qual_string)])
# error_rate = poisson_mean/float(len(qual_string))
# p_error_in_kmer = 1.0 - (1.0 - error_rate)**k
# return p_error_in_kmer
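# Illustrative sketch (added for clarity, not part of the original isONclust code):
# the commented-out helpers above all rely on the Phred+33 convention, where a
# quality character q encodes an error probability of 10**(-(ord(q) - 33)/10).
# The two hypothetical helpers below only demonstrate that arithmetic.
def phred_char_to_error_prob(q):
    # Phred+33 encoding: '!' -> 1.0, '+' -> 0.1, '5' -> 0.01, ...
    return 10 ** (-(ord(q) - 33) / 10.0)

def expected_errors(qual_string):
    # expected number of erroneous bases in a read, given its quality string
    return sum(phred_char_to_error_prob(q) for q in qual_string)

# e.g. expected_errors("+++++") == 0.5  (five bases, each with error probability 0.1)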



# def calc_score(tup):
# l, k = tup
# read_array = []
# error_rates = []
# for i, (acc, seq, qual) in enumerate(l):
# if i % 10000 == 0:
# print(i, "reads processed.")
# poisson_mean = sum([ qual.count(char_) * D_no_min[char_] for char_ in set(qual)])
# error_rate = poisson_mean/float(len(qual))
# error_rates.append(error_rate)
# exp_errors_in_kmers = expected_number_of_erroneous_kmers_speed(qual, k)
# p_no_error_in_kmers = 1.0 - exp_errors_in_kmers/ float((len(seq) - k +1))
# score = p_no_error_in_kmers * (len(seq) - k +1)
# read_array.append((acc, seq, qual, score) )
# return read_array, error_rates





132 changes: 50 additions & 82 deletions isONclust
@@ -10,65 +10,48 @@ from time import time
from modules import get_sorted_fastq_for_cluster
from modules import cluster
from modules import p_minimizers_shared
from modules import help_functions
from modules import parallelize
from modules import cluster


def single_clustering(read_array, p_emp_probs, args):
    start_cluster = time()
    clusters = {} # initialize every read as belonging to its own cluster
    representatives = {} # initialize every read as its own representative
    for i, b_i, acc, seq, qual, score in read_array:
        clusters[i] = [acc]
        representatives[i] = (i, b_i, acc, seq, qual, score)

    minimizer_database, new_batch_index = {}, 1 # These data structures are only needed in multiprocessing mode. With one core everything is processed in a single "batch", so the minimizer database never has to be passed on to later iterations.
    result_dict = cluster.reads_to_clusters(clusters, representatives, read_array, p_emp_probs, minimizer_database, new_batch_index, args)
    # Unpack result. The result dictionary structure is convenient for the multiprocessing return value but clumsy in single-core mode.
    clusters, representatives, _, _ = list(result_dict.values())[0]
    print("Time elapsed clustering:", time() - start_cluster)
    return clusters, representatives

'''
Below code taken from https://github.com/lh3/readfq/blob/master/readfq.py
'''

def readfq(fp): # this is a generator function
    last = None # this is a buffer keeping the last unprocessed line
    while True: # mimic closure; is it a bad idea?
        if not last: # the first record or a record following a fastq
            for l in fp: # search for the start of the next record
                if l[0] in '>@': # fasta/q header line
                    last = l[:-1] # save this line
                    break
        if not last: break
        name, seqs, last = last[1:].replace(" ", "_"), [], None
        for l in fp: # read the sequence
            if l[0] in '@+>':
                last = l[:-1]
                break
            seqs.append(l[:-1])
        if not last or last[0] != '+': # this is a fasta record
            yield name, (''.join(seqs), None) # yield a fasta record
            if not last: break
        else: # this is a fastq record
            seq, leng, seqs = ''.join(seqs), 0, []
            for l in fp: # read the quality
                seqs.append(l[:-1])
                leng += len(l) - 1
                if leng >= len(seq): # have read enough quality
                    last = None
                    yield name, (seq, ''.join(seqs)); # yield a fastq record
                    break
            if last: # reach EOF before reading enough quality
                yield name, (seq, None) # yield a fasta record instead
                break
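# Illustrative usage (not in the original script): readfq yields
# (name, (sequence, qualities_or_None)) tuples, e.g.
#   for acc, (seq, qual) in readfq(open("reads.fastq")): ...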




def mkdir_p(path):
    try:
        os.makedirs(path)
        print("creating", path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def main(args):
    """
        Code in main function is structured into 4 steps
        1. Sort all reads according to expected number of error-free kmers
        2. Import precalculated probabilities of minimizer matching given the error rates of reads, kmer length, and window length.
           This is used for calculating whether a read matches a representative.
        3. Cluster the reads
        4. Write output
    """
    ##### Sort all reads according to expected number of error-free kmers #####
    args.outfile = os.path.join(args.outfolder, "sorted.fastq")

    print("started sorting seqs")
    start = time()
    sorted_reads_fastq_file = get_sorted_fastq_for_cluster.main(args)
    read_array = [ (i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(readfq(open(sorted_reads_fastq_file, 'r')))]
    read_array = [ (i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(sorted_reads_fastq_file, 'r')))]
    print("elapsed time sorting:", time() - start)
    #################################################################


print("started imported empirical error probabilities of minimizers shared:")
##### Import precalculated probabilities of minimizer matching given the error rates of reads, kmer length, and window length #####
print("Started imported empirical error probabilities of minimizers shared:")
start = time()
p_min_shared = p_minimizers_shared.read_empirical_p()
p_emp_probs = {}
@@ -80,20 +63,27 @@ def main(args):
    print(p_emp_probs)
    print(len(p_emp_probs))
    print("elapsed time imported empirical error probabilities of minimizers shared:", time() - start)
    # sys.exit()
    ##################################################################################################################################

    ##### Cluster reads, bulk of code base is here #####
    print("started clustring")
    start = time()
    clusters, cluster_seq_origin = cluster.cluster_seqs(read_array, p_emp_probs, args)
    print("elapsed time clustering:", time() - start)
    if args.nr_cores > 1:
        clusters, representatives = parallelize.parallel_clustering(read_array, p_emp_probs, args)
    else:
        print("Using 1 core.")
        clusters, representatives = single_clustering(read_array, p_emp_probs, args)
    # clusters, representatives = cluster.cluster_seqs(read_array, p_emp_probs, args)
    print("Time elapsed clustering:", time() - start)
    ####################################################

    # reads = {acc: (seq,qual) for (acc, (seq, qual)) in readfq(open(sorted_reads_fastq_file, 'r'))}
    # nonclustered_outfile = open(os.path.join(reads_outfolder, "non_clustered.fa"), "w")

    ##### Write output #####
    outfile = open(os.path.join(args.outfolder, "final_clusters.csv"), "w")
    origins_outfile = open(os.path.join(args.outfolder, "final_cluster_origins.csv"), "w")
    nontrivial_cluster_index = 0
    for c_id, all_read_acc in sorted(clusters.items(), key = lambda x: len(x[1]), reverse=True):
        read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate = cluster_seq_origin[c_id]
        read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate = representatives[c_id]
        origins_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(read_cl_id, "_".join([item for item in acc.split("_")[:-1]]), c_seq, c_qual, score, error_rate))

        for r_acc in all_read_acc:
@@ -102,11 +92,9 @@ def write_fastq(args):
        nontrivial_cluster_index += 1
    print("Nr clusters larger than 1:", nontrivial_cluster_index) #, "Non-clustered reads:", len(archived_reads))
    print("Nr clusters (all):", len(clusters)) #, "Non-clustered reads:", len(archived_reads))

    outfile.close()
    origins_outfile.close()

    # for cl_id, all_read_acc in sorted(clusters.items(), key = lambda x: len(x[1]), reverse=True):
    ############################


def write_fastq(args):
@@ -119,8 +107,8 @@ def write_fastq(args):
            cl_id, acc = items[0], items[1]
            clusters[cl_id].append(acc)

    mkdir_p(args.outfolder)
    reads = { acc : (seq, qual) for acc, (seq, qual) in readfq(open(args.fastq, 'r'))}
    help_functions.mkdir_p(args.outfolder)
    reads = { acc : (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(args.fastq, 'r'))}

    for cl_id in clusters:
        r = clusters[cl_id]
Expand All @@ -134,30 +122,10 @@ def write_fastq(args):



# curr_id = -1
# for i, cl_id, acc in enumerate(open(args.clusters, "r")):
# if i == 0:
# curr_file = open(os.path.join(args.outfolder, str(cl_id) + ".fastq" ), "w")
# seq, qual = reads[acc]
# curr_file.write("{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))

# elif curr_id != cl_id: # new cluster
# curr_file.close()
# curr_file = open(os.path.join(args.outfolder, str(cl_id) + ".fastq" ), "w")
# seq, qual = reads[acc]
# curr_file.write("{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))

# else: # same as before
# seq, qual = reads[acc]
# curr_file.write("{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))

# curr_id = cl_id



if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="De novo clustering of long-read transcriptome reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', action='version', version='%(prog)s 0.0.3')
    parser.add_argument('--version', action='version', version='%(prog)s 0.0.4')

    parser.add_argument('--fastq', type=str, default=False, help='Path to consensus fastq file(s)')
    parser.add_argument('--flnc', type=str, default=False, help='The flnc reads generated by the isoseq3 algorithm (BAM file)')
