
Commit

Merge branch 'develop'
Kristoffer Sahlin committed Feb 28, 2019
2 parents 7c92537 + 19e80e3 commit d55cabc
Showing 9 changed files with 773 additions and 886 deletions.
File renamed without changes.
135 changes: 135 additions & 0 deletions cemetary/old_functions.py
@@ -0,0 +1,135 @@
def homopolymer_compress(seq):
    # keep a base only where the next base differs, i.e. the last base of every homopolymer run
    corr = [ n1 for n1,n2 in zip(seq[:-1], seq[1:]) if n1 != n2 ]
    # last base corner case: the final base always closes the last run, so always keep it
    corr.append(seq[-1])
    return "".join(corr)



# def batch_wrt_total_nucl_length(lst, nr_cores=1):
# tot_length = sum([len(seq) for i, b_i, acc, seq, qual, score in lst] )
# nt_chunk_size = int(tot_length/nr_cores) + 1

# batch = []
# curr_size = 0
# for info in lst:
# curr_size += len(info[3])
# batch.append(info)
# if curr_size >= nt_chunk_size:
# yield batch
# batch = []
# curr_size = 0
# yield batch

# l = len(lst)
# for ndx in range(0, l, n):
# yield lst[ndx:min(ndx + n, l)]



#### TMP MEMORY CHECK
import pickle
import sys
# from collections import OrderedDict
# calling code:
# print("Size:{0}Mb".format( [round( get_pickled_memory(d)/float(1000000), 2) for d in data ] ))
def get_pickled_memory(data):
    # total size in bytes of the pickled representation of each element in data
    return sum(sys.getsizeof(pickle.dumps(d)) for d in data)
# print("pickled size:", sum(sys.getsizeof(pickle.dumps(d)) for d in data))
# numprocs = 10
# a = ['a' for i in range(1000000)]
# b = [a+[] for i in range(100)]

# data1 = [b+[] for i in range(numprocs)]
# data = [data1+[]] + ['1' for i in range(numprocs-1)]
# data3 = [['1'] for i in range(numprocs)]
# sizes = OrderedDict()
# for idx, data in enumerate((data1, data, data3)):
# sizes['data{}'.format(idx+1)] = sum(sys.getsizeof(pickle.dumps(d))
# for d in data)

# for k, v in sizes.items():
# print("{}: {}".format(k, v))


# code for writing statistics for the paper, called in reads_to_clusters function
# print("PASS")
# print("Total number of reads iterated through:{0}".format(i+1))

# print("Passed mapping criteria:{0}".format(mapped_passed_criteria))
# print("Passed alignment criteria in this process:{0}".format(aln_passed_criteria))
# print("Total calls to alignment mudule in this process:{0}".format(aln_called))

# print("Percent passed mapping criteria:{0}".format( round(100*mapped_passed_criteria/float(i+1), 2) ))
# print("Percent passed alignment criteria total:{0}".format( round(100*aln_passed_criteria/float(i+1), 2) ))
# if aln_called > 0:
# print("Percent passed alignment criteria out of number of calls to the alignment module:{0}".format(round(100*aln_passed_criteria/float(aln_called), 2) ))
# print("PIckled:", get_pickled_memory((Cluster, cluster_seq_origin, minimizer_database, new_batch_index)))
# print("PIckled2:", get_pickled_memory({ new_batch_index : (Cluster, cluster_seq_origin, minimizer_database, new_batch_index)}))




def next_power_of_2(x):
    # smallest power of two >= x (returns 1 for x == 0)
    return 1 if x == 0 else 2**(x - 1).bit_length()


def merge_two_dicts(x, y):
    z = x.copy()    # start with x's keys and values
    z.update(y)     # modifies z with y's keys and values & returns None
    return z



#### FROM GET_SORTED_FASTQ_FOR_CLUSTER

# from collections import OrderedDict


# def get_kmer_quals(qual, k):
# return [ qual[i : i + k] for i in range(len(qual) - k +1)]

# def expected_number_of_erroneous_kmers(kmer_quals):
# sum_of_expectations = 0
# for kmer in kmer_quals:
# p_not_error = 1.0
# for char_ in set(kmer):
# p_not_error *= (1 - 10**( - (ord(char_) - 33)/10.0 ))**kmer.count(char_)
# sum_of_expectations += p_not_error

# return len(kmer_quals) - sum_of_expectations

# def get_p_no_error_in_kmers_approximate(qual_string, k):
# poisson_mean = sum([ qual_string.count(char_) * 10**( - (ord(char_) - 33)/10.0 ) for char_ in set(qual_string)])
# error_rate = poisson_mean/float(len(qual_string))
# return (1.0 - error_rate)**k #1.0 - min(error_rate * k, 1.0)

# def get_p_error_in_kmer(qual_string, k):
# poisson_mean = sum([ qual_string.count(char_) * 10**( - (ord(char_) - 33)/10.0 ) for char_ in set(qual_string)])
# error_rate = poisson_mean/float(len(qual_string))
# p_error_in_kmer = 1.0 - (1.0 - error_rate)**k
# return p_error_in_kmer
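# Illustrative sketch (added for clarity, not part of the original isONclust code):
# the commented-out helpers above all rely on the Phred+33 convention, where a
# quality character q encodes an error probability of 10**(-(ord(q) - 33)/10).
# The two hypothetical helpers below only demonstrate that arithmetic.
def phred_char_to_error_prob(q):
    # Phred+33 encoding: '!' -> 1.0, '+' -> 0.1, '5' -> 0.01, ...
    return 10 ** (-(ord(q) - 33) / 10.0)

def expected_errors(qual_string):
    # expected number of erroneous bases in a read, given its quality string
    return sum(phred_char_to_error_prob(q) for q in qual_string)

# e.g. expected_errors("+++++") == 0.5  (five bases, each with error probability 0.1)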



# def calc_score(tup):
# l, k = tup
# read_array = []
# error_rates = []
# for i, (acc, seq, qual) in enumerate(l):
# if i % 10000 == 0:
# print(i, "reads processed.")
# poisson_mean = sum([ qual.count(char_) * D_no_min[char_] for char_ in set(qual)])
# error_rate = poisson_mean/float(len(qual))
# error_rates.append(error_rate)
# exp_errors_in_kmers = expected_number_of_erroneous_kmers_speed(qual, k)
# p_no_error_in_kmers = 1.0 - exp_errors_in_kmers/ float((len(seq) - k +1))
# score = p_no_error_in_kmers * (len(seq) - k +1)
# read_array.append((acc, seq, qual, score) )
# return read_array, error_rates





132 changes: 50 additions & 82 deletions isONclust
@@ -10,65 +10,48 @@ from time import time
from modules import get_sorted_fastq_for_cluster
from modules import cluster
from modules import p_minimizers_shared
from modules import help_functions
from modules import parallelize
from modules import cluster


def single_clustering(read_array, p_emp_probs, args):
    start_cluster = time()
    clusters = {} # initialize every read as belonging to its own cluster
    representatives = {} # initialize every read as its own representative
    for i, b_i, acc, seq, qual, score in read_array:
        clusters[i] = [acc]
        representatives[i] = (i, b_i, acc, seq, qual, score)

    minimizer_database, new_batch_index = {}, 1 # These data structures are only needed in multiprocessing mode. With one core everything is processed in a single "batch", so the minimizer database never has to be passed on to later iterations.
    result_dict = cluster.reads_to_clusters(clusters, representatives, read_array, p_emp_probs, minimizer_database, new_batch_index, args)
    # Unpack result. The result dictionary structure is convenient for the multiprocessing return value but clumsy in single-core mode.
    clusters, representatives, _, _ = list(result_dict.values())[0]
    print("Time elapsed clustering:", time() - start_cluster)
    return clusters, representatives

'''
Below code taken from https://github.com/lh3/readfq/blob/master/readfq.py
'''

def readfq(fp): # this is a generator function
    last = None # this is a buffer keeping the last unprocessed line
    while True: # mimic closure; is it a bad idea?
        if not last: # the first record or a record following a fastq
            for l in fp: # search for the start of the next record
                if l[0] in '>@': # fasta/q header line
                    last = l[:-1] # save this line
                    break
        if not last: break
        name, seqs, last = last[1:].replace(" ", "_"), [], None
        for l in fp: # read the sequence
            if l[0] in '@+>':
                last = l[:-1]
                break
            seqs.append(l[:-1])
        if not last or last[0] != '+': # this is a fasta record
            yield name, (''.join(seqs), None) # yield a fasta record
            if not last: break
        else: # this is a fastq record
            seq, leng, seqs = ''.join(seqs), 0, []
            for l in fp: # read the quality
                seqs.append(l[:-1])
                leng += len(l) - 1
                if leng >= len(seq): # have read enough quality
                    last = None
                    yield name, (seq, ''.join(seqs)); # yield a fastq record
                    break
            if last: # reach EOF before reading enough quality
                yield name, (seq, None) # yield a fasta record instead
                break
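# Illustrative usage (not in the original script): readfq yields
# (name, (sequence, qualities_or_None)) tuples, e.g.
#   for acc, (seq, qual) in readfq(open("reads.fastq")): ...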




def mkdir_p(path):
    try:
        os.makedirs(path)
        print("creating", path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def main(args):
    """
        Code in main function is structured into 4 steps
        1. Sort all reads according to expected number of error-free kmers
        2. Import precalculated probabilities of minimizer matching given the error rates of reads, kmer length, and window length.
           This is used for calculating whether a read matches a representative.
        3. Cluster the reads
        4. Write output
    """
    ##### Sort all reads according to expected number of error-free kmers #####
    args.outfile = os.path.join(args.outfolder, "sorted.fastq")

    print("started sorting seqs")
    start = time()
    sorted_reads_fastq_file = get_sorted_fastq_for_cluster.main(args)
    read_array = [ (i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(readfq(open(sorted_reads_fastq_file, 'r')))]
    read_array = [ (i, 0, acc, seq, qual, float(acc.split("_")[-1])) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(sorted_reads_fastq_file, 'r')))]
    print("elapsed time sorting:", time() - start)
    #################################################################


print("started imported empirical error probabilities of minimizers shared:")
##### Import precalculated probabilities of minimizer matching given the error rates of reads, kmer length, and window length #####
print("Started imported empirical error probabilities of minimizers shared:")
start = time()
p_min_shared = p_minimizers_shared.read_empirical_p()
p_emp_probs = {}
@@ -80,20 +63,27 @@ def main(args):
    print(p_emp_probs)
    print(len(p_emp_probs))
    print("elapsed time imported empirical error probabilities of minimizers shared:", time() - start)
    # sys.exit()
    ##################################################################################################################################

    ##### Cluster reads, bulk of code base is here #####
    print("started clustring")
    start = time()
    clusters, cluster_seq_origin = cluster.cluster_seqs(read_array, p_emp_probs, args)
    print("elapsed time clustering:", time() - start)
    if args.nr_cores > 1:
        clusters, representatives = parallelize.parallel_clustering(read_array, p_emp_probs, args)
    else:
        print("Using 1 core.")
        clusters, representatives = single_clustering(read_array, p_emp_probs, args)
    # clusters, representatives = cluster.cluster_seqs(read_array, p_emp_probs, args)
    print("Time elapsed clustering:", time() - start)
    ####################################################

    # reads = {acc: (seq,qual) for (acc, (seq, qual)) in readfq(open(sorted_reads_fastq_file, 'r'))}
    # nonclustered_outfile = open(os.path.join(reads_outfolder, "non_clustered.fa"), "w")

    ##### Write output #####
    outfile = open(os.path.join(args.outfolder, "final_clusters.csv"), "w")
    origins_outfile = open(os.path.join(args.outfolder, "final_cluster_origins.csv"), "w")
    nontrivial_cluster_index = 0
    for c_id, all_read_acc in sorted(clusters.items(), key = lambda x: len(x[1]), reverse=True):
        read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate = cluster_seq_origin[c_id]
        read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate = representatives[c_id]
        origins_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(read_cl_id, "_".join([item for item in acc.split("_")[:-1]]), c_seq, c_qual, score, error_rate))

        for r_acc in all_read_acc:
@@ -102,11 +92,9 @@ def write_fastq(args):
        nontrivial_cluster_index += 1
    print("Nr clusters larger than 1:", nontrivial_cluster_index) #, "Non-clustered reads:", len(archived_reads))
    print("Nr clusters (all):", len(clusters)) #, "Non-clustered reads:", len(archived_reads))

    outfile.close()
    origins_outfile.close()

    # for cl_id, all_read_acc in sorted(clusters.items(), key = lambda x: len(x[1]), reverse=True):
    ############################


def write_fastq(args):
@@ -119,8 +107,8 @@ def write_fastq(args):
            cl_id, acc = items[0], items[1]
            clusters[cl_id].append(acc)

    mkdir_p(args.outfolder)
    reads = { acc : (seq, qual) for acc, (seq, qual) in readfq(open(args.fastq, 'r'))}
    help_functions.mkdir_p(args.outfolder)
    reads = { acc : (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(args.fastq, 'r'))}

    for cl_id in clusters:
        r = clusters[cl_id]
Expand All @@ -134,30 +122,10 @@ def write_fastq(args):



# curr_id = -1
# for i, cl_id, acc in enumerate(open(args.clusters, "r")):
# if i == 0:
# curr_file = open(os.path.join(args.outfolder, str(cl_id) + ".fastq" ), "w")
# seq, qual = reads[acc]
# curr_file.write("{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))

# elif curr_id != cl_id: # new cluster
# curr_file.close()
# curr_file = open(os.path.join(args.outfolder, str(cl_id) + ".fastq" ), "w")
# seq, qual = reads[acc]
# curr_file.write("{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))

# else: # same as before
# seq, qual = reads[acc]
# curr_file.write("{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))

# curr_id = cl_id



if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="De novo clustering of long-read transcriptome reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', action='version', version='%(prog)s 0.0.3')
    parser.add_argument('--version', action='version', version='%(prog)s 0.0.4')

    parser.add_argument('--fastq', type=str, default=False, help='Path to consensus fastq file(s)')
    parser.add_argument('--flnc', type=str, default=False, help='The flnc reads generated by the isoseq3 algorithm (BAM file)')
