diff --git a/.python-version b/.python-version deleted file mode 100644 index 3857eea..0000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -kdb diff --git a/TODO.org b/TODO.org index 2d7070f..036edd1 100644 --- a/TODO.org +++ b/TODO.org @@ -5,6 +5,8 @@ # .kdb files should be debrujin graph databases # The final prototype would be .bgzf format from biopython + +* 12/16/24 | Minimizers (lexicographic with window size) * 10/5/24 | released from hospital (2 days) and working on refactoring kmer module * 9/21/24 | somewhat changing from gpt4o-mini to claude sonnet ** using sonnet to create llc documents diff --git a/kmerdb/__init__.py b/kmerdb/__init__.py index 109ce02..193f7d7 100644 --- a/kmerdb/__init__.py +++ b/kmerdb/__init__.py @@ -204,9 +204,71 @@ def expanded_help(arguments): kmerdb_appmap.print_matrix_header() elif arguments.method == "distance": kmerdb_appmap.print_distance_header() + elif arguments.method == "minimizers": + kmerdb_appmap.print_minimizers_header() + sys.stderr.write("\n\nUse --help for expanded usage\n") +def get_minimizers(arguments): + + from Bio import SeqIO + import numpy as np + from kmerdb import fileutil, config, util, minimizer + import json + metadata = None + N = None + kmer_ids = None + fasta_seqs = [] + mins = None + + assert type(arguments.kdb) is str, "kdb_in must be a str" + kdb_sfx = os.path.splitext(arguments.kdb)[-1] + fa_sfx = os.path.splitext(arguments.fasta)[-1] + + if kdb_sfx != ".kdb" and kdb_sfx != ".kdbg": # A filepath with invalid suffix + raise IOError("Input .kdb filepath '{0}' does not end in '.kdb'".format(arguments.kdb)) + elif not os.path.exists(arguments.kdb): + raise IOError("Input .kdb filepath '{0}' does not exist on the filesystem".format(arguments.kdb)) + elif fa_sfx != ".fa" and fa_sfx != ".fna" and fa_sfx != ".fasta": + raise IOError("Input .fasta filepath '{0}' does not exist on the filesystem".format(arguments.fasta)) + + + if kdb_sfx == ".kdb": + + with open(arguments.fasta, mode="r") as ifile: + for record in SeqIO.parse(ifile, "fasta"): + fasta_seqs.append(str(record.seq)) + + + with fileutil.open(arguments.kdb, mode='r', slurp=True) as kdb_in: + metadata = kdb_in.metadata + + + kmer_ids_dtype = metadata["kmer_ids_dtype"] + N = 4**metadata["k"] + if metadata["version"] != config.VERSION: + logger.log_it("KDB version is out of date, may be incompatible with current KDBReader class", "WARNING") + kmer_ids = kdb_in.kmer_ids + + for seq in fasta_seqs: + + minimizers = minimizer.select_lexicographical_minimizers(seq, metadata['k'], arguments.window_size, kmer_ids) + + mins_as_list = list(map(lambda s: int(s), minimizers)) + if mins is None: + mins=mins_as_list + else: + for kmer_is_selected, i in enumerate(mins_as_list): + if mins_as_list[i] == 1: + pass + elif kmer_is_selected == 1: + mins_as_list[i] = 1 + # print minimizers in columnar format in new index file format (example.8.kdbi.1) + print(mins) + sys.exit(1) + + def distances(arguments): """ An end-user function to provide CLI access to certain distances @@ -1084,7 +1146,7 @@ def hierarchical(arguments): # from ecopy.diversity import rarefy # import ecopy as ep -# from kdb import config +# from kmerdb import config # if isinstance(arguments.input, argparse.FileType) or isinstance(arguments.input, TextIOWrapper): @@ -1952,12 +2014,13 @@ def _profile(arguments): # Complete collating of counts across files # This technically uses 1 more arrray than necessary 'final_counts' but its okay logger.log_it("Summing counts from individual fasta/fastq files into a composite profile...", "INFO") - + for d in data: counts = counts + d[0] + file_metadata.append(d[1]) - + if np.sum(counts) == 0: raise ValueError("Each element of the array of k-mer counts was 0. Sum of counts == 0. Likely an internal error. Exiting.") @@ -2061,6 +2124,8 @@ def _profile(arguments): logger.log_it("Formatting master metadata dictionary...", "INFO") + + metadata=OrderedDict({ "version": VERSION, @@ -2353,7 +2418,20 @@ def cli(): # assembly_parser.add_argument("kdbg", type=str, help=".kdbg file") # assembly_parser.set_defaults(func=assembly) + minimizer_parser = subparsers.add_parser("minimizers", help="Calculate an binary array of minimizers to associate with a .kdb count vector") + minimizer_parser.add_argument("kdb", type=str, help="A k-mer database file (.kdb)") + minimizer_parser.add_argument("fasta", type=str, help="A fasta file (.fa)") + + minimizer_parser.add_argument("-w", "--window-size", type=int, help="Choice of window size (in base-pairs) to select minimizers with", required=True) + + minimizer_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count") + minimizer_parser.add_argument("--debug", action="store_true", default=False, help="Debug mode. Do not format errors and condense log") + minimizer_parser.add_argument("-nl", "--num-log-lines", type=int, choices=config.default_logline_choices, default=50, help=argparse.SUPPRESS) + minimizer_parser.add_argument("-l", "--log-file", type=str, default="kmerdb.log", help=argparse.SUPPRESS) + minimizer_parser.set_defaults(func=get_minimizers) + + usage_parser = subparsers.add_parser("usage", help="provide expanded usage information on parameters and functions provided") usage_parser.add_argument("method", type=str, choices=("usage", "help", "profile", "graph", "index", "shuf", "matrix", "distance"), help="Print expanded usage statement") usage_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count") @@ -2527,6 +2605,8 @@ def cli(): version_parser.add_argument("-nl", "--num-log-lines", type=int, choices=config.default_logline_choices, default=50, help=argparse.SUPPRESS) version_parser.set_defaults(func=version) + + # markov_probability_parser = subparsers.add_parser("probability", help=u""" # Calculate the log-odds ratio of the Markov probability of a given sequence from the product (pi) of the transition probabilities(aij) times the frequency of the first k-mer (P(X1)), given the entire k-mer profile of a species. diff --git a/kmerdb/appmap.py b/kmerdb/appmap.py index a9b6610..9954eef 100644 --- a/kmerdb/appmap.py +++ b/kmerdb/appmap.py @@ -234,23 +234,23 @@ -command_1_name, command_2_name, command_3_name, command_4_name, command_5_name, command_6_name, command_7_name, command_8_name, command_9_name, command_10_name, command_11_name, command_12_name, command_13_name, command_14_name = config.subcommands +command_1_name, command_2_name, command_11_name, command_3_name, command_4_name, command_5_name, command_6_name, command_7_name, command_8_name, command_9_name, command_10_name, command_12_name, command_13_name, command_14_name = config.subcommands COMMANDS = [ command_1_name, #"profile", - command_2_name, #"make_graph", - command_3_name, #"get_matrix", + command_2_name, #"graph", + command_11_name, #"get_minimizers", + command_3_name, #"matrix", command_4_name, #"distance", - command_5_name, #"view", - command_6_name, #"header", - command_7_name, #"kmeans", - command_8_name, #"hierarchical", - command_9_name, #"index", + command_5_name, #"kmeans", + command_6_name, #"hierarchical", + command_7_name, #"view", + command_8_name, #"header", + command_9_name, #"index", # command_10_name, #"shuf", - command_11_name, #"usage", - command_12_name, #"help" - command_13_name, #"version" - command_14_name #"composition" + command_12_name, #"usage", + command_13_name, #"help", + command_14_name, #"version"] ] @@ -1942,11 +1942,154 @@ +command_11_description = "Calculate minimizers" +command_11_description_long = """ + Calculate minimizers, outputs a binary array to associate with a index array. + +""" +command_11_parameters = "Parameter of interest is window size" +command_11_inputs = "Input is a v{0} .kdb count vector file".format(config.VERSION) +command_11_usage = "kmerdb minimizers -vv --debug input1.12.kdb > input1.12.minimizers.kdbi" -command_11_description = "K-mer Markov sequence probability feature (Deprecated)" -command_11_description_long = """ + + +COMMAND_11_BANNER = """ + + + + + + + + + + + + +[--------------------------------------------------------------------------------------] + + + + + + + + + [ n a m e ] : - {0} + + description : {1} + +{2} + + + + +-------------------------- + + kmerdb minimizers input1.12.kdb input1.12.minimizers.kdbi + + [-] inputs : + + {3} + + [-] parameters : + + {4} + + + + [-] [ usage ] : {5} + + + + + + + + + + + + + + + + + + +[--------------------------------------------------------------------------------------] +""".format(command_11_name, command_11_description, command_11_description_long, command_11_inputs, command_11_parameters, command_11_usage) + + + +COMMAND_11_PARAMS = OrderedDict({ + "name": "arguments", + "type": "array", + "items": [ + { + "name": "K-mer database file.", + "type": "file", + "value": "The k-mer count proifle to calculate minimizers." + } + ] +}) + + +COMMAND_11_INPUTS = OrderedDict({ + "name": "inputs", + "type": "array", + "items": [ + { + "name": "Input count matrix (.tsv)", + "type": "file", + "value": "File to decompose the k-mer profile into its parts." + } + + ] +}) + +COMMAND_11_FEATURES = OrderedDict({ + "name": "features", + "type": "array", + "items": [ + OrderedDict({ + "name": "linear regression for ", + "shortname": "", + "description" : "" + }), + OrderedDict({ + "name": "", + "shortname": "", + "description": "(Deprecated)" + }) + ] +}) + +COMMAND_11_STEPS = OrderedDict({ + "name": "steps", + "type": "array", + "items": [ + OrderedDict({ + "name": "", + "shortname": "", + "description": "(uhhhh...)", + }), + OrderedDict({ + "name": "", + "shortname": "Shuffle k-mer counts", + "description": "(Deprecated)" + }) + + ] + +}) + + + +command_12_description = "K-mer Markov sequence probability feature (Deprecated)" +command_12_description_long = """ Uses conditional probabilities and multiplication rule along with Markov model of sequence to use likelihood/odds ratios to test likelihood of the sequence(s) given the inputs. Unadjusted for multiple-hypothesis testing. Conditional probability : @@ -1962,13 +2105,13 @@ """ -command_11_parameters = "inputs may be one or more fasta files, and the .kdb files needed for the model's output probability." -command_11_inputs = "Input is a v{0} .kdb count vector file".format(config.VERSION) -command_11_usage = "kmerdb prob --db [--db kmer_count_vector_2.kdb] [query_sequences_2.fasta.gz]" +command_12_parameters = "inputs may be one or more fasta files, and the .kdb files needed for the model's output probability." +command_12_inputs = "Input is a v{0} .kdb count vector file".format(config.VERSION) +command_12_usage = "kmerdb prob --db [--db kmer_count_vector_2.kdb] [query_sequences_2.fasta.gz]" -COMMAND_11_BANNER = """ +COMMAND_12_BANNER = """ @@ -2033,11 +2176,11 @@ [--------------------------------------------------------------------------------------] -""".format(command_11_name, command_11_description, command_11_description_long, command_11_inputs, command_11_parameters, command_11_usage) +""".format(command_12_name, command_12_description, command_12_description_long, command_12_inputs, command_12_parameters, command_12_usage) -COMMAND_11_PARAMS = OrderedDict({ +COMMAND_12_PARAMS = OrderedDict({ "name": "arguments", "type": "array", "items": [ @@ -2050,7 +2193,7 @@ }) -COMMAND_11_INPUTS = OrderedDict({ +COMMAND_12_INPUTS = OrderedDict({ "name": "inputs", "type": "array", "items": [ @@ -2063,7 +2206,7 @@ ] }) -COMMAND_11_FEATURES = OrderedDict({ +COMMAND_12_FEATURES = OrderedDict({ "name": "features", "type": "array", "items": [ @@ -2080,7 +2223,7 @@ ] }) -COMMAND_11_STEPS = OrderedDict({ +COMMAND_12_STEPS = OrderedDict({ "name": "steps", "type": "array", "items": [ @@ -2290,6 +2433,7 @@ "header": COMMAND_6_PARAMS["items"], "index": COMMAND_9_PARAMS["items"], "shuf": COMMAND_10_PARAMS["items"], + "minimizers": COMMAND_11_PARAMS["items"], "composition": COMMAND_14_PARAMS["items"], } @@ -2304,6 +2448,8 @@ "header": COMMAND_6_INPUTS["items"], "index": COMMAND_9_INPUTS["items"], "shuf": COMMAND_10_INPUTS["items"], + + "minimizers": COMMAND_11_INPUTS["items"], "composition": COMMAND_14_INPUTS["items"], } @@ -2318,6 +2464,7 @@ "header": COMMAND_6_FEATURES["items"], "index": COMMAND_9_FEATURES["items"], "shuf": COMMAND_10_FEATURES["items"], + "minimizers": COMMAND_11_FEATURES["items"], "composition": COMMAND_14_FEATURES["items"], } @@ -2335,6 +2482,7 @@ "header": COMMAND_6_STEPS["items"], "index": COMMAND_9_STEPS["items"], "shuf": COMMAND_10_STEPS["items"], + "minimizers": COMMAND_11_FEATURES["items"], "composition": COMMAND_14_STEPS["items"], } diff --git a/kmerdb/config.py b/kmerdb/config.py index c586ada..0987c4f 100644 --- a/kmerdb/config.py +++ b/kmerdb/config.py @@ -17,7 +17,7 @@ -VERSION="0.8.10" +VERSION="0.8.11" REQUIRES_PYTHON="3.7.4" header_delimiter = "\n" + ("="*24) + "\n" @@ -29,8 +29,8 @@ # 4/9/24 there are some duplicates sure, but the requirements evolve faster than the dev do, are more essential for function, and I dont change the -dev file much. # 12 subcommands, and associated function names in __init__.py -subcommands = ["profile", "graph", "matrix", "distance", "kmeans", "hierarchical", "view", "header", "index", "shuf", "usage", "help", "version", "composition"] -subcommand_functions = ["profile", "make_graph", "get_matrix", "distances", "kmeans", "hierarchical", "view", "header", "index_file", "shuf", "expanded_help", "expanded_help", "version", "composition"] +subcommands = ["profile", "graph", "get_minimizers", "matrix", "distance", "kmeans", "hierarchical", "view", "header", "index", "shuf", "usage", "help", "version"] +subcommand_functions = ["profile", "make_graph", "get_minimizers", "get_matrix", "distances", "kmeans", "hierarchical", "view", "header", "index_file", "shuf", "expanded_help", "expanded_help", "version"] default_logline_choices = (20, 50, 100, 200) KDB_COLUMN_NUMBER = 4 diff --git a/kmerdb/minimizer.py b/kmerdb/minimizer.py new file mode 100644 index 0000000..b6ce9a3 --- /dev/null +++ b/kmerdb/minimizer.py @@ -0,0 +1,59 @@ +''' + Copyright 2020 Matthew Ralston + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +''' +import sys +import os + +#import yaml +#from collections import OrderedDict + +import numpy as np + +from kmerdb import kmer, config, util + +def select_lexicographical_minimizers(seq, k, window_size, kmer_ids): + """ + Select k-mers based on lexicographical minimizers and return a binary array indicating selected k-mers. + + :param seq: Input DNA sequence + :type seq: str + :param k: Choice of k + :type k: int + :param window_size: Size of the sliding window + :type window_size: int + :param kmer_ids: Array of kmer IDs (1 to 4^k) + :type kmer_ids: list + + :returns: Binary array where 1 indicates selected k-mer, 0 otherwise + :rtype: np.array + """ + + N = len(kmer_ids) + + if len(seq) < k: + raise ValueError("Sequence length was less than k") + + minimizers = np.zeros(N, dtype="int16") + + for i in range(N - k + 1): + if i % window_size == 0: + subseq = seq[i:i+k] + kmer_id = kmer.kmer_to_id(subseq) + minimizers[kmer_id] = 1 + + + return minimizers + diff --git a/kmerdb/parse.py b/kmerdb/parse.py index 7fe642b..e5f4d7b 100644 --- a/kmerdb/parse.py +++ b/kmerdb/parse.py @@ -168,8 +168,15 @@ def parsefile(filepath:str, k:int, logger=None, quiet=True): #rows_per_batch:int seqprsr.max_read_length = max(read_lengths) seqprsr.min_read_length = min(read_lengths) - seqprsr.avg_read_length = np.mean(np.array(read_lengths, dtype="uint32")) - + + + seqprsr.avg_read_length = int(np.mean(np.array(read_lengths, dtype="uint32"))) + + + + + + sus = set() @@ -252,7 +259,7 @@ def parsefile(filepath:str, k:int, logger=None, quiet=True): #rows_per_batch:int header = seqprsr.header_dict() header["min_read_length"] = min_read_length header["max_read_length"] = max_read_length - header["avg_read_length"] = np.mean(read_lengths) + header["avg_read_length"] = int(np.mean(read_lengths)) seqprsr.nullomer_array = np.array(all_theoretical_kmer_ids, dtype="uint64")[is_nullomer] diff --git a/pyproject.toml b/pyproject.toml index e0202d0..b63e482 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ build-backend = "setuptools.build_meta" [project] name = "kmerdb" -version = "0.8.10" +version = "0.8.11" description = "Yet another correction to the 'yet another correction to just a k-mer counter...'" readme = "README.md" authors = [{name="Matt Ralston ", email="mralston.development@gmail.com"}] diff --git a/setup.py b/setup.py index dc1fdc0..c5bcb43 100644 --- a/setup.py +++ b/setup.py @@ -110,7 +110,7 @@ def can_import(module_name): long_description = 'See README.md for details' #REQUIRES_PYTHON = ">=3.7.4" REQUIRES_PYTHON = '>=3.12.2' -VERSION = "0.8.10" +VERSION = "0.8.11" URL = 'https://github.com/MatthewRalston/kmerdb' CURRENT_RELEASE = "https://github.com/MatthewRalston/kmerdb/archive/v{0}.tar.gz".format(VERSION) EMAIL = 'mralston.development@gmail.com'