diff --git a/TODO.org b/TODO.org index 036edd1..4b91ee0 100644 --- a/TODO.org +++ b/TODO.org @@ -6,7 +6,21 @@ # .kdb files should be debrujin graph databases # The final prototype would be .bgzf format from biopython -* 12/16/24 | Minimizers (lexicographic with window size) +* 1/13/25 | Minimizers (lexicographic with window size) + +** Use the most 5? abundant (wrt original genome/kdb) minimizers + +*** Filtering the alignment seeds/kmers by abundance should be a good heuristic for creating biologically meaningful seed regions for alignment. + +** The --window-size parameter is important for kmer/minimizer filtering and restricting the number of Smith Waterman alignments that must be done. + +** Refactor minimizers.py to create a function that takes a fasta sequence as input, and returns a minimizer array. (lift and shift from get_minimizers in 0.8.11) + +** [0/3] Final alignment command should manually produce temporary files: + +- [ ] .kdb from reference (abundances) +- [ ] .kdbi.1 from reference (minimizers) +- [ ] .kdbi.1 from query (seeds/minimizers) :TODO: * 10/5/24 | released from hospital (2 days) and working on refactoring kmer module * 9/21/24 | somewhat changing from gpt4o-mini to claude sonnet ** using sonnet to create llc documents diff --git a/kmerdb/__init__.py b/kmerdb/__init__.py index 193f7d7..865b625 100644 --- a/kmerdb/__init__.py +++ b/kmerdb/__init__.py @@ -226,21 +226,61 @@ def get_minimizers(arguments): kdb_sfx = os.path.splitext(arguments.kdb)[-1] fa_sfx = os.path.splitext(arguments.fasta)[-1] - if kdb_sfx != ".kdb" and kdb_sfx != ".kdbg": # A filepath with invalid suffix + if kdb_sfx != ".kdb": # A filepath with invalid suffix raise IOError("Input .kdb filepath '{0}' does not end in '.kdb'".format(arguments.kdb)) elif not os.path.exists(arguments.kdb): raise IOError("Input .kdb filepath '{0}' does not exist on the filesystem".format(arguments.kdb)) elif fa_sfx != ".fa" and fa_sfx != ".fna" and fa_sfx != ".fasta": raise IOError("Input .fasta filepath '{0}' does not exist on the filesystem".format(arguments.fasta)) + + mins = minimizer.minimizers_from_fasta_and_kdb(arguments.fasta, arguments.kdb, arguments.window_size) - if kdb_sfx == ".kdb": + output_index_file = "{0}.kdbi.1".format(str(arguments.kdb).strip(".kdb")) + logger.log_it("\n\nPrinting results of minimizers to '{0}'\n\n".format(output_index_file), "INFO") + with open(output_index_file, "w") as ofile: + for kmer, i in enumerate(kmer_ids): + ofile.write("{0}\t{1}\n".format(kmer, mins[i])) + + +def get_alignments(arguments): + """ + Produces Smith-Waterman alignment by loading minimizers + + """ + + from Bio import SeqIO + + minimizers = [] + reference_fasta_seqs = [] + query_fasta_seqs = [] + + kdbi_sfx = ".".join(os.path.splitext(arguments.kdbi)[-2:]) + + print(kdbi_sfx) + sys.exit(1) + + if kdbi_sfx != ".kdbi.1": # A filepath with invalid suffix + raise IOError("Input .kdb index filepath '{0}' does not end in '.kdb'".format(arguments.kdbi1)) + + + + # Create as temporary file, hold minimizer array in memory + + with open(arguments.reference, mode="r") as ifile: + for record in SeqIO.parse(ifile, "fasta"): + reference_fasta_seqs.append(str(record.seq)) + + kdb_sfx = os.path.splitext(arguments.kdb)[-1] - with open(arguments.fasta, mode="r") as ifile: - for record in SeqIO.parse(ifile, "fasta"): - fasta_seqs.append(str(record.seq)) + + if kdb_sfx != ".kdb": # A filepath with invalid suffix + raise IOError("Viewable .kdb filepath does not end in '.kdb'") + elif not os.path.exists(arguments.kdb): + raise IOError("Viewable .kdb filepath '{0}' does not exist on the filesystem".format(arguments.kdb) - + ) + if kdb_sfx == ".kdb": with fileutil.open(arguments.kdb, mode='r', slurp=True) as kdb_in: metadata = kdb_in.metadata @@ -249,25 +289,20 @@ def get_minimizers(arguments): N = 4**metadata["k"] if metadata["version"] != config.VERSION: logger.log_it("KDB version is out of date, may be incompatible with current KDBReader class", "WARNING") - kmer_ids = kdb_in.kmer_ids + if arguments.header: + yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict) + print(yaml.dump(metadata, sort_keys=False)) + print(config.header_delimiter) + logger.log_it("Reading from file...", "INFO") + + for i, idx in enumerate(kmer_ids_sorted_by_count): + kmer_id = kdb_in.kmer_ids[i] - for seq in fasta_seqs: - minimizers = minimizer.select_lexicographical_minimizers(seq, metadata['k'], arguments.window_size, kmer_ids) + print("{0}\t{1}\t{2}\t{3}".format(i, kmer_id, kdb_in.counts[kmer_id], kdb_in.frequencies[kmer_id])) - mins_as_list = list(map(lambda s: int(s), minimizers)) - if mins is None: - mins=mins_as_list - else: - for kmer_is_selected, i in enumerate(mins_as_list): - if mins_as_list[i] == 1: - pass - elif kmer_is_selected == 1: - mins_as_list[i] = 1 - # print minimizers in columnar format in new index file format (example.8.kdbi.1) - print(mins) - sys.exit(1) + def distances(arguments): """ @@ -2430,7 +2465,15 @@ def cli(): minimizer_parser.add_argument("-l", "--log-file", type=str, default="kmerdb.log", help=argparse.SUPPRESS) minimizer_parser.set_defaults(func=get_minimizers) - + alignment_parser = subparsers.add_parser("alignment", help="Create a Smith-Waterman-like alignment using a reference fasta, its minimizers, and query sequence, and its minimizers.") + alignment_parser.add_argument("reference", type=str, help="Reference sequences in .fa/.fna/.fasta format") + alignment_parser.add_argument("query", type=str, help="Query sequences in .fa/.fna/.fasta format") + + alignment_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count") + alignment_parser.add_argument("--debug", action="store_true", default=False, help="Debug mode. Do not format errors and condense log") + alignment_parser.add_argument("-nl", "--num-log-lines", type=int, choices=config.default_logline_choices, default=50, help=argparse.SUPPRESS) + alignment_parser.add_argument("-l", "--log-file", type=str, default="kmerdb.log", help=argparse.SUPPRESS) + alignment_parser.set_defaults(func=get_alignments) usage_parser = subparsers.add_parser("usage", help="provide expanded usage information on parameters and functions provided") usage_parser.add_argument("method", type=str, choices=("usage", "help", "profile", "graph", "index", "shuf", "matrix", "distance"), help="Print expanded usage statement") diff --git a/kmerdb/minimizer.py b/kmerdb/minimizer.py index b6ce9a3..1a920ef 100644 --- a/kmerdb/minimizer.py +++ b/kmerdb/minimizer.py @@ -57,3 +57,82 @@ def select_lexicographical_minimizers(seq, k, window_size, kmer_ids): return minimizers + +def minimizers_from_fasta_and_kdb(fasta_file, kdb_file, window_size): + from Bio import SeqIO + import numpy as np + from kmerdb import fileutil, config, util, minimizer + import json + metadata = None + N = None + kmer_ids = None + fasta_seqs = [] + mins = None + + fa_sfx = os.path.splitext(fasta_file)[-1] + kdb_sfx = os.path.splitext(kdb_file)[-1] + + + if kdb_sfx != ".kdb" and kdb_sfx != ".kdbg": # A filepath with invalid suffix + raise IOError("Input .kdb filepath '{0}' does not end in '.kdb'".format(kdb_file)) + elif not os.path.exists(kdb_file): + raise IOError("Input .kdb filepath '{0}' does not exist on the filesystem".format(kdb_file)) + elif fa_sfx != ".fa" and fa_sfx != ".fna" and fa_sfx != ".fasta": + raise IOError("Input .fasta filepath '{0}' does not exist on the filesystem".format(arguments.fasta)) + + with open(fasta_file, mode="r") as ifile: + for record in SeqIO.parse(ifile, "fasta"): + fasta_seqs.append(str(record.seq)) + + with fileutil.open(kdb_file, mode='r', slurp=True) as kdb_in: + metadata = kdb_in.metadata + + + kmer_ids_dtype = metadata["kmer_ids_dtype"] + N = 4**metadata["k"] + if metadata["version"] != config.VERSION: + logger.log_it("KDB version is out of date, may be incompatible with current KDBReader class", "WARNING") + kmer_ids = kdb_in.kmer_ids + + for seq in fasta_seqs: + + minimizers = minimizer.select_lexicographical_minimizers(seq, metadata['k'], window_size, kmer_ids) + + # TODO: FIXME + + mins_as_list = list(map(int, minimizers)) + + # + if mins is None: + mins=mins_as_list + else: + for kmer_is_selected, i in enumerate(mins_as_list): + if mins_as_list[i] == 1: + pass + elif kmer_is_selected == 1: + mins[i] = 1 + + return minimizers + +def make_alignment(reference_fasta_seqs, query_fasta_seqs, k): + + + reference_minimizers = select_lexicographical_minimizers() + + +def read_minimizer_kdbi_index_file(kdbi1): + """ + Read a minimizer file into memory + """ + minimizers = [] + with open(kdbi1, 'r') as minimizer_index_file: + for l in minimizer_index_file: + + line = l.strip("\n").split("\t") + + assert len(line) == 2, "Index file corrupted. Use 'kmerdb minimizer' to regenerate the index file." + kmer_id, is_minimizer = line + + if is_minimizer == 1: + minimizers.append(kmer_id) + return minimizers