diff --git a/README.md b/README.md index 77f28ad..0f9bd4a 100644 --- a/README.md +++ b/README.md @@ -26,19 +26,31 @@ NOTE: Beta-stage `.bgzf` and `zlib` compatible k-mer count vectors and DeBruijn ## Summary -- [ x ] Homepage: -- [ x ] Quixart -- [ x ] Readme headr -- [ x ] OR -- [ x ] usage / off -- [ - - -k-mer counts from .fa(.gz)/.fq(.gz) sequence data can be stored in `.kdb` file format, a bgzf file similar to `.bam`. For those familiar with `.bam`, a `view` and `header` functions are provided. This file is compatible with `zlib`. +- [ x ] [Homepage:](https://matthewralston.github.io/kmerdb) +- [ x ] [Quick Start guide](https://matthewralston.github.io/kmerdb/quickstart) +- [ x ] `kmerdb usage subcommand_name` + - `profile` - Make k-mer count vectors/profiles, calculate unique k-mer counts, total k-mer counts, nullomer counts. Import to read/write NumPy arrays from profile object attributes. + - `graph` - Make a weighted edge list of kmer-to-kmer relationships, akin to a De Bruijn graph. + - `usage` - Display verbose input file/parameter and algorithm details of subcommands. + - `help` - Display verbose input file/parameter and algorithm details of subcommands. + - `view` - View .tsv count/frequency vectors with/without preamble. + - `header` - View YAML formatted header and aggregate counts + - `matrix` - Collate multiple profiles into a count matrix for dimensionality reduction, etc. + - `kmeans` - k-means clustering on a distance matrix via Scikit-learn or BioPython with kcluster distances + - `hierarchical` - hierarchical clustering on a distance matrix via BioPython with linkage choices + - `distance` - Distance matrices (from kmer count matrices) including SciPy distances, a Pearson correlation coefficient implemented in Cython, and Spearman rank correlation included as additional distances. 
+ - `index` - Create an index file for the kmer profile (Delayed:) + - `shuf` - Shuffle a k-mer count vector/profile (Delayed:) + - `version` - Display kmerdb version number + - `citation` - Silence citation suggestion +- [ x ] `kmerdb subcommand -h|--help` + + +k-mer counts from .fa(.gz)/.fq(.gz) sequence data can be computed and stored in the `.kdb` file format for access to metadata and count aggregation faculties. For those familiar with `.bam`, `view` and `header` subcommands are provided. The `.kdb` file is compatible with `zlib`. Install with `pip install kmerdb` -`kmerdb` is a Python CLI designed for k-mer counting and k-mer graph edge-lists. It addresses the ['k-mer' problem](https://en.wikipedia.org/wiki/K-mer) (substrings of length k) in a simple and performant manner. It stores the k-mer counts in a columnar format (input checksums, total and unique k-mer counts, nullomers, mononucleotide counts) with a YAML formatted metadata header in the first block of a `bgzf` formatted file. + Please see the [Quickstart guide](https://matthewralston.github.io/kmerdb/quickstart) for more information about the format, the library, and the project. 
@@ -48,191 +60,40 @@ Please see the [Quickstart guide](https://matthewralston.github.io/kmerdb/quicks ```bash # Usage --help option --debug mode kmerdb --help # [+ --debug mode] -kmerdb usage graph - -**** - o-O ||| -o---O ||| [|[ kmerdb ]|] -O---o ||| - O-o ||| version : v0.8.2 - O ||| - o-O ||| GitHub : https://github.com/MatthewRalston/kmerdb/issues -o---O ||| PyPI : https://pypi.org/project/kmerdb/ -O---o ||| Website : https://matthewralston.github.io/kmerdb - O-o ||| - lang : python - v : >= v3.7.4 - - package manger : pip - version : >= 24.0 - package root : /home/user/.local/share/virtualenvs/kdb-venv/lib/python3.12/site-packages/kmerdb - exe file : /home/user/.local/share/virtualenvs/kdb-venv/lib/python3.12/site-packages/kmerdb/__init__.py - - required packages : 9 - development packages : 9 - - ARGV : ['/home/user/.local/share/virtualenvs/kdb-venv/bin/kmerdb', 'usage', 'graph'] - -O---o - O-o - O - o-O -o---O -O---o - O-o - O - o-O -o---O -O---o - O-o - O - o-O -o---O - - - - -Beginning program... - - - - - [ name ] : graph - - description : create a edge list in (block) .gz format from .fasta|.fa or .fastq format. - - - - - : 4 column output : [ row idx | k-mer id node #1 | k-mer id node #2 | edge weight (adjacency count) ] - - : make a deBruijn graph, count the number of k-mer adjacencies, printing the edge list to STDOUT - - - - - +=============+====================+====================+=================================+ - < row idx | k-mer id node #1 | k-mer id node #2 | edge weight (adjacency count) > - | | | | | - | + - | - | - | - | - | - - - - - --------------------------- - - - kmerdb graph -k 12 input_1.fa [example_2.fastq] output.12.kdbg - - [-] inputs : - - Input file can .fastq (or .fa). - gzip. Output is a weighted edge list in .kdb format (gzipped .csv with YAML header) - - [-] parameters : - - uses < -k > for k-mer size, --quiet to reduce runtime, -v, -vv to control logging. 
-- - - - - [-] [ usage ] : kmerdb graph -k $K --quiet [input_2.fq.gz] - - - - - - - - - - - - - - - -name: arguments -type: array -items: -- name: k - type: int - value: choice of k-mer size -- name: quiet - type: flag - value: Write additional debug level information to stderr? - - - - -name: inputs -type: array -items: -- name: <.fasta|.fastq> - type: array - value: gzipped or uncompressed input .fasta or .fastq file(s) -- name: .kdbg - type: file - value: Output edge-list filepath. - - - - -name: features -type: array -items: -- name: k-mer count arrays, linear, produced as file is read through sliding window. - (Un)compressed support for .fa/.fq. - shortname: parallel faux-OP sliding window k-mer shredding - description: Sequential k-mers from the input .fq|.fa files are added to the De - Bruijn graph. In the case of secondary+ sequences in the .fa or considering NGS - (.fq) data, non-adjacent k-mers are pruned with a warning. Summary statistics - for the entire file are given for each file read, + a transparent data structure. -- name: k-mer neighbors assessed and tallied, creates a unsorted edge list, with weights - shortname: weighted undirected graph - description: an edge list of a De Bruijn graph is generated from all k-mers in the - forward direction of .fa/.fq sequences/reads. i.e. only truly neighboring k-mers - in the sequence data are added to the tally of the k-mer nodes of the de Bruijn - graph and the edges provided by the data. - -... - +kmerdb usage profile # + -# [ 3 main features: ] k-mer counts (kmerdb profile -k 12 []) 'De Bruijn' graph (kmerdb graph) [matrices, distances, and clustering!] +# [ 3 main features: ] [ 1. - k-mer counts ] # Create a [composite] profile of k-mer counts from sequence files. (.fasta|.fastq|.fa.gz|.fq.gz) -kmerdb profile -k 8 --output-name sample_1 sample_1_rep1.fq.gz [sample_1_rep2.fq.gz] -# Creates sample_1.8.kdb. --minK and --maxK options can be specified to create multiple k-mer profiles at once. 
-# Alternatively, can also take a plain-text samplesheet.txt with one filepath on each line. +kmerdb profile -vv -k 8 --output-name sample_1 sample_1_rep1.fq.gz [sample_1_rep2.fq.gz] +# Creates k-mer count vector/profile in sample_1.8.kdb. This is the input to other steps, including count matrix aggregation. --minK and --maxK options can be specified to create multiple k-mer profiles at once. + +# De Bruijn graphs (not a main feature yet, delayed) # Build a weighted edge list (+ node ids/counts = De Bruijn graph) -kmerdb graph -k 12 example_1.fq.gz example_2.fq.gz edges_1.kdbg +kmerdb graph -vv -k 12 example_1.fq.gz example_2.fq.gz edges_1.kdbg # View k-mer count vector -kmerdb view profile_1.8.kdb # -H for full header +kmerdb view -vv profile_1.8.kdb # -H for full header # Note: zlib compatibility #zcat profile_1.8.kdb # View header (config.py[kdb_metadata_schema#L84]) -kmerdb header profile_1.8.kdb +kmerdb header -vv profile_1.8.kdb -## Optional normalization, dim reduction, and distance matrix features: +## [ 3 main features: ] [ 2. Optional normalization, PCA/tSNE, and distance metrics ] # K-mer count matrix - Cython Pearson coefficient of correlation [ ssxy/sqrt(ssxx*ssyy) ] -kmerdb matrix pass *.8.kdb | kmerdb distance pearson STDIN +kmerdb matrix -vv from *.8.kdb | kmerdb distance pearson STDIN # -# kmerdb matrix DESeq2 *.8.kdb -# kmerdb matrix PCA *.8.kdb -# kmerdb matrix tSNE *.8.kdb -# # just makes a k-mer count matrix from k-mer count vectors. +# kmerdb matrix -vv DESeq2 *.8.kdb +# kmerdb matrix -vv PCA *.8.kdb +# kmerdb matrix -vv tSNE *.8.kdb +# # just makes a k-mer count matrix from k-mer count vectors. 
# # Distances on count matrices [ SciPy ] pdists + [ Cython ] Pearson correlation, scipy Spearman and scipy correlation pdist calculations are available ] @@ -242,18 +103,18 @@ kmerdb distance -h # usage: kmerdb distance [-h] [-v] [--debug] [-l LOG_FILE] [--output-delimiter OUTPUT_DELIMITER] [-p PARALLEL] [--column-names COLUMN_NAMES] [--delimiter DELIMITER] [-k K] # {braycurtis,canberra,chebyshev,cityblock,correlation,cosine,dice,euclidean,hamming,jaccard,jensenshannon,kulsinski,mahalanobis,matching,minkowski,pearson,rogerstanimotorusselrao,seuclidean,sokalmichener,sokalsneath,spearman,sqeuclidean,yule} [ ...] -# + +# [ 3 main features: ] [ 3. Clustering: k-means and hierarchical with matplotlib ] # Kmeans (sklearn, BioPython) -kmerdb kmeans -k 4 -i dist.tsv +kmerdb kmeans -vv -k 4 -i dist.tsv # BioPython Phylip tree + upgma -kmerdb hierarchical -i dist.tsv +kmerdb hierarchical -vv -i dist.tsv ``` - +`kmerdb` is a Python CLI designed for k-mer counting and k-mer graph edge-lists. It addresses the ['k-mer' problem](https://en.wikipedia.org/wiki/K-mer) (substrings of length k) in a simple and performant manner. It stores the k-mer counts in a columnar format (input checksums, total and unique k-mer counts, nullomers, mononucleotide counts) with a YAML formatted metadata header in the first block of a `bgzf` formatted file. ## Usage example @@ -370,7 +231,7 @@ Thanks to Rachel for the good memories and friendship. And Sophie too. veggies n Thanks to Yasmeen for the usual banter. Thanks to A for the newer banter. Thanks to Max, Robin, and Robert for the good memories in St. Louis. What's new? -Thanks to Fred for the good memories. +Thanks to Fred for the good memories. Hope you're on soon. Thanks to Nichole for the cookies and good memories. And your cute furballs too! Hope you're well Thanks to S for the lessons, convos, and even embarassing moments. You're kind of awesome to me. 
Thanks to a few friends I met in 2023 that reminded me I have a lot to learn about friendship, dating, and street smarts. diff --git a/TODO.org b/TODO.org index f5b3c14..0dba8ce 100644 --- a/TODO.org +++ b/TODO.org @@ -6,20 +6,61 @@ # .kdb files should be debrujin graph databases # The final prototype would be .bgzf format from biopython -* 7/12/24 [Roadmap] - D2 -* TODO 7/11/24 - [LIT REVIEW] -** D2 metrics, markov sequence prob review +* 8/1/24 Written Lit review, System Reconfigurations + +** Currently reconfiguring my system and redundancies + +** Making copies of my installation and configuration/install routines. Trying ubuntu 24.04 LTS version rather than Arch. Better build/configure/make predictability. + +** Current [TODO] + +*** NEXT Create kmerdb logo using GIMP +:LOGBOOK: +- State "IN-PROGRESS" from "NEXT" [2024-08-01 Thu 19:04] +:END: + +*** TODO Finish logo export + +*** Add logo to README + +*** Add logo to website + +*** + +* 7/28/24 [multiplication rule for Markov probability] +* needs to be written in documentation +** currently written into appmap as command 11, but not fleshed out. +** + +* [TRIAGE] : vsearch align with kmerdb +** Use k-mer frequencies to rank similarity to sequences in db. 
+** Proceed from seed match/mismatch to full dynamic programming Smith-Waterman w/ affine gap penalty +** + + +* 7/16/24 NEW metadata feature for graph subcommand +** graph subcommand needs node count explicitly, (k^n, where n is proportional to fastq size in number of reads) +*** graph in m = 4^k symbols* +** [new] metadata fields: unique_kmers, total_kmers, total_nodes, total_edges, possible_edges +*** AND also printed in final stats + +* IN PROGRESS 7/11/24 - [LIT REVIEW] +** IN PROGRESS D2 metrics, markov sequence prob review *** D2 = \sum(I(A, B)) **** -*** D2s = \sum{ \frac{ (X - Xbar)(Y - Ybar) }{ \sqrt{ (X - Xbar) + (Y - Ybar) } } (the squareroot of the sum of the standardized X's is the denominator, numerator is the product of the standardized X and Y counts, then the ratio is summed) +*** D2s = \sum{ \frac{ (X - \bar{X})(Y - \bar{Y}) }{ \sqrt{ (X - \bar{X}) + (Y - \bar{Y}) } } } (the square root of the sum of the standardized X's is the denominator, numerator is the product of the standardized X and Y counts, then the ratio is summed) **** *** D2* = \sum{ \frac{ (X - Xbar)(Y - Ybar) }{ mhat*nhat*pwX*pwY } } (w=word, hat = "adjusted"/translated = m - k, X and Y are counts from ) **** *** D2z = ( D2(A,B) - E[D2] ) / \sqrt( var(D2) ) **** -*** DELEGATED D2shepp = \sum{ \frac{ cwXi - (n-k+1)pwx * cwYi - (n-k+1)pwy }{ \sqrt{ (cwXi - (n-k+1)pwx)^{2} + (cwYi - (n-k+1)pwy)^{2}} } - CLOSED: [2024-07-12 Fri 21:49] +*** WAITING D2shepp = \sum{ \frac{ cwXi - (n-k+1)pwx * cwYi - (n-k+1)pwy }{ \sqrt{ (cwXi - (n-k+1)pwx)^{2} + (cwYi - (n-k+1)pwy)^{2}} } + :LOGBOOK: + - State "WAITING" from "DONE" [2024-08-01 Thu 18:49] + - State "DONE" from "CANCELED" [2024-08-01 Thu 18:49] + - State "CANCELED" from "DELEGATED" [2024-08-01 Thu 18:49] + :END: **** Reinert G. et al. "Alignment-free sequence comparison (1): statistics and power" J. Comput. Biol. 
2003 v16 (p1615-1634) **** Bibtex format below: @article{reinert2009alignment, @@ -37,13 +78,15 @@ ** TODO core species choices *** chicken farm estuary system changes (algination, asphyxia, microbiological changes *** anti-human leaky gut syndrome changes. +**** i.e. looking at the human leaky gut syndrome, but in reverse. What are bioprotective species and niches that provide resilience to leaky-gut syndrome **** TODO chemophore SMILES and gastrotoxic footprints *** pathology of lupus or auto-immune skin condition microbiome/metagenomic changes. *** vaginal microbiome changes *** ** Perspective 1 from reivew on distance metrics ** -* 7/10/24 - okay so path 1 [ 2 reviews + cython D2 metrics ] path 2 [ 2 reviews + graph algorithm ] +* IN PROGRESS 7/10/24 - [IMPORTANT] Needs a choice [cython d2 x graph algorithm features ]: +** [Key choice needed]: 1 [ 2 reviews + cython D2 metrics ] path 2 [ 2 reviews + graph algorithm ] ** cython d2 metrics including the delta distance : |pab(A)-pab(B)| (Karlin et al, tetra,tri,di- nucleotide frequencies) ** (describe Karlin delta, algorithm to calculate) diff --git a/kmerdb/__init__.py b/kmerdb/__init__.py index 46b209d..7c0e6ef 100644 --- a/kmerdb/__init__.py +++ b/kmerdb/__init__.py @@ -726,7 +726,7 @@ def get_matrix(arguments): # logger.debug("final_df should be set as normalized") # sys.exit(1) final_df = normalized - elif arguments.method == "pass": + elif arguments.method == "from": feature = 2 step = 4 @@ -1733,9 +1733,11 @@ def make_graph(arguments): tupley = (i, node1, node2, w) tupley_dl = np.array(tupley, dtype="uint64") if arguments.quiet is False: - print("{0}\t{1}\t{2}\t{3}".format(i, node1, node2, w)) + print("\t".join(list(tupley_dl))) + #print("{0}\t{1}\t{2}\t{3}".format(i, node1, node2, w)) # i, node1, node2, weight - kdbg_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, node1, node2, w)) + kdbg_out.write("{0}\n".format("\t".join(list(tupley_dl)))) + #kdbg_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, node1, node2, w)) 
finally: kdbg_out._write_block(kdbg_out._buffer) kdbg_out._handle.flush() @@ -1759,6 +1761,7 @@ def make_graph(arguments): sys.stderr.write("-"*30 + "\n") sys.stderr.write("Edges in file: {0}\n".format(N)) sys.stderr.write("Non-zero weights: {0}\n".format(int(np.count_nonzero(weights)))) + sys.stderr.write("Sum of node degrees in file: {0}".format(2*N)) sys.stderr.write("\nDone\n") logger.log_it("Done printing weighted edge list to .kdbg", "INFO") @@ -1810,7 +1813,7 @@ def profile(arguments): logger.log_it("Input suffix is .fasta", "DEBUG") elif ".fastq" in arguments.input[0] or ".fq" in arguments.input[0] or ".fastq.gz" in arguments.input[0] or ".fq.gz" in arguments.input[0]: logger.log_it("Input suffix is .fastq", "DEBUG") - elif ".txt" in arguments.input[0]: + elif ".txt" in arguments.input[0] or ".tsv" in arguments.input[0]: logger.log_it("Input suffix is .txt, possibly samplesheet", "DEBUG") samplesheet = arguments.input[0] @@ -2337,7 +2340,7 @@ def cli(): matrix_parser.add_argument("-n", default=None, type=int, help="The number of dimensions to reduce with PCA or t-SNE. 
DEFAULT: an elbow graph will be generated if -n is not provided to help the user choose -n") matrix_parser.add_argument("--perplexity", default=5, type=int, help="The choice of the perplexity for t-SNE based dimensionality reduction") - matrix_parser.add_argument("method", choices=["PCA", "tSNE", "DESeq2", "pass", "Frequency"], default=None, help="Choice of dimensionality reduction, normalization method (DESeq2), or pass (no action)") + matrix_parser.add_argument("method", choices=["PCA", "tSNE", "DESeq2", "from"], default=None, help="Choice of dimensionality reduction, normalization method (DESeq2), or matrix-from (collate data only)") matrix_parser.add_argument("input", nargs="*", default=[], metavar="", help="Two or more .kdb files, or another count matrix in tsv/csv") matrix_parser.set_defaults(func=get_matrix) diff --git a/kmerdb/appmap.py b/kmerdb/appmap.py index eac6a3b..0b54970 100644 --- a/kmerdb/appmap.py +++ b/kmerdb/appmap.py @@ -30,10 +30,11 @@ yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict) default_logline_choices = (20, 50, 100, 200) -PINNED_ISSUES = (132, 133, 137) +PINNED_ISSUES = (140, 141, 143) PROGRAM_BANNER = """ -**** + + o-O ||| o---O ||| [|[ kmerdb ]|] O---o ||| @@ -43,10 +44,79 @@ o---O ||| PyPI : https://pypi.org/project/kmerdb/ O---o ||| Website : https://matthewralston.github.io/kmerdb O-o ||| + + + + + + + + + +# ||||||||||||||||||||||||||||||||||||||| +# [ Usage ] : ||||||||||||||| +# ||||||||||||||||||||||||||||||||||||||| + + +# Check test/data for example fasta files. + + +# ----------------------------- +# Generate k-mer count profiles +# ----------------------------- +kmerdb profile -k 12 -o profile_1 input_1.fa.gz [input_2.fq] ... +... + + +# ----------------------------- +# Merge profiles +# ----------------------------- +kmerdb matrix pass profile_1.12.kdb profile_2.12.kdb profile_3.12.kdb ... 
> count_matrix.tsv + +# ----------------------------- +# Generate inter-profile distances +# ----------------------------- +kmerdb distance pearson count_matrix.tsv + + +# ----------------------------- +# Pipeline form +# ----------------------------- + +kmerdb matrix pass ... | kmerdb distance pearson STDIN > pearson_correlation_matrix.tsv + +# ----------------------------- +# Okay, how about PCA, t-SNE? +# ----------------------------- +kmerdb matrix PCA -n 25 ... > 25_pseudomer_count_profiles.tsv +# kmerdb matrix tSNE -n 25 + + +# ----------------------------- +# k-means clustering? +# ----------------------------- +kmerdb kmeans -k 4 --distance e -i input_distance_matrix.tsv # Using 'e' for Euclidean distance with Biopython. Check the source, Biopython RTD, and sklearn RTD. +# Produces + +# ----------------------------- +# okay, now straight-up hierarchical clustering: +# ----------------------------- +kmerdb hierarchical -i input_distance_matrix.tsv --method complete # Uses complete linkage + + + """.format(config.VERSION) INTERPRETER = " lang : python\n" # hardcoded +# print_program_header +# sys.stderr.write(PROGRAM_BANNER) +# sys.stderr.write(INTERPRETER) +# sys.stderr.write(self.VERSION_HARDCODED) +# sys.stderr.write(self.PACKAGE_MANAGER) + + + GITHUB_LOGO = """ .--------------------------------------------------. | .mmMMMMMMMMMMMMMmm. | @@ -457,7 +527,7 @@ | """ -command_2_parameters = "uses < -k > for k-mer size, --quiet to reduce runtime, -v, -vv to control logging. --" +command_2_parameters = "uses < -k > for k-mer size, --quiet to reduce runtime, -v, -vv to control logging." command_2_inputs = "Input file can .fastq (or .fa). - gzip. 
Output is a weighted edge list in .kdb format (gzipped .csv with YAML header)" command_2_usage = "kmerdb graph -k $K --quiet [input_2.fq.gz] " @@ -1297,7 +1367,7 @@ command_7_description = "K-means clustering with biopython or scikit-learn" -command_7_description_long = "Produces eblow graph if k is not determined a-priori. Uses matplotlib for graphics." +command_7_description_long = "Produces eblow graph if k is not determined a-priori. Uses matplotlib for graphics. Prints {0} and {1} as primary outputs, in addition to the 'kmeans_elbow_graph.png' which is printed if no k is supplied.".format(config.pca_variance_fig_filepath, config.kmeans_clustering_fig_filepath) command_7_parameters = "Use -k to control clustering with k-means. Choice of 'sklearn' or 'biopython' k-means clustering. Choice of distance metrics offered by kcluster" command_7_inputs = "Input is a .tsv file. Use STDIN to read input from standard input".format(config.VERSION) command_7_usage = "kmerdb kmeans -k 5 -i sklearn" @@ -1439,7 +1509,7 @@ command_8_description = "Hierarchical clustering with biopython" -command_8_description_long = "Uses matplotlib for graphics." +command_8_description_long = "Uses matplotlib for graphics. Creates {0} and {1} as primary outputs.".format(config.hierarchical_clustering_dendrogram_fig_filepath, config.upgma_tree_phylip) command_8_parameters = "-m|--method determines linkage fitting (see --help for details)" command_8_inputs = "Input is a .tsv file. Use STDIN to read input from standard input".format(config.VERSION) command_8_usage = "kmerdb hierarchical -m ward -i " @@ -1867,6 +1937,178 @@ }) + + + + + + + +command_11_description = "K-mer Markov sequence probability feature (Deprecated)" +command_11_description_long = """ + Uses conditional probabilities and multiplication rule along with Markov model of sequence to use likelihood/odds ratios to test likelihood of the sequence(s) given the inputs. Unadjusted for multiple-hypothesis testing. 
+ +Conditional probability : + +P(X|Y) = P(XY)/P(Y) + + +Multiplication rule : + +P(X) = P(an|an-1,an-2,...a1) a { N , X = an-1,an-2,...,a1 + + + + +""" +command_11_parameters = "inputs may be one or more fasta files, and the .kdb files needed for the model's output probability." +command_11_inputs = "Input is a v{0} .kdb count vector file" +command_11_usage = "kmerdb prob --db [--db kmer_count_vector_2.kdb] [query_sequences_2.fasta.gz]" + + + +COMMAND_11_BANNER = """ + + + + + + + + + + + + +[--------------------------------------------------------------------------------------] + + + + + + + + + [ n a m e ] : - {0} + + description : {1} + +{2} + + + + +-------------------------- + + kmerdb prob <--db input.kdb> sequences.fasta[.gz] + + [-] inputs : + + {3} + + [-] parameters : + + {4} + + + + [-] [ usage ] : {5} + + + + + + + + + + + + + + + + + + +[--------------------------------------------------------------------------------------] +""".format(command_11_name, command_11_description, command_11_description_long, command_11_inputs, command_11_parameters, command_11_usage) + + + +COMMAND_11_PARAMS = OrderedDict({ + "name": "arguments", + "type": "array", + "items": [ + { + "name": "K-mer database file.", + "type": "file", + "value": "[NOTE] : multiple may be specified. | The k-mer frequencies to use in the Markov model probability calculations, and likelihood/odds-ratio tests" + } + ] +}) + + +COMMAND_11_INPUTS = OrderedDict({ + "name": "inputs", + "type": "array", + "items": [ + { + "name": "Input .fa|.fasta|.fa.gz files.", + "type": "array", + "value": "File(s) to query against the k-mer database." + } + + ] +}) + +COMMAND_11_FEATURES = OrderedDict({ + "name": "features", + "type": "array", + "items": [ + OrderedDict({ + "name": "[FIXME! 
7/28/24 Hi fross, glhf!]", + "shortname": "", + "description" : "" + }), + OrderedDict({ + "name": "", + "shortname": "", + "description": "(Deprecated)" + }) + ] +}) + +COMMAND_11_STEPS = OrderedDict({ + "name": "steps", + "type": "array", + "items": [ + OrderedDict({ + "name": "", + "shortname": "", + "description": "(uhhhh...)", + }), + OrderedDict({ + "name": "", + "shortname": "Shuffle k-mer counts", + "description": "(Deprecated)" + }) + + ] + +}) + + + + + +################################################### + +# F i n a l c o m m a n d a g g r e g a t e + +################################################### + + ALL_PARAMS = { "profile": COMMAND_1_PARAMS["items"], "graph": COMMAND_2_PARAMS["items"], @@ -2354,7 +2596,7 @@ def print_github_block(self): sys.stderr.write(THREE_LINES) - sys.stderr.write(DNA_SPACER_1) + sys.stderr.write(DNA_SPACER_lol) sys.stderr.write(THREE_LINES) diff --git a/kmerdb/config.py b/kmerdb/config.py index ea4971e..f9fe041 100644 --- a/kmerdb/config.py +++ b/kmerdb/config.py @@ -17,7 +17,7 @@ -VERSION="0.8.5" +VERSION="0.8.6" REQUIRES_PYTHON="3.7.4" header_delimiter = "\n" + ("="*24) + "\n" @@ -231,7 +231,7 @@ kmeans_clustering_fig_filepath = "kmeans_clustering_of_kmer_profiles.png" #ecopy_rarefaction_fig_filepath = "ecopy_rarefaction_curve.png" hierarchical_clustering_dendrogram_fig_filepath = "dendrogram.png" -spearman_upgma_tree_phy = "kdb_spearman_upgma_tree.phyloxml" +upgma_tree_phylip = "kdb_upgma_tree.phyloxml" # files = (pca_variance_fig_filepath, kmeans_elbow_graph_fig_filepath, kmeans_clustering_fig_filepath, ecopy_rarefaction_fig_filepath, hierarchical_clustering_dendrogram_fig_filepath) ####################################################### diff --git a/kmerdb/kmer.py b/kmerdb/kmer.py index 7c7f661..b485843 100644 --- a/kmerdb/kmer.py +++ b/kmerdb/kmer.py @@ -84,7 +84,7 @@ ############################# class Kmers: - """A wrapper class to pass variables through the multiprocessing pool + """A wrapper class to pass 
variables through the multiprocessing pool. Methods in this class are (c) Matthew Ralston as above/throughout. That said, the k-mer bit-shifting trick :ivar k: The choice of k to shred with :ivar strand_specific: Include k-mers from forward strand only (TO BE DEPRECATED) @@ -343,6 +343,17 @@ def kmer_to_id(s): Therefore, this method does not need to be wrapped in the k-mer class + Acknowledgements for the 'idx = idx << 2' bit-shifting trick goes to the authors of kPAL. + + @article{anvar2014determining, + title={Determining the quality and complexity of next-generation sequencing data without a reference genome}, + author={Anvar, Seyed Yahya and Khachatryan, Lusine and Vermaat, Martijn and van Galen, Michiel and Pulyakhina, Irina and Ariyurek, Yavuz and Kraaijeveld, Ken and den Dunnen, Johan T and de Knijff, Peter and ’t Hoen, Peter AC and others}, + journal={Genome biology}, + volume={15}, + pages={1--15}, + year={2014}, + publisher={Springer} + } :param s: The input k-mer as string diff --git a/pyproject.toml b/pyproject.toml index 0519881..1a63d50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ build-backend = "setuptools.build_meta" [project] name = "kmerdb" -version = "0.8.5" +version = "0.8.6" description = "Yet another correction to the 'yet another correction to just a k-mer counter...'" readme = "README.md" authors = [{name="Matt Ralston ", email="mralston.development@gmail.com"}] @@ -69,7 +69,7 @@ classifiers = [ #"Topic :: Software Development :: Assemblers" keywords = ["bioinformatics", "fastq", "fasta", "k-mer", "kmer"] dependencies = [ - "numpy>=1.21.2", + "numpy>=2.0.0", "PyYAML>=6.0.1", # was before the influence # jsonschema>=4.18.0a4 "jsonschema>=4.17.3", # what was going on here? inflection points. 
and ms tori @@ -77,7 +77,7 @@ dependencies = [ "Cython>=3.0.8", "biopython>=1.81", "scipy>=1.11.4", - "scikit-learn==1.2.2", + "scikit-learn==1.5.1", "matplotlib>=3.5.3", "pandas>=2.2.2", "setuptools>=69.2.0", diff --git a/setup.py b/setup.py index 4c6da37..4c2525e 100644 --- a/setup.py +++ b/setup.py @@ -109,12 +109,12 @@ def can_import(module_name): DESCRIPTION = 'Yet another kmer library for Python' long_description = 'See README.md for details' URL = 'https://github.com/MatthewRalston/kmerdb' -CURRENT_RELEASE = "https://github.com/MatthewRalston/kmerdb/archive/v0.8.4.tar.gz" +CURRENT_RELEASE = "https://github.com/MatthewRalston/kmerdb/archive/v0.8.6.tar.gz" EMAIL = 'mralston.development@gmail.com' AUTHOR = 'Matt Ralston' #REQUIRES_PYTHON = ">=3.7.4" REQUIRES_PYTHON = '>=3.12.2' -VERSION = "0.8.5" +VERSION = "0.8.6" KEYWORDS = ["bioinformatics", "fastq", "fasta", "k-mer", "kmer"] CLASSIFIERS = [ "Development Status :: 1 - Planning",