From 05d61277f1531d4dc6d6de26c37536979f579896 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 9 Jul 2024 12:22:50 +0200 Subject: [PATCH] print bits/kmer using close-form formula --- include/builder/build.cpp | 7 ++++++- include/info.cpp | 43 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/include/builder/build.cpp b/include/builder/build.cpp index 7f503c1..0cec24b 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.cpp @@ -24,7 +24,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b throw std::runtime_error("m must be less <= " + std::to_string(constants::max_m) + " but got m = " + std::to_string(build_config.m)); } - if (build_config.m > build_config.k) throw std::runtime_error("m must be < k"); + if (build_config.m > build_config.k) throw std::runtime_error("m must be <= k"); if (build_config.l >= constants::max_l) { throw std::runtime_error("l must be < " + std::to_string(constants::max_l)); } @@ -74,6 +74,11 @@ void dictionary::build(std::string const& filename, build_configuration const& b mm::file_source input(data.minimizers.get_minimizers_filename(), mm::advice::sequential); minimizers_tuples_iterator iterator(input.data(), input.data() + input.size()); + + if (build_config.verbose) { + std::cout << "num_minimizers " << data.minimizers.num_minimizers() << std::endl; + } + m_minimizers.build(iterator, data.minimizers.num_minimizers(), build_config); input.close(); } diff --git a/include/info.cpp b/include/info.cpp index c1fcee1..fa3069e 100644 --- a/include/info.cpp +++ b/include/info.cpp @@ -23,13 +23,51 @@ uint64_t skew_index::print_info() const { return num_kmers_in_skew_index; } +double bits_per_kmer_formula(uint64_t k, /* kmer length */ + uint64_t m, /* minimizer length */ + uint64_t n, /* num. kmers */ + uint64_t M) /* num. strings in SPSS */ +{ + /* + Caveats: + 1. we assume an alphabet of size 4 + 2. this assumes a random minimizer scheme, so num. super-kmers is ~ 2n/(k-m+2) + 3. we neglect lower order terms and skew index space + 4. no canonical parsing + */ + + assert(k > 0); + assert(k >= m); + + const uint64_t N = n + M * (k - 1); // num. symbols in SPSS + + // double num_minimizers = (2.0 * n) / (k - m + 2); // not distinct, hence num. of super-kmers + // std::cout << "num_minimizers = " << num_minimizers << std::endl; + // std::cout << "minimizers: " << (3.0 * num_minimizers) / n << " [bits/kmer]" << std::endl; + // std::cout << "pieces: " << (M * (2.0 + std::ceil(std::log2(static_cast(N) / M)))) / n + // << " [bits/kmer]" << std::endl; + // std::cout << "num_super_kmers_before_bucket: " << (2.0 * num_minimizers) / n << " [bits/kmer] + // " + // << std::endl; + // std::cout << "offsets: " << (std::ceil(std::log2(N)) * num_minimizers) / n << " [bits/kmer]" + // << std::endl; + // std::cout << "strings: " << (2.0 * N) / n << " [bits/kmer]" << std::endl; + + double num_bits = 2 * n * (1.0 + (5.0 + std::ceil(std::log2(N))) / (k - m + 2)) + + M * (2 * k + std::ceil(std::log2(static_cast(n) / M + k - 1))); + + return num_bits / n; +} + void dictionary::print_space_breakdown() const { const uint64_t num_bytes = (num_bits() + 7) / 8; std::cout << "total index size: " << num_bytes << " [B] -- " << essentials::convert(num_bytes, essentials::MB) << " [MB]" << '\n'; std::cout << "SPACE BREAKDOWN:\n"; std::cout << " minimizers: " << static_cast(m_minimizers.num_bits()) / size() - << " [bits/kmer]\n"; + << " [bits/kmer] (" + << static_cast(m_minimizers.num_bits()) / m_minimizers.size() + << " [bits/key])\n"; std::cout << " pieces: " << static_cast(m_buckets.pieces.num_bits()) / size() << " [bits/kmer]\n"; std::cout << " num_super_kmers_before_bucket: " @@ -47,6 +85,9 @@ void dictionary::print_space_breakdown() const { std::cout << " --------------\n"; std::cout << " total: " << static_cast(num_bits()) / size() << " [bits/kmer]" << std::endl; + + std::cout << " Close-form formula: " << bits_per_kmer_formula(k(), m(), size(), num_contigs()) + << " [bits/kmer]" << std::endl; } void dictionary::print_info() const {