Merge pull request #48 from jermp/master

bring the 'submodule-integration-for-piscem' branch up to date with master
jermp · Jul 26, 2024 · dc29715 · dc29715
2 parents 277254e + fdfa686
commit dc29715
Show file tree

Hide file tree

Showing 31 changed files with 453 additions and 276 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -23,6 +23,13 @@ if (UNIX AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64"))
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mbmi2 -msse4.2") # for hardware popcount and pdep
 endif()
 
+if (SSHASH_USE_MAX_KMER_LENGTH_63)
+  MESSAGE(STATUS "SSHash uses a maximum kmer length of 63")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSSHASH_USE_MAX_KMER_LENGTH_63")
+else()
+  MESSAGE(STATUS "SSHash uses a maximum kmer length of 31")
+endif()
+
 if (SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING)
   MESSAGE(STATUS "SSHash maps {'A','a'}->0, {'C','c'}->1, {'G','g'}->2, and {'T','t'}->3")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING")
@@ -49,6 +56,10 @@ if (UNIX)
 
 endif()
 
+include_directories(.) # all include paths relative to parent directory
+include_directories(external/pthash)
+include_directories(external/pthash/external/essentials/include)
+
 set(Z_LIB_SOURCES
   include/gz/zip_stream.cpp
 )

diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ This is a compressed dictionary data structure for k-mers
 The data structure is described in the following papers:
 
 * [Sparse and Skew Hashing of K-Mers](https://doi.org/10.1093/bioinformatics/btac245) [1]
-* [On Weighted K-Mers Dictionaries](https://almob.biomedcentral.com/articles/10.1186/s13015-023-00226-2) [2,3]
+* [On Weighted K-Mer Dictionaries](https://almob.biomedcentral.com/articles/10.1186/s13015-023-00226-2) [2,3]
 
 **Please, cite these papers if you use SSHash.**
 
@@ -106,6 +106,12 @@ If you want to use the "traditional" encoding
 for compatibility issues with other software, then
 compile SSHash with the flag `-DSSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING=On`.
 
+### K-mer Length
+
+By default, SSHash uses a maximum k-mer length of 31.
+If you want to support k-mer lengths up to (and including) 63,
+compile the library with the flag `-DSSHASH_USE_MAX_KMER_LENGTH_63=On`.
+
 Dependencies
 ------------
 
@@ -151,7 +157,7 @@ Examples
 --------
 
 For the examples, we are going to use some collections
-of *stitched unitigs* from the directory `../data/unitigs_stitched`.
+of *stitched unitigs* from the directory `data/unitigs_stitched`.
 
 **Important note:** The value of k used during the formation of the unitigs
 is indicated in the name of each file and the dictionaries
@@ -161,7 +167,7 @@ For example, `data/unitigs_stitched/ecoli4_k31_ust.fa.gz` indicates the value k
 
 For all the examples below, we are going to use k = 31.
 
-(The subdirectory `../data/unitigs_stitched/with_weights` contains some files with k-mers' weights too.)
+(The directory `data/unitigs_stitched/with_weights` contains some files with k-mers' weights too.)
 
 In the section [Input Files](#input-files), we explain how
 such collections of stitched unitigs can be obtained from raw FASTA files.
@@ -210,26 +216,11 @@ Below a comparison between the dictionary built in Example 2 (not canonical)
 and the one just built (Example 3, canonical).
 
     ./sshash query -i salmonella_100.index -q ../data/queries/SRR5833294.10K.fastq.gz
-    index size: 10.3981 [MB] (6.36232 [bits/kmer])
-    ==== query report:
-    num_kmers = 460000
-    num_positive_kmers = 46 (0.01%)
-    num_searches = 42/46 (91.3043%)
-    num_extensions = 4/46 (8.69565%)
-    elapsed = 229.159 millisec / 0.229159 sec / 0.00381932 min / 498.172 ns/kmer
 
     ./sshash query -i salmonella_100.canon.index -q ../data/queries/SRR5833294.10K.fastq.gz
-    index size: 11.0657 [MB] (6.77083 [bits/kmer])
-    ==== query report:
-    num_kmers = 460000
-    num_positive_kmers = 46 (0.01%)
-    num_searches = 42/46 (91.3043%)
-    num_extensions = 4/46 (8.69565%)
-    elapsed = 107.911 millisec / 0.107911 sec / 0.00179852 min / 234.589 ns/kmer
 
-We see that the canonical dictionary is twice as fast as the regular dictionary
-for low-hit workloads,
-even on this tiny example, for only +0.4 bits/k-mer.
+The canonical dictionary can be twice as fast as the regular dictionary
+for low-hit workloads, even on this tiny example, for only +0.4 bits/k-mer.
 
 ### Example 4
 
@@ -374,6 +365,6 @@ Author
 References
 -----
 * [1] Giulio Ermanno Pibiri. [Sparse and Skew Hashing of K-Mers](https://doi.org/10.1093/bioinformatics/btac245). Bioinformatics. 2022.
-* [2] Giulio Ermanno Pibiri. [On Weighted K-Mers Dictionaries](https://drops.dagstuhl.de/opus/volltexte/2022/17043/). International Workshop on Algorithms in Bioinformatics (WABI). 2022.
-* [3] Giulio Ermanno Pibiri. [On Weighted K-Mers Dictionaries](https://almob.biomedcentral.com/articles/10.1186/s13015-023-00226-2). Algorithms for Molecular Biology (ALGOMB). 2023.
+* [2] Giulio Ermanno Pibiri. [On Weighted K-Mer Dictionaries](https://drops.dagstuhl.de/opus/volltexte/2022/17043/). International Workshop on Algorithms in Bioinformatics (WABI). 2022.
+* [3] Giulio Ermanno Pibiri. [On Weighted K-Mer Dictionaries](https://almob.biomedcentral.com/articles/10.1186/s13015-023-00226-2). Algorithms for Molecular Biology (ALGOMB). 2023.
 * [4] Schmidt, S., Khan, S., Alanko, J., Pibiri, G. E., and Tomescu, A. I. [Matchtigs: minimum plain text representation of k-mer sets](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-023-02968-z). Genome Biology 24, 136. 2023.
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -30,7 +30,7 @@ The query times are relative to the following configuration:
 
 ### Space in bits/kmer
 
-| Dictionary |elegans ||| cod   ||| kesterl ||| human |||
+| Dictionary |elegans ||| cod   ||| kestrel ||| human |||
 |:------------------|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:------:|:-----:|:-------:|:-----:|
 |                   | k=31 | k=47 | k=63 | k=31 | k=47 | k=63 | k=31 | k=47 | k=63 | k=31 | k=47 | k=63 |
 | SSHash, **regular**   | 5.86 | 4.29 | 3.51 | 7.84 | 5.17 | 4.26 | 7.53 | 4.67 | 3.76 | 8.70 | 5.65 | 4.64 |

diff --git a/external/pthash b/external/pthash
diff --git a/include/bit_vector_iterator.hpp b/include/bit_vector_iterator.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "../external/pthash/include/encoders/bit_vector.hpp"
+#include "external/pthash/include/encoders/bit_vector.hpp"
 #include "util.hpp"
 
 namespace sshash {

diff --git a/include/buckets.hpp b/include/buckets.hpp
@@ -45,9 +45,13 @@ struct buckets {
         return {res, contig_end};
     }
 
-    uint64_t contig_length(uint64_t contig_id) const {
-        uint64_t length = pieces.access(contig_id + 1) - pieces.access(contig_id);
-        return length;
+    /* Return where the contig begins and ends in strings. */
+    std::pair<uint64_t, uint64_t>  // [begin, end)
+    contig_offsets(const uint64_t contig_id) const {
+        uint64_t begin = pieces.access(contig_id);
+        uint64_t end = pieces.access(contig_id + 1);
+        assert(end > begin);
+        return {begin, end};
     }
 
     kmer_t contig_prefix(uint64_t contig_id, uint64_t k) const {
@@ -174,82 +178,91 @@ struct buckets {
     struct iterator {
         iterator() {}
 
-        iterator(buckets const* ptr, uint64_t kmer_id, uint64_t k, uint64_t num_kmers)
-            : m_buckets(ptr), m_kmer_id(kmer_id), m_k(k), m_num_kmers(num_kmers) {
-            bv_it = bit_vector_iterator(m_buckets->strings, -1);
-            offset = m_buckets->id_to_offset(m_kmer_id, k);
-            auto [pos, piece_end] = m_buckets->pieces.next_geq(offset);
-            if (piece_end == offset) pos += 1;
-            pieces_it = m_buckets->pieces.at(pos);
+        iterator(buckets const* ptr,                                        //
+                 const uint64_t begin_kmer_id, const uint64_t end_kmer_id,  // [begin,end)
+                 const uint64_t k)
+            : m_buckets(ptr)
+            , m_begin_kmer_id(begin_kmer_id)
+            , m_end_kmer_id(end_kmer_id)
+            , m_k(k)  //
+        {
+            m_bv_it = bit_vector_iterator(m_buckets->strings, -1);
+            m_offset = m_buckets->id_to_offset(m_begin_kmer_id, k);
+            auto [pos, piece_end] = m_buckets->pieces.next_geq(m_offset);
+            if (piece_end == m_offset) pos += 1;
+            m_pieces_it = m_buckets->pieces.at(pos);
             next_piece();
-            ret.second.resize(k, 0);
+            m_ret.second.resize(m_k, 0);
         }
 
-        bool has_next() const { return m_kmer_id != m_num_kmers; }
+        bool has_next() const { return m_begin_kmer_id != m_end_kmer_id; }
 
         std::pair<uint64_t, std::string> next() {
-            if (offset == next_offset - m_k + 1) {
-                offset = next_offset;
+            if (m_offset == m_next_offset - m_k + 1) {
+                m_offset = m_next_offset;
                 next_piece();
             }
 
-            while (offset != next_offset - m_k + 1) {
-                ret.first = m_kmer_id;
-                if (clear) {
-                    util::uint_kmer_to_string(read_kmer, ret.second.data(), m_k);
+            while (m_offset != m_next_offset - m_k + 1) {
+                m_ret.first = m_begin_kmer_id;
+                if (m_clear) {
+                    util::uint_kmer_to_string(m_read_kmer, m_ret.second.data(), m_k);
                 } else {
-                    memmove(ret.second.data(), ret.second.data() + 1, m_k - 1);
-                    ret.second[m_k - 1] = util::uint64_to_char(last_two_bits);
+                    memmove(m_ret.second.data(), m_ret.second.data() + 1, m_k - 1);
+                    m_ret.second[m_k - 1] = util::uint64_to_char(m_last_two_bits);
                 }
-                clear = false;
-                read_kmer >>= 2;
-                last_two_bits = bv_it.get_next_two_bits();
-                read_kmer += last_two_bits << (2 * (m_k - 1));
-                ++m_kmer_id;
-                ++offset;
-                return ret;
+                m_clear = false;
+                m_read_kmer >>= 2;
+                m_last_two_bits = m_bv_it.get_next_two_bits();
+                m_read_kmer += m_last_two_bits << (2 * (m_k - 1));
+                ++m_begin_kmer_id;
+                ++m_offset;
+                return m_ret;
             }
 
             return next();
         }
 
     private:
-        std::pair<uint64_t, std::string> ret;
+        std::pair<uint64_t, std::string> m_ret;
         buckets const* m_buckets;
-        uint64_t m_kmer_id, m_k, m_num_kmers;
-        uint64_t offset;
-        uint64_t next_offset;
-        bit_vector_iterator bv_it;
-        ef_sequence<true>::iterator pieces_it;
+        uint64_t m_begin_kmer_id, m_end_kmer_id;
+        uint64_t m_k;
+        uint64_t m_offset;
+        uint64_t m_next_offset;
+        bit_vector_iterator m_bv_it;
+        ef_sequence<true>::iterator m_pieces_it;
 
-        kmer_t read_kmer;
-        uint64_t last_two_bits;
-        bool clear;
+        kmer_t m_read_kmer;
+        uint64_t m_last_two_bits;
+        bool m_clear;
 
         void next_piece() {
-            bv_it.at(2 * offset);
-            next_offset = pieces_it.next();
-            assert(next_offset > offset);
-            read_kmer = bv_it.take(2 * m_k);
-            clear = true;
+            m_bv_it.at(2 * m_offset);
+            m_next_offset = m_pieces_it.next();
+            assert(m_next_offset > m_offset);
+            m_read_kmer = m_bv_it.take(2 * m_k);
+            m_clear = true;
         }
     };
 
-    iterator at(uint64_t kmer_id, uint64_t k, uint64_t size) const {
-        return iterator(this, kmer_id, k, size);
+    iterator at(const uint64_t begin_kmer_id, const uint64_t end_kmer_id, const uint64_t k) const {
+        return iterator(this, begin_kmer_id, end_kmer_id, k);
     }
 
     uint64_t num_bits() const {
         return pieces.num_bits() + num_super_kmers_before_bucket.num_bits() +
                8 * (offsets.bytes() + strings.bytes());
     }
 
+    template <typename Visitor>
+    void visit(Visitor& visitor) const {
+        visit_impl(visitor, *this);
+    }
+
     template <typename Visitor>
     void visit(Visitor& visitor) {
-        visitor.visit(pieces);
-        visitor.visit(num_super_kmers_before_bucket);
-        visitor.visit(offsets);
-        visitor.visit(strings);
+        visit_impl(visitor, *this);
     }
 
     ef_sequence<true> pieces;
@@ -258,6 +271,13 @@ struct buckets {
     pthash::bit_vector strings;
 
 private:
+    template <typename Visitor, typename T>
+    static void visit_impl(Visitor& visitor, T&& t) {
+        visitor.visit(t.pieces);
+        visitor.visit(t.num_super_kmers_before_bucket);
+        visitor.visit(t.offsets);
+        visitor.visit(t.strings);
+    }
     bool is_valid(lookup_result res) const {
         return (res.contig_size != constants::invalid_uint64 and
                 res.kmer_id_in_contig < res.contig_size) and

diff --git a/include/builder/build.cpp b/include/builder/build.cpp
@@ -1,5 +1,5 @@
-#include "../dictionary.hpp"
-#include "../../external/pthash/external/essentials/include/essentials.hpp"
+#include "include/dictionary.hpp"
+#include "external/pthash/external/essentials/include/essentials.hpp"
 #include "util.hpp"
 
 /** build steps **/
@@ -24,7 +24,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b
         throw std::runtime_error("m must be less <= " + std::to_string(constants::max_m) +
                                  " but got m = " + std::to_string(build_config.m));
     }
-    if (build_config.m > build_config.k) throw std::runtime_error("m must be < k");
+    if (build_config.m > build_config.k) throw std::runtime_error("m must be <= k");
     if (build_config.l >= constants::max_l) {
         throw std::runtime_error("l must be < " + std::to_string(constants::max_l));
     }
@@ -74,6 +74,11 @@ void dictionary::build(std::string const& filename, build_configuration const& b
         mm::file_source<minimizer_tuple> input(data.minimizers.get_minimizers_filename(),
                                                mm::advice::sequential);
         minimizers_tuples_iterator iterator(input.data(), input.data() + input.size());
+
+        if (build_config.verbose) {
+            std::cout << "num_minimizers " << data.minimizers.num_minimizers() << std::endl;
+        }
+
         m_minimizers.build(iterator, data.minimizers.num_minimizers(), build_config);
         input.close();
     }

diff --git a/include/builder/build_index.hpp b/include/builder/build_index.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
 #include "file_merging_iterator.hpp"
-#include "../buckets_statistics.hpp"
+#include "include/buckets_statistics.hpp"
 
 namespace sshash {
+49 −41		CMakeLists.txt
+6 −2		README.md
+1 −1		external/essentials
+4 −5		include/builders/external_memory_builder_partitioned_phf.hpp
+6 −7		include/builders/external_memory_builder_single_phf.hpp
+17 −14		include/builders/internal_memory_builder_partitioned_phf.hpp
+49 −47		include/builders/internal_memory_builder_single_phf.hpp
+4 −4		include/builders/search.hpp
+23 −2		include/builders/util.hpp
+14 −5		include/encoders/bit_vector.hpp
+14 −5		include/encoders/compact_vector.hpp
+16 −6		include/encoders/darray.hpp
+15 −6		include/encoders/ef_sequence.hpp
+48 −10		include/encoders/encoders.hpp
+2 −2		include/encoders/sdc_sequence.hpp
+3 −3		include/encoders/util.hpp
+50 −11		include/partitioned_phf.hpp
+3 −3		include/pthash.hpp
+32 −12		include/single_phf.hpp
+32 −9		include/utils/bucketers.hpp
+2 −2		include/utils/util.hpp
+3 −3		src/build.cpp
+6 −5		src/example.cpp
+2 −13		src/util.hpp
+3 −3		test/common.hpp