Skip to content

Commit

Permalink
expose per-contig iteration
Browse files Browse the repository at this point in the history
  • Loading branch information
jermp committed Mar 22, 2024
1 parent e1d2e7a commit 11dc8f2
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 54 deletions.
97 changes: 54 additions & 43 deletions include/buckets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,13 @@ struct buckets {
return {res, contig_end};
}

uint64_t contig_length(uint64_t contig_id) const {
uint64_t length = pieces.access(contig_id + 1) - pieces.access(contig_id);
return length;
/* Return where the contig begins and ends in strings. */
std::pair<uint64_t, uint64_t> // [begin, end)
contig_offsets(const uint64_t contig_id) const {
uint64_t begin = pieces.access(contig_id);
uint64_t end = pieces.access(contig_id + 1);
assert(end > begin);
return {begin, end};
}

kmer_t contig_prefix(uint64_t contig_id, uint64_t k) const {
Expand Down Expand Up @@ -174,69 +178,76 @@ struct buckets {
struct iterator {
iterator() {}

iterator(buckets const* ptr, uint64_t kmer_id, uint64_t k, uint64_t num_kmers)
: m_buckets(ptr), m_kmer_id(kmer_id), m_k(k), m_num_kmers(num_kmers) {
bv_it = bit_vector_iterator(m_buckets->strings, -1);
offset = m_buckets->id_to_offset(m_kmer_id, k);
auto [pos, piece_end] = m_buckets->pieces.next_geq(offset);
if (piece_end == offset) pos += 1;
pieces_it = m_buckets->pieces.at(pos);
iterator(buckets const* ptr, //
const uint64_t begin_kmer_id, const uint64_t end_kmer_id, // [begin,end)
const uint64_t k)
: m_buckets(ptr)
, m_begin_kmer_id(begin_kmer_id)
, m_end_kmer_id(end_kmer_id)
, m_k(k) //
{
m_bv_it = bit_vector_iterator(m_buckets->strings, -1);
m_offset = m_buckets->id_to_offset(m_begin_kmer_id, k);
auto [pos, piece_end] = m_buckets->pieces.next_geq(m_offset);
if (piece_end == m_offset) pos += 1;
m_pieces_it = m_buckets->pieces.at(pos);
next_piece();
ret.second.resize(k, 0);
m_ret.second.resize(m_k, 0);
}

bool has_next() const { return m_kmer_id != m_num_kmers; }
bool has_next() const { return m_begin_kmer_id != m_end_kmer_id; }

std::pair<uint64_t, std::string> next() {
if (offset == next_offset - m_k + 1) {
offset = next_offset;
if (m_offset == m_next_offset - m_k + 1) {
m_offset = m_next_offset;
next_piece();
}

while (offset != next_offset - m_k + 1) {
ret.first = m_kmer_id;
if (clear) {
util::uint_kmer_to_string(read_kmer, ret.second.data(), m_k);
while (m_offset != m_next_offset - m_k + 1) {
m_ret.first = m_begin_kmer_id;
if (m_clear) {
util::uint_kmer_to_string(m_read_kmer, m_ret.second.data(), m_k);
} else {
memmove(ret.second.data(), ret.second.data() + 1, m_k - 1);
ret.second[m_k - 1] = util::uint64_to_char(last_two_bits);
memmove(m_ret.second.data(), m_ret.second.data() + 1, m_k - 1);
m_ret.second[m_k - 1] = util::uint64_to_char(m_last_two_bits);
}
clear = false;
read_kmer >>= 2;
last_two_bits = bv_it.get_next_two_bits();
read_kmer += last_two_bits << (2 * (m_k - 1));
++m_kmer_id;
++offset;
return ret;
m_clear = false;
m_read_kmer >>= 2;
m_last_two_bits = m_bv_it.get_next_two_bits();
m_read_kmer += m_last_two_bits << (2 * (m_k - 1));
++m_begin_kmer_id;
++m_offset;
return m_ret;
}

return next();
}

private:
std::pair<uint64_t, std::string> ret;
std::pair<uint64_t, std::string> m_ret;
buckets const* m_buckets;
uint64_t m_kmer_id, m_k, m_num_kmers;
uint64_t offset;
uint64_t next_offset;
bit_vector_iterator bv_it;
ef_sequence<true>::iterator pieces_it;
uint64_t m_begin_kmer_id, m_end_kmer_id;
uint64_t m_k;
uint64_t m_offset;
uint64_t m_next_offset;
bit_vector_iterator m_bv_it;
ef_sequence<true>::iterator m_pieces_it;

kmer_t read_kmer;
uint64_t last_two_bits;
bool clear;
kmer_t m_read_kmer;
uint64_t m_last_two_bits;
bool m_clear;

void next_piece() {
bv_it.at(2 * offset);
next_offset = pieces_it.next();
assert(next_offset > offset);
read_kmer = bv_it.take(2 * m_k);
clear = true;
m_bv_it.at(2 * m_offset);
m_next_offset = m_pieces_it.next();
assert(m_next_offset > m_offset);
m_read_kmer = m_bv_it.take(2 * m_k);
m_clear = true;
}
};

iterator at(uint64_t kmer_id, uint64_t k, uint64_t size) const {
return iterator(this, kmer_id, k, size);
iterator at(const uint64_t begin_kmer_id, const uint64_t end_kmer_id, const uint64_t k) const {
return iterator(this, begin_kmer_id, end_kmer_id, k);
}

uint64_t num_bits() const {
Expand Down
3 changes: 2 additions & 1 deletion include/dictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ uint64_t dictionary::weight(uint64_t kmer_id) const {

uint64_t dictionary::contig_size(uint64_t contig_id) const {
assert(contig_id < num_contigs());
uint64_t contig_length = m_buckets.contig_length(contig_id);
auto [begin, end] = m_buckets.contig_offsets(contig_id);
uint64_t contig_length = end - begin;
assert(contig_length >= m_k);
return contig_length - m_k + 1;
}
Expand Down
32 changes: 26 additions & 6 deletions include/dictionary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,22 +72,42 @@ struct dictionary {
bool multiline) const;

struct iterator {
iterator(dictionary const* ptr, uint64_t kmer_id = 0) {
it = ptr->m_buckets.at(kmer_id, ptr->m_k, ptr->m_size);
iterator(dictionary const* ptr, const uint64_t begin_kmer_id, const uint64_t end_kmer_id) {
it = ptr->m_buckets.at(begin_kmer_id, end_kmer_id, ptr->m_k);
}

bool has_next() const { return it.has_next(); }
std::pair<uint64_t, std::string> next() { return it.next(); }

std::pair<uint64_t, std::string> // (kmer-id, kmer)
next() {
return it.next();
}

private:
typename buckets::iterator it;
};

iterator begin() const { return iterator(this); }
iterator begin() const { return iterator(this, 0, size()); }

iterator at(uint64_t kmer_id) const {
iterator at_kmer_id(const uint64_t kmer_id) const {
assert(kmer_id < size());
return iterator(this, kmer_id);
return iterator(this, kmer_id, size());
}

std::pair<uint64_t, uint64_t> // [begin, end)
contig_offsets(const uint64_t contig_id) const {
return m_buckets.contig_offsets(contig_id);
}

iterator at_contig_id(const uint64_t contig_id) const {
assert(contig_id < num_contigs());
auto [begin, end] = contig_offsets(contig_id);
uint64_t contig_length = end - begin; // in bases
assert(contig_length >= m_k);
uint64_t contig_size = contig_length - m_k + 1; // in kmers
uint64_t begin_kmer_id = begin - contig_id * (m_k - 1);
uint64_t end_kmer_id = begin_kmer_id + contig_size;
return iterator(this, begin_kmer_id, end_kmer_id);
}

pthash::bit_vector const& strings() const { return m_buckets.strings; }
Expand Down
3 changes: 2 additions & 1 deletion src/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ int build(int argc, char** argv) {
check_correctness_navigational_kmer_query(dict, input_filename);
check_correctness_navigational_contig_query(dict);
if (build_config.weighted) check_correctness_weights(dict, input_filename);
check_correctness_iterator(dict);
check_correctness_kmer_iterator(dict);
check_correctness_contig_iterator(dict);
}
bool bench = parser.get<bool>("bench");
if (bench) {
Expand Down
30 changes: 27 additions & 3 deletions src/check_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -442,14 +442,14 @@ bool check_dictionary(dictionary const& dict) {
return true;
}

bool check_correctness_iterator(dictionary const& dict) {
std::cout << "checking correctness of iterator..." << std::endl;
bool check_correctness_kmer_iterator(dictionary const& dict) {
std::cout << "checking correctness of kmer iterator..." << std::endl;
std::string expected_kmer(dict.k(), 0);
constexpr uint64_t runs = 3;
essentials::uniform_int_rng<uint64_t> distr(0, dict.size() - 1, essentials::get_random_seed());
for (uint64_t run = 0; run != runs; ++run) {
uint64_t from_kmer_id = distr.gen();
auto it = dict.at(from_kmer_id);
auto it = dict.at_kmer_id(from_kmer_id);
while (it.has_next()) {
auto [kmer_id, kmer] = it.next();
dict.access(kmer_id, expected_kmer.data());
Expand All @@ -468,4 +468,28 @@ bool check_correctness_iterator(dictionary const& dict) {
return true;
}

bool check_correctness_contig_iterator(dictionary const& dict) {
std::cout << "checking correctness of contig iterator..." << std::endl;
std::string expected_kmer(dict.k(), 0);
for (uint64_t contig_id = 0; contig_id != dict.num_contigs(); ++contig_id) {
auto [begin, _] = dict.contig_offsets(contig_id);
uint64_t from_kmer_id = begin - contig_id * (dict.k() - 1);
auto it = dict.at_contig_id(contig_id);
while (it.has_next()) {
auto [kmer_id, kmer] = it.next();
dict.access(kmer_id, expected_kmer.data());
if (kmer != expected_kmer or kmer_id != from_kmer_id) {
std::cout << "got (" << kmer_id << ",'" << kmer << "')";
std::cout << " but ";
std::cout << "expected (" << from_kmer_id << ",'" << expected_kmer << "')"
<< std::endl;
return false;
}
++from_kmer_id;
}
}
std::cout << "EVERYTHING OK!" << std::endl;
return true;
}

} // namespace sshash

0 comments on commit 11dc8f2

Please sign in to comment.