Skip to content

Commit

Permalink
[FEATURE] search result struct (#1706)
Browse files Browse the repository at this point in the history
* [MISC] Unify cursor.locate() return value.

* [FEATURE] Add seqan3::search_result.

* [MISC,TEST] Incooporate seqan3::search_result into search algorithm and adapt tests.

* [DOC] Adapt tutorial to seqan3::search_result
  • Loading branch information
smehringer authored May 29, 2020
1 parent 69b687b commit 4fcbeb8
Show file tree
Hide file tree
Showing 29 changed files with 651 additions and 411 deletions.
10 changes: 5 additions & 5 deletions doc/tutorial/read_mapper/read_mapper_step2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,16 @@ void map_reads(std::filesystem::path const & query_path,
iarchive(index);
}

seqan3::sequence_file_input query_in{query_path};
seqan3::sequence_file_input query_file_in{query_path};

seqan3::configuration const search_config = seqan3::search_cfg::max_error{seqan3::search_cfg::total{errors}} |
seqan3::search_cfg::hit_all_best;

for (auto & [query, id, qual] : query_in | seqan3::views::take(20))
for (auto && record : query_file_in | seqan3::views::take(20))
{
auto positions = search(query, index, search_config);
seqan3::debug_stream << "id: " << id << '\n';
seqan3::debug_stream << "positions: " << positions << '\n';
seqan3::debug_stream << "Hits:" << '\n';
for (auto && result : search(seqan3::get<seqan3::field::seq>(record), index, search_config))
seqan3::debug_stream << result << '\n';
seqan3::debug_stream << "======================" << '\n';
}
(void) sam_path; // prevent unused parameter warning
Expand Down
13 changes: 7 additions & 6 deletions doc/tutorial/read_mapper/read_mapper_step3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ void map_reads(std::filesystem::path const & query_path,
iarchive(index);
}

seqan3::sequence_file_input query_in{query_path};
seqan3::sequence_file_input query_file_in{query_path};

seqan3::configuration const search_config = seqan3::search_cfg::max_error{seqan3::search_cfg::total{errors}} |
seqan3::search_cfg::hit_all_best;
Expand All @@ -57,17 +57,18 @@ void map_reads(std::filesystem::path const & query_path,
seqan3::align_cfg::result{seqan3::with_alignment};
//! [alignment_config]

for (auto & [query, id, qual] : query_in | seqan3::views::take(20))
for (auto && record : query_file_in | seqan3::views::take(20))
{
for (auto & [query_idx, text_pos] : search(query, index, search_config))
auto & query = seqan3::get<seqan3::field::seq>(record);
for (auto && result : search(query, index, search_config))
{
size_t start = text_pos.second ? text_pos.second - 1 : 0;
std::span text_view{std::data(storage.seqs[text_pos.first]) + start, query.size() + 1};
size_t start = result.reference_begin_pos() ? result.reference_begin_pos() - 1 : 0;
std::span text_view{std::data(storage.seqs[result.reference_id()]) + start, query.size() + 1};

for (auto && alignment : seqan3::align_pairwise(std::tie(text_view, query), align_config))
{
auto && [aligned_database, aligned_query] = alignment.alignment();
seqan3::debug_stream << "id: " << id << '\n';
seqan3::debug_stream << "id: " << seqan3::get<seqan3::field::id>(record) << '\n';
seqan3::debug_stream << "score: " << alignment.score() << '\n';
seqan3::debug_stream << "database: " << aligned_database << '\n';
seqan3::debug_stream << "query: " << aligned_query << '\n';
Expand Down
19 changes: 13 additions & 6 deletions doc/tutorial/read_mapper/read_mapper_step4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ void map_reads(std::filesystem::path const & query_path,
iarchive(index);
}

seqan3::sequence_file_input query_in{query_path};
seqan3::sequence_file_input query_file_in{query_path};

//! [alignment_file_output]
seqan3::alignment_file_output sam_out{sam_path, seqan3::fields<seqan3::field::seq,
Expand All @@ -65,20 +65,27 @@ void map_reads(std::filesystem::path const & query_path,
seqan3::align_cfg::aligned_ends{seqan3::free_ends_first} |
seqan3::align_cfg::result{seqan3::with_alignment};

for (auto & [query, id, qual] : query_in)
for (auto && record : query_file_in)
{
for (auto & [query_idx, text_pos] : search(query, index, search_config))
auto & query = seqan3::get<seqan3::field::seq>(record);
for (auto && result : search(query, index, search_config))
{
size_t start = text_pos.second ? text_pos.second - 1 : 0;
std::span text_view{std::data(storage.seqs[text_pos.first]) + start, query.size() + 1};
size_t start = result.reference_begin_pos() ? result.reference_begin_pos() - 1 : 0;
std::span text_view{std::data(storage.seqs[result.reference_id()]) + start, query.size() + 1};

for (auto && alignment : seqan3::align_pairwise(std::tie(text_view, query), align_config))
{
auto aligned_seq = alignment.alignment();
size_t ref_offset = alignment.front_coordinate().first + 2 + start;
size_t map_qual = 60u + alignment.score();

sam_out.emplace_back(query, id, storage.ids[text_pos.first], ref_offset, aligned_seq, qual, map_qual);
sam_out.emplace_back(query,
seqan3::get<seqan3::field::id>(record),
storage.ids[result.reference_id()],
ref_offset,
aligned_seq,
seqan3::get<seqan3::field::qual>(record),
map_qual);
}
}
}
Expand Down
37 changes: 21 additions & 16 deletions doc/tutorial/search/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ In this tutorial you will learn how to construct an index and conduct searches.
# Introduction

Exact and approximate string matching is a common problem in bioinformatics, e.g. in read mapping.
Usually, we want to search for a match of one or many small sequences (queries) in a database consisting of one or
Usually, we want to search for a hit of one or many small sequences (queries) in a database consisting of one or
more large sequences (references). Trivial approaches such as on-line search fail at this task for large data due
to prohibitive run times.

Expand All @@ -39,8 +39,8 @@ search them efficiently using seqan3::search.

With this module you can:
* Create, store and load (Bi)-FM-Indices
* Search for exact matches
* Search for approximate matches (allowing substitutions and indels)
* Search for exact hits
* Search for approximate hits (allowing substitutions and indels)

The results of the search can be passed on to other modules, e.g. to create an alignment.

Expand Down Expand Up @@ -104,21 +104,20 @@ allow substitutions and indels, and how to configure what kind of results we wan
the best result.

## Terminology
### Match
If a query can be found in the reference with the specified error configuration, we denote this result as a match.
If the query matches the reference without any errors, we call this an exact match. If a match contains errors,
it's an approximate match.
### Hit
If a query can be found in the reference with the specified error configuration, we denote this result as a hit.
If the query is found in the reference sequence without any errors, we call this an exact hit, otherwise, if it contains errors,
it's an approximate hit.

## Searching for exact matches
## A hit is stored in a seqan3::search_result

We can search for all exact matches using seqan3::search:
\copydetails seqan3::search_result

\include doc/tutorial/search/search_basic_search.cpp
## Searching for exact hits

When using a single text, the search will return a std::vector of the begin positions of matches in the reference.
We can search for all exact hits using seqan3::search:

For text collections, a std::vector over std::tuple where the first element represents the index of the text in the
collection and the second element represents the position within the text is returned.
\include doc/tutorial/search/search_basic_search.cpp

You can also pass multiple queries at the same time:

Expand All @@ -143,15 +142,21 @@ std::vector<dna4_vector> text{"CGCTGTCTGAAGGATGAGTGTCAGCCAGTGTA"_dna4,
```console
===== Running on a single text =====
There are 3 hits.
The positions are [1,41,77]
The following hits were found:
<query_id:0, reference_id:0, reference_pos:1>
<query_id:0, reference_id:0, reference_pos:41>
<query_id:0, reference_id:0, reference_pos:77>

===== Running on a text collection =====
There are 3 hits.
The positions are [(0,1),(1,9),(2,16)]
The following hits were found:
<query_id:0, reference_id:0, reference_pos:1>
<query_id:0, reference_id:1, reference_pos:9>
<query_id:0, reference_id:2, reference_pos:16>
```
\endsolution

## Searching for approximate matches
## Searching for approximate hits

If you want to allow for errors in your query, you need to configure the approximate search with a
seqan3::search_cfg::max_error or seqan3::search_cfg::max_error_rate object.
Expand Down
2 changes: 1 addition & 1 deletion doc/tutorial/search/search_basic_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ int main()
{
std::string text{"Garfield the fat cat without a hat."};
seqan3::fm_index index{text};
seqan3::debug_stream << search("cat"s, index) << '\n'; // [17]
seqan3::debug_stream << search("cat"s, index) << '\n'; // [<query_id:0, reference_id:0, reference_pos:17>]
}
10 changes: 8 additions & 2 deletions doc/tutorial/search/search_small_snippets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,11 @@ seqan3::configuration const cfg = seqan3::search_cfg::max_error{seqan3::search_c
seqan3::search_cfg::substitution{0},
seqan3::search_cfg::insertion{1},
seqan3::search_cfg::deletion{1}};
seqan3::debug_stream << search("cat"s, index, cfg) << '\n'; // [14,17,18,32]
seqan3::debug_stream << search("cat"s, index, cfg) << '\n';
// prints: [<query_id:0, reference_id:0, reference_pos:14>,
// <query_id:0, reference_id:0, reference_pos:17>,
// <query_id:0, reference_id:0, reference_pos:18>,
// <query_id:0, reference_id:0, reference_pos:32>]
//![error_search]
}

Expand All @@ -75,7 +79,9 @@ seqan3::debug_stream << search("cat"s, index, cfg) << '\n'; // [14,17,18,32]
std::string text{"Garfield the fat cat without a hat."};
seqan3::fm_index index{text};
std::vector<std::string> query{"cat"s, "hat"s};
seqan3::debug_stream << search(query, index) << '\n'; // [[17],[31]]
seqan3::debug_stream << search(query, index) << '\n';
// prints: [<query_id:0, reference_id:0, reference_pos:17>,
// <query_id:1, reference_id:0, reference_pos:31>]
//![multiple_queries]
}

Expand Down
14 changes: 10 additions & 4 deletions doc/tutorial/search/search_solution2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@ void run_text_single()
text{"CGCTGTCTGAAGGATGAGTGTCAGCCAGTGTAACCCGATGAGCTACCCAGTAGTCGAACTGGGCCAGACAACCCGGCGCTAATGCACTCA"_dna4};
seqan3::fm_index index{text};

seqan3::debug_stream << "===== Running on a single text =====\n";
seqan3::debug_stream << "The positions are " << search("GCT"_dna4, index) << '\n';
seqan3::debug_stream << "===== Running on a single text =====\n"
<< "The following hits were found:\n";

for (auto && result : search("GCT"_dna4, index))
seqan3::debug_stream << result << '\n';
}

void run_text_collection()
Expand All @@ -22,8 +25,11 @@ void run_text_collection()
"GGCCAGACAACCCGGCGCTAATGCACTCA"_dna4};
seqan3::fm_index index{text};

seqan3::debug_stream << "===== Running on a text collection =====\n";
seqan3::debug_stream << "The positions are " << search("GCT"_dna4, index) << '\n';
seqan3::debug_stream << "===== Running on a text collection =====\n"
<< "The following hits were found:\n";

for (auto && result : search("GCT"_dna4, index))
seqan3::debug_stream << result << '\n';
}

int main()
Expand Down
6 changes: 3 additions & 3 deletions doc/tutorial/search/search_solution3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ int main()

seqan3::configuration const cfg = seqan3::search_cfg::max_error{seqan3::search_cfg::substitution{1}};

for (auto && pos : search("GCT"_dna4, index, cfg))
for (auto && result : search("GCT"_dna4, index, cfg))
{
seqan3::debug_stream << "At position " << pos << ": "
<< std::span{std::data(text) + pos.second, 3}
seqan3::debug_stream << "At position " << result.reference_begin_pos() << ": "
<< std::span{std::data(text) + result.reference_begin_pos(), 3}
<< '\n';
}
}
16 changes: 12 additions & 4 deletions doc/tutorial/search/search_solution4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,31 @@ int main()
seqan3::configuration const cfg_all = seqan3::search_cfg::max_error{seqan3::search_cfg::total{1}} |
seqan3::search_cfg::hit_all;
auto results_all = search(query, index, cfg_all);
seqan3::debug_stream << "Hits: " << results_all << "\n";
// Attention: results_all is a pure std::ranges::input_range,
// so after calling std::ranges::distance, you cannot iterate over it again!
seqan3::debug_stream << "There are " << std::ranges::distance(results_all) << " hits.\n";

seqan3::debug_stream << "Searching all best hits\n";
seqan3::configuration const cfg_all_best = seqan3::search_cfg::max_error{seqan3::search_cfg::total{1}} |
seqan3::search_cfg::hit_all_best;
auto results_all_best = search(query, index, cfg_all_best);
seqan3::debug_stream << "Hits: " << results_all_best << "\n";
// Attention: results_all_best is a pure std::ranges::input_range,
// so after calling std::ranges::distance, you cannot iterate over it again!
seqan3::debug_stream << "There are " << std::ranges::distance(results_all_best) << " hits.\n";

seqan3::debug_stream << "Searching best hit\n";
seqan3::configuration const cfg_best = seqan3::search_cfg::max_error{seqan3::search_cfg::total{1}} |
seqan3::search_cfg::hit_single_best;
auto results_best = search(query, index, cfg_best);
seqan3::debug_stream << "Hits " << results_best << "\n";
// Attention: results_best is a pure std::ranges::input_range,
// so after calling std::ranges::distance, you cannot iterate over it again!
seqan3::debug_stream << "There is " << std::ranges::distance(results_best) << " hits.\n";

seqan3::debug_stream << "Searching all hits in the 1-stratum\n";
seqan3::configuration const cfg_strata = seqan3::search_cfg::max_error{seqan3::search_cfg::total{1}} |
seqan3::search_cfg::hit_strata{1};
auto results_strata = search(query, index, cfg_strata);
seqan3::debug_stream << "Hits: " << results_strata << "\n";
// Attention: results_strata is a pure std::ranges::input_range,
// so after calling std::ranges::distance, you cannot iterate over it again!
seqan3::debug_stream << "There are " << std::ranges::distance(results_strata) << " hits.\n";
}
12 changes: 6 additions & 6 deletions doc/tutorial/search/search_solution5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ void run_text_single()
seqan3::align_cfg::aligned_ends{seqan3::free_ends_first} |
seqan3::align_cfg::result{seqan3::with_alignment};

auto results = search(query, index, search_config);
auto search_results = search(query, index, search_config);

seqan3::debug_stream << "-----------------\n";

for (auto && pos : results)
for (auto && hit : search_results)
{
size_t start = pos.second ? pos.second - 1 : 0;
size_t start = hit.reference_begin_pos() ? hit.reference_begin_pos() - 1 : 0;
std::span text_view{std::data(text) + start, query.size() + 1};

for (auto && res : align_pairwise(std::tie(text_view, query), align_config))
Expand Down Expand Up @@ -61,10 +61,10 @@ void run_text_collection()

seqan3::debug_stream << "-----------------\n";

for (auto [query_idx, text_pos] : search(query, index, search_config))
for (auto && hit : search(query, index, search_config))
{
size_t start = text_pos.second ? text_pos.second - 1 : 0;
std::span text_view{std::data(text[text_pos.first]) + start, query.size() + 1};
size_t start = hit.reference_begin_pos() ? hit.reference_begin_pos() - 1 : 0;
std::span text_view{std::data(text[hit.reference_id()]) + start, query.size() + 1};

for (auto && res : align_pairwise(std::tie(text_view, query), align_config))
{
Expand Down
4 changes: 4 additions & 0 deletions include/seqan3/search/configuration/all.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@
* | \ref search_configuration_subsection_hit_strategy "3. Hit" | ✅ | ✅ | ✅ | ❌ | ✅ |
* | \ref seqan3::search_cfg::parallel "4: Parallel" | ✅ | ✅ | ✅ | ✅ | ❌ |
*
* \subsection search_configuration_search_result Search result type
*
* \copydetails seqan3::search_result
*
* \subsection search_configuration_subsection_hit_strategy 3. Hit Configuration
*
* This configuration can be used to determine which hits are reported.
Expand Down
Loading

0 comments on commit 4fcbeb8

Please sign in to comment.