Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve search backend: add stop words, use "federated search", highlight creators #1319

Merged
merged 6 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions backend/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion backend/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ embed-in-debug = ["reinda/always-prod"]


[dependencies]
ahash = "0.8"
anyhow = { version = "1.0.71", features = ["backtrace"] }
base64 = "0.22.1"
bincode = "1.3.3"
Expand All @@ -42,7 +43,7 @@ hyper-rustls = { version = "0.27.3", default-features = false, features = ["http
hyper-util = { version = "0.1.3", features = ["client", "server", "http1", "http2"] }
iso8601 = "0.6.1"
juniper = { version = "0.16.1", default-features = false, features = ["chrono", "schema-language", "anyhow", "backtrace"] }
meilisearch-sdk = "0.27.1"
meilisearch-sdk = { path = "vendor/meilisearch-sdk" }
mime_guess = { version = "2", default-features = false }
nu-ansi-term = "0.50.1"
ogrim = "0.1.1"
Expand Down
31 changes: 25 additions & 6 deletions backend/src/api/model/search/event.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::collections::HashMap;

use chrono::{DateTime, Utc};
use juniper::GraphQLObject;
use meilisearch_sdk::search::SearchResult;
use meilisearch_sdk::search::MatchRange;

use crate::{
api::{Context, Id, Node, NodeValue},
Expand Down Expand Up @@ -39,7 +41,13 @@ pub struct SearchEventMatches {
title: Vec<ByteSpan>,
description: Vec<ByteSpan>,
series_title: Vec<ByteSpan>,
// TODO: creators
creators: Vec<ArrayMatch>,
}

// A single search match inside an array-valued field (currently used for an
// event's `creators`). Plain `//` comments are used on purpose here: `///`
// doc comments on a `GraphQLObject` would end up as descriptions in the
// GraphQL schema.
#[derive(Debug, GraphQLObject)]
pub struct ArrayMatch {
// Position of the matched element inside the array. Filled from the first
// entry of the match range's `indices`.
index: i32,
// Start and length of the match as reported by the search backend
// (presumably relative to the matched array element -- TODO confirm).
span: ByteSpan,
}

/// A match inside an event's texts while searching.
Expand Down Expand Up @@ -75,10 +83,11 @@ impl SearchEvent {
Self::new_inner(src, vec![], SearchEventMatches::default(), user_can_read)
}

pub(crate) fn new(hit: SearchResult<search::Event>, context: &Context) -> Self {
let match_positions = hit.matches_position.as_ref();
let src = hit.result;

pub(crate) fn new(
src: search::Event,
match_positions: Option<&HashMap<String, Vec<MatchRange>>>,
context: &Context,
) -> Self {
let mut text_matches = Vec::new();
let read_roles = decode_acl(&src.read_roles);
let user_can_read = context.auth.overlaps_roles(read_roles);
Expand All @@ -99,6 +108,16 @@ impl SearchEvent {
title: field_matches_for(match_positions, "title"),
description: field_matches_for(match_positions, "description"),
series_title: field_matches_for(match_positions, "series_title"),
creators: match_ranges_for(match_positions, "creators")
.iter()
.filter_map(|m| {
m.indices.as_ref().and_then(|v| v.get(0)).map(|index| ArrayMatch {
span: ByteSpan { start: m.start as i32, len: m.length as i32 },
index: *index as i32,
})
})
.take(8)
.collect(),
};

Self::new_inner(src, text_matches, matches, user_can_read)
Expand Down
119 changes: 49 additions & 70 deletions backend/src/api/model/search/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use chrono::{DateTime, Utc};
use juniper::GraphQLObject;
use meilisearch_sdk::search::MatchRange;
use meilisearch_sdk::search::{FederationOptions, MatchRange, QueryFederationOptions};
use once_cell::sync::Lazy;
use regex::Regex;
use std::{borrow::Cow, collections::HashMap, fmt, time::Instant};
Expand Down Expand Up @@ -204,97 +204,72 @@ pub(crate) async fn perform(
]).to_string();
let event_query = context.search.event_index.search()
.with_query(user_query)
.with_limit(15)
.with_show_matches_position(true)
.with_filter(&filter)
.with_show_ranking_score(true)
.build();


// Prepare the series search
let series_query = context.search.series_index.search()
.with_query(user_query)
.with_show_matches_position(true)
.with_filter("listed = true")
.with_limit(15)
.with_show_ranking_score(true)
.with_federation_options(QueryFederationOptions {
weight: Some(1.1),
})
.build();


// Prepare the realm search
let realm_query = context.search.realm_index.search()
.with_query(user_query)
.with_limit(10)
.with_filter("is_user_realm = false")
.with_show_matches_position(true)
.with_show_ranking_score(true)
.build();


// Perform the searches
let res = tokio::try_join!(
event_query.execute::<search::Event>(),
series_query.execute::<search::Series>(),
realm_query.execute::<search::Realm>(),
);
let (event_results, series_results, realm_results) = handle_search_result!(res, SearchOutcome);

// Merge results according to Meilis score.
//
// TODO: Comparing scores of different indices is not well defined right now.
// We can either use score details or adding dummy searchable fields to the
// realm index. See this discussion for more info:
// https://github.com/orgs/meilisearch/discussions/489#discussioncomment-6160361
let events = event_results.hits.into_iter().map(|result| {
let score = result.ranking_score;
(NodeValue::from(SearchEvent::new(result, &context)), score)
});
let series = series_results.hits.into_iter().map(|result| {
let score = result.ranking_score;
(NodeValue::from(SearchSeries::new(result, context)), score)
});
let realms = realm_results.hits.into_iter().map(|result| {
let score = result.ranking_score;
(NodeValue::from(SearchRealm::new(result)), score)
let mut multi_search = context.search.client.multi_search();
if matches!(filters.item_type, None | Some(ItemType::Event)) {
multi_search.with_search_query(event_query);
}
if matches!(filters.item_type, None | Some(ItemType::Series)) {
multi_search.with_search_query(series_query);
}
if matches!(filters.item_type, None | Some(ItemType::Realm)) {
multi_search.with_search_query(realm_query);
}
let multi_search = multi_search.with_federation(FederationOptions {
limit: Some(30),
offset: Some(0), // TODO: pagination
..Default::default()
});

let mut merged: Vec<(NodeValue, Option<f64>)> = Vec::new();
let total_hits: usize;

match filters.item_type {
Some(ItemType::Event) => {
merged.extend(events);
total_hits = event_results.estimated_total_hits.unwrap_or(0);
},
Some(ItemType::Series) => {
merged.extend(series);
total_hits = series_results.estimated_total_hits.unwrap_or(0);
},
Some(ItemType::Realm) => {
merged.extend(realms);
total_hits = realm_results.estimated_total_hits.unwrap_or(0);
},
None => {
merged.extend(events);
merged.extend(series);
merged.extend(realms);
total_hits = [
event_results.estimated_total_hits,
series_results.estimated_total_hits,
realm_results.estimated_total_hits,
]
.iter()
.filter_map(|&x| x)
.sum();
},
}

merged.sort_unstable_by(|(_, score0), (_, score1)| score1.unwrap().total_cmp(&score0.unwrap()));
// One hit of the federated multi search. The hits of all queried indexes
// come back in a single list, so each hit can be an event, a series or a
// realm.
//
// NOTE: the enum is `untagged`, i.e. serde tries the variants in declaration
// order and picks the first one the document deserializes into. The variant
// order therefore matters if one document shape could also satisfy another
// variant's type.
#[derive(serde::Deserialize)]
#[serde(untagged)]
enum MultiSearchItem {
Event(search::Event),
Series(search::Series),
Realm(search::Realm),
}

let items = merged.into_iter().map(|(node, _)| node).collect();
// TODO: Check if the sort order makes sense, because comparing scores of
// different indices is not well defined right now. We can either use score
// details or add dummy searchable fields to the realm index. See this
// discussion for more info:
// https://github.com/orgs/meilisearch/discussions/489#discussioncomment-6160361
let res = handle_search_result!(multi_search.execute::<MultiSearchItem>().await, SearchOutcome);

let items = res.hits.into_iter()
.map(|res| {
let mp = res.matches_position.as_ref();
match res.result {
MultiSearchItem::Event(event) => NodeValue::from(SearchEvent::new(event, mp, &context)),
MultiSearchItem::Series(series) => NodeValue::from(SearchSeries::new(series, mp, context)),
MultiSearchItem::Realm(realm) => NodeValue::from(SearchRealm::new(realm, mp)),
}
})
.collect();
Ok(SearchOutcome::Results(SearchResults {
items,
total_hits,
total_hits: res.estimated_total_hits,
duration: elapsed_time(),
}))
}
Expand Down Expand Up @@ -353,7 +328,9 @@ pub(crate) async fn all_events(
}
let res = query.execute::<search::Event>().await;
let results = handle_search_result!(res, EventSearchOutcome);
let items = results.hits.into_iter().map(|h| SearchEvent::new(h, &context)).collect();
let items = results.hits.into_iter()
.map(|h| SearchEvent::new(h.result, h.matches_position.as_ref(), &context))
.collect();
let total_hits = results.estimated_total_hits.unwrap_or(0);

Ok(EventSearchOutcome::Results(SearchResults { items, total_hits, duration: elapsed_time() }))
Expand Down Expand Up @@ -405,7 +382,9 @@ pub(crate) async fn all_series(
}
let res = query.execute::<search::Series>().await;
let results = handle_search_result!(res, SeriesSearchOutcome);
let items = results.hits.into_iter().map(|h| SearchSeries::new(h, context)).collect();
let items = results.hits.into_iter()
.map(|h| SearchSeries::new(h.result, h.matches_position.as_ref(), context))
.collect();
let total_hits = results.estimated_total_hits.unwrap_or(0);

Ok(SeriesSearchOutcome::Results(SearchResults { items, total_hits, duration: elapsed_time() }))
Expand Down
12 changes: 8 additions & 4 deletions backend/src/api/model/search/realm.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::collections::HashMap;

use juniper::GraphQLObject;
use meilisearch_sdk::search::SearchResult;
use meilisearch_sdk::search::MatchRange;

use crate::{
api::{Context, Node, Id, NodeValue},
Expand Down Expand Up @@ -36,12 +38,14 @@ impl SearchRealm {
Self::new_inner(src, SearchRealmMatches::default())
}

pub(crate) fn new(hit: SearchResult<search::Realm>) -> Self {
let match_positions = hit.matches_position.as_ref();
pub(crate) fn new(
src: search::Realm,
match_positions: Option<&HashMap<String, Vec<MatchRange>>>,
) -> Self {
let matches = SearchRealmMatches {
name: field_matches_for(match_positions, "name"),
};
Self::new_inner(hit.result, matches)
Self::new_inner(src, matches)
}

fn new_inner(src: search::Realm, matches: SearchRealmMatches) -> Self {
Expand Down
9 changes: 5 additions & 4 deletions backend/src/api/model/search/series.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::collections::HashMap;

use juniper::GraphQLObject;
use meilisearch_sdk::search::SearchResult;
use meilisearch_sdk::search::MatchRange;

use crate::{
api::{Context, Id, Node, NodeValue},
Expand Down Expand Up @@ -36,16 +38,15 @@ impl Node for SearchSeries {

impl SearchSeries {
pub(crate) fn new(
hit: SearchResult<search::Series>,
src: search::Series,
match_positions: Option<&HashMap<String, Vec<MatchRange>>>,
context: &Context,
) -> Self {
let match_positions = hit.matches_position.as_ref();
let matches = SearchSeriesMatches {
title: field_matches_for(match_positions, "title"),
description: field_matches_for(match_positions, "description"),
};

let src = hit.result;
Self {
id: Id::search_series(src.id.0),
opencast_id: src.opencast_id,
Expand Down
18 changes: 17 additions & 1 deletion backend/src/search/event.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@ use crate::{
util::{base64_decode, BASE64_DIGITS},
};

use super::{realm::Realm, util::{self, FieldAbilities}, IndexItem, IndexItemKind, SearchId};
use super::{
realm::Realm,
util::{self, is_stop_word, FieldAbilities},
IndexItem, IndexItemKind, SearchId,
};



Expand Down Expand Up @@ -366,6 +370,18 @@ impl TextSearchIndex {
continue;
}

// Get correct indices and the actual text snippet. Unfortunately,
// Meilisearch might sometimes return invalid indices that slice
// UTF-8 codepoints in half, so we need to protect against that.
let start = ceil_char_boundary(&self.texts, match_range.start);
let end = ceil_char_boundary(&self.texts, match_range.start + match_range.length);
let snippet = &self.texts[start..end];

// If the match is a single stop word, we ignore it.
if is_stop_word(snippet) {
continue;
}

let slot = self.lookup(match_range);
let matches = entries.entry(slot as u32).or_insert_with(Vec::new);

Expand Down
2 changes: 1 addition & 1 deletion backend/src/search/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ pub(crate) async fn rebuild_if_necessary(
for task in tasks {
util::wait_on_task(task, meili).await?;
}
info!("Completely rebuild search index");
info!("Completely rebuilt search index");

meili.meta_index.add_or_replace(&[meta::Meta::current_clean()], None).await
.context("failed to update index version document (clean)")?;
Expand Down
Loading
Loading