Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ai-help): add parent short_title for duplicate source titles #428

Merged
merged 6 commits into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/ai/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,18 +87,21 @@ mod test {
RelatedDoc {
url: "".into(),
title: "".into(),
title_parent: None,
content: "content1".into(),
similarity: 0f64,
},
RelatedDoc {
url: "".into(),
title: "".into(),
title_parent: None,
content: "content2".into(),
similarity: 0f64,
},
RelatedDoc {
url: "".into(),
title: "".into(),
title_parent: None,
content: "content3".into(),
similarity: 0f64,
},
Expand Down
44 changes: 32 additions & 12 deletions src/ai/embeddings.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use async_openai::{config::OpenAIConfig, types::CreateEmbeddingRequestArgs, Client};
use itertools::Itertools;

use crate::{
ai::{constants::EMBEDDING_MODEL, error::AIError},
Expand Down Expand Up @@ -41,22 +42,25 @@ const MACRO_EMB_DISTANCE: f64 = 0.78;
/// Minimum markdown length (as measured by SQL `LENGTH`) a row must have to be
/// returned by `MACRO_DOCS_QUERY`; bound to the `$4` placeholder.
const MACRO_EMB_SEC_MIN_LENGTH: i64 = 50;
/// Maximum number of rows returned by `MACRO_DOCS_QUERY`; bound to the `$3`
/// placeholder (`LIMIT $3`).
const MACRO_EMB_DOC_LIMIT: i64 = 5;

/// SQL used to fetch the rows of `mdn_doc_macro` closest to a query embedding.
///
/// Placeholders, in the order they are bound by the caller:
/// - `$1`: the query embedding compared against `mdn_doc_macro.embedding`
///   with the `<=>` operator.
/// - `$2`: exclusive upper bound on the `<=>` result.
/// - `$3`: maximum number of rows.
/// - `$4`: minimum `length(markdown)` for a row to qualify.
///
/// Rows whose `mdn_url` starts with `/en-US/docs/MDN` are excluded, and the
/// result is ordered by ascending `<=>` value (smallest first).
// NOTE(review): `<=>` is presumably pgvector's cosine-distance operator — confirm.
const MACRO_DOCS_QUERY: &str = "select
mdn_doc_macro.mdn_url as url,
mdn_doc_macro.title,
mdn_doc_macro.markdown as content,
mdn_doc_macro.embedding <=> $1 as similarity
from mdn_doc_macro
where length(mdn_doc_macro.markdown) >= $4
and (mdn_doc_macro.embedding <=> $1) < $2
and mdn_doc_macro.mdn_url not like '/en-US/docs/MDN%'
order by mdn_doc_macro.embedding <=> $1
limit $3;";
/// SQL used to fetch the rows of `mdn_doc_macro` closest to a query embedding,
/// together with the short title of each row's parent page.
///
/// The parent is found by a `LEFT JOIN` of the table with itself: the join key
/// is the doc's `mdn_url` with its last `/`-separated segment removed
/// (`STRPOS(REVERSE(url), '/')` locates the last slash counted from the end),
/// e.g. `/a/b/c` is matched against a parent row whose `mdn_url` is `/a/b`.
/// Because it is a left join, `title_parent` is NULL when no such row exists.
///
/// Placeholders, in the order they are bound by the caller:
/// - `$1`: the query embedding compared against `doc.embedding` with `<=>`.
/// - `$2`: exclusive upper bound on the `<=>` result.
/// - `$3`: maximum number of rows.
/// - `$4`: minimum `LENGTH(doc.markdown)` for a row to qualify.
///
/// Rows whose `mdn_url` starts with `/en-US/docs/MDN` are excluded, and the
/// result is ordered by ascending `<=>` value (smallest first).
// NOTE(review): `<=>` is presumably pgvector's cosine-distance operator — confirm.
const MACRO_DOCS_QUERY: &str = "SELECT
doc.mdn_url AS url,
doc.title,
parent.title_short AS title_parent,
doc.markdown AS content,
doc.embedding <=> $1 AS similarity
FROM mdn_doc_macro doc
LEFT JOIN mdn_doc_macro parent ON parent.mdn_url = SUBSTRING(doc.mdn_url, 1, LENGTH(doc.mdn_url) - STRPOS(REVERSE(doc.mdn_url), '/'))
WHERE LENGTH(doc.markdown) >= $4
AND (doc.embedding <=> $1) < $2
AND doc.mdn_url NOT LIKE '/en-US/docs/MDN%'
ORDER BY doc.embedding <=> $1
LIMIT $3;";

/// One row returned by `MACRO_DOCS_QUERY`; field names match the query's
/// output column names so the row can be decoded via `sqlx::FromRow`.
#[derive(sqlx::FromRow, Debug)]
pub struct RelatedDoc {
/// The doc's `mdn_url` column.
pub url: String,
/// The doc's `title` column.
pub title: String,
/// `title_short` of the row whose `mdn_url` is this doc's URL minus its last
/// path segment; `None` when the query's `LEFT JOIN` finds no such row.
pub title_parent: Option<String>,
/// The doc's `markdown` column.
pub content: String,
/// Result of `embedding <=> $1` for this row. The query keeps only rows
/// below a threshold and orders ascending, so smaller values rank first.
// NOTE(review): despite the name this looks like a distance, not a similarity — confirm.
pub similarity: f64,
}
Expand All @@ -74,13 +78,29 @@ pub async fn get_related_macro_docs(

let embedding =
pgvector::Vector::from(embedding_res.data.into_iter().next().unwrap().embedding);
let docs: Vec<RelatedDoc> = sqlx::query_as(MACRO_DOCS_QUERY)

let mut docs: Vec<RelatedDoc> = sqlx::query_as(MACRO_DOCS_QUERY)
.bind(embedding)
.bind(MACRO_EMB_DISTANCE)
.bind(MACRO_EMB_DOC_LIMIT)
.bind(MACRO_EMB_SEC_MIN_LENGTH)
.fetch_all(pool)
.await?;

let duplicate_titles: Vec<String> = docs
.iter()
.map(|x| x.title.to_string())
.duplicates()
.collect();

docs.iter_mut().for_each(|doc| {
if let (true, Some(title_parent)) =
(duplicate_titles.contains(&doc.title), &doc.title_parent)
{
doc.title = format!("{} ({})", doc.title, title_parent);
}
});

Ok(docs)
}

Expand Down
Loading