feat(ai-help): index text-embedding-3-model embeddings (#10818)
* feat(ai-help): index text-embedding-3-model embeddings

* fix(ai-help): extract constants with correct v3 model name

* feat(ai-help): add embedding backfill mechanism

* refactor(ai-help): avoid SQL generation

* chore(ai-help): set embedding_next = null unless EMBEDDING_MODEL_NEXT

* fix(ai-help): add embeddings also if formatting changed

* fixup! fix(ai-help): extract constants with correct v3 model name

* perf(ai-help): generate both embeddings in parallel
caugner authored Apr 2, 2024
1 parent 1ffb026 commit f7cfaae
Showing 2 changed files with 140 additions and 21 deletions.
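
In short, the indexing script now writes two embeddings per document: the existing text-embedding-ada-002 vector, plus a text-embedding-3-small vector stored in a new embedding_next column whenever a next-generation model is configured. Condensed into a standalone sketch (EMBEDDING_MODEL, EMBEDDING_MODEL_NEXT and the createEmbedding helper mirror the diff below; the wrapper function itself is only illustrative):

```ts
// Condensed sketch of the dual-embedding write path introduced by this commit.
const EMBEDDING_MODEL = "text-embedding-ada-002";
const EMBEDDING_MODEL_NEXT = "text-embedding-3-small"; // falsy => skip the v3 embedding

type Embedded = { total_tokens: number; embedding: number[] };

async function embedForIndex(
  createEmbedding: (input: string, model: string) => Promise<Embedded>,
  text: string
) {
  // Both requests run in parallel; embedding_next stays null when no
  // next-generation model is configured.
  const [{ total_tokens, embedding }, embedding_next] = await Promise.all([
    createEmbedding(text, EMBEDDING_MODEL),
    EMBEDDING_MODEL_NEXT
      ? createEmbedding(text, EMBEDDING_MODEL_NEXT).then(
          ({ embedding }) => embedding
        )
      : null,
  ]);
  return { total_tokens, embedding, embedding_next };
}
```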
158 changes: 138 additions & 20 deletions scripts/ai-help-macros.ts
@@ -22,13 +22,18 @@ import {
} from "@mdn/browser-compat-data/types";
import { h2mSync } from "../markdown/index.js";

const EMBEDDING_MODEL = "text-embedding-ada-002";
const EMBEDDING_MODEL_NEXT = "text-embedding-3-small";

const { program } = caporal;

interface IndexedDoc {
id: number;
mdn_url: string;
title: string;
token_count: number | null;
has_embedding: boolean;
has_embedding_next: boolean;
markdown_hash: string;
text_hash: string;
}
@@ -43,6 +48,16 @@ interface Doc {
text_hash?: string;
}

type FormattingUpdate = Pick<
Doc,
"mdn_url" | "title" | "title_short" | "markdown" | "markdown_hash"
>;

type EmbeddingUpdate = Pick<Doc, "mdn_url" | "text"> & {
has_embedding: boolean;
has_embedding_next: boolean;
};

export async function updateEmbeddings(
directory: string,
updateFormatting: boolean
@@ -65,11 +80,11 @@ export async function updateEmbeddings(
apiKey: OPENAI_KEY,
});

-const createEmbedding = async (input: string) => {
const createEmbedding = async (input: string, model: string) => {
let embeddingResponse: OpenAI.Embeddings.CreateEmbeddingResponse;
try {
embeddingResponse = await openai.embeddings.create({
model: "text-embedding-ada-002",
model,
input,
});
} catch ({ error: { message, type }, status }: any) {
@@ -78,7 +93,7 @@ export async function updateEmbeddings(
);
// Try again with trimmed content.
embeddingResponse = await openai.embeddings.create({
model: "text-embedding-ada-002",
model,
input: input.substring(0, 15000),
});
}
@@ -106,7 +121,8 @@ export async function updateEmbeddings(

const seenUrls = new Set<string>();
const updates: Doc[] = [];
-const formattingUpdates: Doc[] = [];
const formattingUpdates: FormattingUpdate[] = [];
const embeddingUpdates: EmbeddingUpdate[] = [];

for await (const { mdn_url, title, title_short, markdown, text } of builtDocs(
directory
@@ -122,6 +138,7 @@ export async function updateEmbeddings(
.digest("base64");

if (existingDoc?.text_hash !== text_hash) {
// Document added or content changed => (re)generate embeddings.
updates.push({
mdn_url,
title,
@@ -131,31 +148,55 @@ export async function updateEmbeddings(
text,
text_hash,
});
-} else if (
-updateFormatting ||
-existingDoc?.markdown_hash !== markdown_hash
-) {
-formattingUpdates.push({
-mdn_url,
-title,
-title_short,
-markdown,
-markdown_hash,
-});
} else {
if (updateFormatting || existingDoc?.markdown_hash !== markdown_hash) {
// Document formatting changed => update markdown.
formattingUpdates.push({
mdn_url,
title,
title_short,
markdown,
markdown_hash,
});
}

if (
!existingDoc.has_embedding ||
!existingDoc.has_embedding_next !== !EMBEDDING_MODEL_NEXT
) {
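// i.e. the primary embedding is missing, or the presence of embedding_next
// does not match whether EMBEDDING_MODEL_NEXT is currently configured.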
// Embedding missing => add embeddings.
const { has_embedding, has_embedding_next } = existingDoc;
embeddingUpdates.push({
mdn_url,
text,
has_embedding,
has_embedding_next,
});
}
}
}

console.log(
`-> ${updates.length} (${formattingUpdates.length}) of ${seenUrls.size} documents were changed or added (or formatted).`
);
if (embeddingUpdates.length > 0) {
console.log(
`-> ${embeddingUpdates.length} documents have outdated embeddings.`
);
}

const deletions: IndexedDoc[] = [...existingDocByUrl.entries()]
.filter(([key]) => !seenUrls.has(key))
.map(([, value]) => value);
console.log(
`-> ${deletions.length} of ${existingDocs.length} indexed documents were deleted (or moved).`
);

-if (updates.length > 0 || formattingUpdates.length > 0) {
if (
updates.length > 0 ||
formattingUpdates.length > 0 ||
embeddingUpdates.length > 0
) {
console.log(`Applying updates...`);
for (const {
mdn_url,
@@ -170,7 +211,16 @@ export async function updateEmbeddings(
console.log(`-> [${mdn_url}] Updating document...`);

// Embedding for full document.
-const { total_tokens, embedding } = await createEmbedding(text);
const [{ total_tokens, embedding }, embedding_next] = await Promise.all(
[
createEmbedding(text, EMBEDDING_MODEL),
EMBEDDING_MODEL_NEXT
? createEmbedding(text, EMBEDDING_MODEL_NEXT).then(
({ embedding }) => embedding
)
: null,
]
);

// Create/update document record.
const query = {
@@ -184,9 +234,10 @@ export async function updateEmbeddings(
markdown_hash,
token_count,
embedding,
embedding_next,
text_hash
)
-VALUES($1, $2, $3, $4, $5, $6, $7, $8) ON CONFLICT (mdn_url) DO
VALUES($1, $2, $3, $4, $5, $6, $7, $8, $9) ON CONFLICT (mdn_url) DO
UPDATE
SET mdn_url = $1,
title = $2,
@@ -195,7 +246,8 @@ export async function updateEmbeddings(
markdown_hash = $5,
token_count = $6,
embedding = $7,
-text_hash = $8
embedding_next = $8,
text_hash = $9
`,
values: [
mdn_url,
@@ -205,6 +257,7 @@ export async function updateEmbeddings(
markdown_hash,
total_tokens,
pgvector.toSql(embedding),
embedding_next ? pgvector.toSql(embedding_next) : null,
text_hash,
],
rowMode: "array",
@@ -217,6 +270,7 @@ export async function updateEmbeddings(
console.error(context);
}
}

for (const {
mdn_url,
title,
@@ -253,6 +307,57 @@ export async function updateEmbeddings(
console.error(context);
}
}

for (const {
mdn_url,
text,
has_embedding,
has_embedding_next,
} of embeddingUpdates) {
try {
console.log(`-> [${mdn_url}] Updating embeddings...`);

if (!has_embedding) {
const { total_tokens, embedding } = await createEmbedding(
text,
EMBEDDING_MODEL
);

const query = {
name: "upsert-doc-embedding",
text: "UPDATE mdn_doc_macro SET total_tokens = $2, embedding = $3 WHERE mdn_url = $1",
values: [
mdn_url,
total_tokens,
embedding ? pgvector.toSql(embedding) : null,
],
rowMode: "array",
};

await pgClient.query(query);
}

if (!has_embedding_next) {
const embedding = EMBEDDING_MODEL_NEXT
? (await createEmbedding(text, EMBEDDING_MODEL_NEXT)).embedding
: null;

const query = {
name: "upsert-doc-embedding-next",
text: "UPDATE mdn_doc_macro SET embedding_next = $2 WHERE mdn_url = $1",
values: [mdn_url, embedding ? pgvector.toSql(embedding) : null],
rowMode: "array",
};

await pgClient.query(query);
}
} catch (err: any) {
console.error(`!> [${mdn_url}] Failed to add embeddings.`);
const context = err?.response?.data ?? err?.response ?? err;
console.error(context);
}
}

console.log(`-> Done.`);
}

@@ -508,6 +613,8 @@ async function fetchAllExistingDocs(pgClient): Promise<IndexedDoc[]> {
mdn_url,
title,
token_count,
embedding IS NOT NULL as has_embedding,
embedding_next IS NOT NULL as has_embedding_next,
markdown_hash,
text_hash
from mdn_doc_macro
@@ -520,12 +627,23 @@ async function fetchAllExistingDocs(pgClient): Promise<IndexedDoc[]> {
};
const result = await pgClient.query(query);
return result.rows.map(
-([id, mdn_url, title, token_count, markdown_hash, text_hash]) => {
([
id,
mdn_url,
title,
token_count,
has_embedding,
has_embedding_next,
markdown_hash,
text_hash,
]) => {
return {
id,
mdn_url,
title,
token_count,
has_embedding,
has_embedding_next,
markdown_hash,
text_hash,
};
3 changes: 2 additions & 1 deletion scripts/ai-help.sql
@@ -34,7 +34,8 @@ create table
html text null,
markdown text null,
token_count integer null,
-embedding extensions.vector null,
embedding extensions.vector(1536) null,
embedding_next extensions.vector(1536) null,
text_hash text null,
constraint mdn_doc_macro_pkey primary key (id),
constraint mdn_doc_macro_url_key unique (mdn_url)
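
Both text-embedding-ada-002 and text-embedding-3-small return 1536-dimensional vectors by default, hence vector(1536) for both columns. As a purely illustrative sketch (not part of this commit), a nearest-neighbour lookup against the new embedding_next column could reuse the same openai, pg and pgvector helpers as the indexing script; the cosine-distance operator (<=>) and the row limit here are assumptions, not the actual AI Help retrieval query:

```ts
import OpenAI from "openai";
import pgvector from "pgvector/pg";
import { Client } from "pg";

const EMBEDDING_MODEL_NEXT = "text-embedding-3-small";

// Hypothetical lookup: embed the question with the v3 model and return the
// closest documents by cosine distance on the embedding_next column.
async function similarDocs(pgClient: Client, openai: OpenAI, question: string) {
  const {
    data: [{ embedding }],
  } = await openai.embeddings.create({
    model: EMBEDDING_MODEL_NEXT,
    input: question,
  });

  const { rows } = await pgClient.query(
    `SELECT mdn_url, title
       FROM mdn_doc_macro
      WHERE embedding_next IS NOT NULL
      ORDER BY embedding_next <=> $1
      LIMIT 5`,
    [pgvector.toSql(embedding)]
  );
  return rows;
}
```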
