From 337d0b154b476e17d73a0e0a866f645add31bd37 Mon Sep 17 00:00:00 2001 From: Florian Dieminger Date: Tue, 30 Jan 2024 11:49:43 +0100 Subject: [PATCH] feat(ai-help): index content as markdown (#10330) Convert the document for context to markdown. This reduces the token count by ~18% on average. Co-authored-by: Claas Augner --- markdown/h2m/index.ts | 15 ++++++++ markdown/index.ts | 1 + package.json | 3 ++ scripts/ai-help-macros.ts | 44 ++++++++++++++++------- scripts/ai-help.sql | 1 + yarn.lock | 75 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 126 insertions(+), 13 deletions(-) create mode 100644 markdown/h2m/index.ts diff --git a/markdown/h2m/index.ts b/markdown/h2m/index.ts new file mode 100644 index 000000000000..d6fb26e9a98b --- /dev/null +++ b/markdown/h2m/index.ts @@ -0,0 +1,15 @@ +import { unified } from "unified"; +import rehypeParse from "rehype-parse"; +import rehypeRemark from "rehype-remark"; +import remarkStringify from "remark-stringify"; +import remarkGfm from "remark-gfm"; + +export function h2mSync(html: string) { + const file = unified() + .use(rehypeParse) + .use(rehypeRemark) + .use(remarkGfm) + .use(remarkStringify) + .processSync(html); + return String(file); +} diff --git a/markdown/index.ts b/markdown/index.ts index ccfbace1718c..8142c76a8bae 100644 --- a/markdown/index.ts +++ b/markdown/index.ts @@ -1,2 +1,3 @@ export * from "./utils/index.js"; export * from "./m2h/index.js"; +export * from "./h2m/index.js"; diff --git a/package.json b/package.json index a9928caacab8..369170a778c8 100644 --- a/package.json +++ b/package.json @@ -126,12 +126,15 @@ "react-modal": "^3.16.1", "read-chunk": "^4.0.3", "rehype-format": "^5.0.0", + "rehype-parse": "^9.0.0", "rehype-raw": "^7.0.0", + "rehype-remark": "^10.0.0", "rehype-sanitize": "^6.0.0", "rehype-stringify": "^10.0.0", "remark-gfm": "^4.0.0", "remark-parse": "^11.0.0", "remark-rehype": "^11.1.0", + "remark-stringify": "^11.0.0", "sanitize-filename": "^1.6.3", "send": "^0.18.0", "source-map-support": "^0.5.21", diff --git a/scripts/ai-help-macros.ts b/scripts/ai-help-macros.ts index 0682e03b0149..c3d933ba7890 100644 --- a/scripts/ai-help-macros.ts +++ b/scripts/ai-help-macros.ts @@ -20,6 +20,7 @@ import { SimpleSupportStatement, VersionValue, } from "@mdn/browser-compat-data/types"; +import { h2mSync } from "../markdown/index.js"; const { program } = caporal; @@ -37,6 +38,7 @@ interface Doc { title: string; hash: string; html: string; + markdown: string; text?: string; text_hash?: string; } @@ -106,7 +108,7 @@ export async function updateEmbeddings( const updates: Doc[] = []; const formattingUpdates: Doc[] = []; - for await (const { mdn_url, title, hash, html, text } of builtDocs( + for await (const { mdn_url, title, hash, html, markdown, text } of builtDocs( directory )) { seenUrls.add(mdn_url); @@ -122,6 +124,7 @@ export async function updateEmbeddings( title, hash, html, + markdown, text, text_hash, }); @@ -131,6 +134,7 @@ export async function updateEmbeddings( title, hash, html, + markdown, }); } } @@ -147,7 +151,15 @@ export async function updateEmbeddings( if (updates.length > 0 || formattingUpdates.length > 0) { console.log(`Applying updates...`); - for (const { mdn_url, title, hash, html, text, text_hash } of updates) { + for (const { + mdn_url, + title, + hash, + html, + markdown, + text, + text_hash, + } of updates) { try { console.log(`-> [${mdn_url}] Updating document...`); @@ -163,25 +175,28 @@ export async function updateEmbeddings( title, hash, html, + markdown, token_count, embedding, text_hash ) - VALUES($1, $2, $3, $4, $5, $6, $7) ON CONFLICT (mdn_url) DO + VALUES($1, $2, $3, $4, $5, $6, $7, $8) ON CONFLICT (mdn_url) DO UPDATE SET mdn_url = $1, title = $2, hash = $3, html = $4, - token_count = $5, - embedding = $6, - text_hash = $7 + markdown = $5, + token_count = $6, + embedding = $7, + text_hash = $8 `, values: [ mdn_url, title, hash, html, + markdown, total_tokens, pgvector.toSql(embedding), text_hash, @@ -196,7 +211,7 @@ export async function updateEmbeddings( console.error(context); } } - for (const { mdn_url, title, hash, html } of formattingUpdates) { + for (const { mdn_url, title, hash, html, markdown } of formattingUpdates) { try { console.log( `-> [${mdn_url}] Updating document without generating new embedding...` @@ -206,15 +221,16 @@ export async function updateEmbeddings( const query = { name: "upsert-doc", text: ` - INSERT INTO mdn_doc_macro(mdn_url, title, hash, html) - VALUES($1, $2, $3, $4) ON CONFLICT (mdn_url) DO + INSERT INTO mdn_doc_macro(mdn_url, title, hash, html, markdown) + VALUES($1, $2, $3, $4, $5) ON CONFLICT (mdn_url) DO UPDATE SET mdn_url = $1, title = $2, hash = $3, - html = $4 + html = $4, + markdown = $5 `, - values: [mdn_url, title, hash, html], + values: [mdn_url, title, hash, html, markdown], rowMode: "array", }; @@ -247,8 +263,8 @@ export async function updateEmbeddings( } async function formatDocs(directory: string) { - for await (const { html, text } of builtDocs(directory)) { - console.log(html, text); + for await (const { html, markdown, text } of builtDocs(directory)) { + console.log(html, markdown, text); } } @@ -288,6 +304,7 @@ async function* builtDocs(directory: string) { $(el).replaceWith(buildBCDTable($(el).data("query") as string)); }); const html = $.html(); + const markdown = h2mSync(html); // reformat text version, used for embedding $("title").remove(); @@ -299,6 +316,7 @@ async function* builtDocs(directory: string) { title, hash, html, + markdown, text, }; } catch (e) { diff --git a/scripts/ai-help.sql b/scripts/ai-help.sql index a558f4317caf..98f5ac782313 100644 --- a/scripts/ai-help.sql +++ b/scripts/ai-help.sql @@ -31,6 +31,7 @@ create table title text not null, mdn_url text not null, html text null, + markdown text null, token_count integer null, embedding extensions.vector null, text_hash text null, diff --git a/yarn.lock b/yarn.lock index 26a4e64b1b29..8dbe0c9e4387 100644 --- a/yarn.lock +++ b/yarn.lock @@ -7754,6 +7754,18 @@ hast-util-embedded@^3.0.0: "@types/hast" "^3.0.0" hast-util-is-element "^3.0.0" +hast-util-from-html@^2.0.0: + version "2.0.1" + resolved "https://registry.yarnpkg.com/hast-util-from-html/-/hast-util-from-html-2.0.1.tgz#9cd38ee81bf40b2607368b92a04b0905fa987488" + integrity sha512-RXQBLMl9kjKVNkJTIO6bZyb2n+cUH8LFaSSzo82jiLT6Tfc+Pt7VQCS+/h3YwG4jaNE2TA2sdJisGWR+aJrp0g== + dependencies: + "@types/hast" "^3.0.0" + devlop "^1.1.0" + hast-util-from-parse5 "^8.0.0" + parse5 "^7.0.0" + vfile "^6.0.0" + vfile-message "^4.0.0" + hast-util-from-parse5@^8.0.0: version "8.0.1" resolved "https://registry.yarnpkg.com/hast-util-from-parse5/-/hast-util-from-parse5-8.0.1.tgz#654a5676a41211e14ee80d1b1758c399a0327651" @@ -7868,6 +7880,26 @@ hast-util-to-jsx-runtime@^2.0.0: unist-util-position "^5.0.0" vfile-message "^4.0.0" +hast-util-to-mdast@^10.0.0: + version "10.1.0" + resolved "https://registry.yarnpkg.com/hast-util-to-mdast/-/hast-util-to-mdast-10.1.0.tgz#906c80fc263a9f09a33462317ffc6ad94f4ee3db" + integrity sha512-DsL/SvCK9V7+vfc6SLQ+vKIyBDXTk2KLSbfBYkH4zeF/uR1yBajHRhkzuaUSGOB1WJSTieJBdHwxlC+HLKvZZw== + dependencies: + "@types/hast" "^3.0.0" + "@types/mdast" "^4.0.0" + "@ungap/structured-clone" "^1.0.0" + hast-util-phrasing "^3.0.0" + hast-util-to-html "^9.0.0" + hast-util-to-text "^4.0.0" + hast-util-whitespace "^3.0.0" + mdast-util-phrasing "^4.0.0" + mdast-util-to-hast "^13.0.0" + mdast-util-to-string "^4.0.0" + rehype-minify-whitespace "^6.0.0" + trim-trailing-lines "^2.0.0" + unist-util-position "^5.0.0" + unist-util-visit "^5.0.0" + hast-util-to-parse5@^8.0.0: version "8.0.0" resolved "https://registry.yarnpkg.com/hast-util-to-parse5/-/hast-util-to-parse5-8.0.0.tgz#477cd42d278d4f036bc2ea58586130f6f39ee6ed" @@ -7881,6 +7913,16 @@ hast-util-to-parse5@^8.0.0: web-namespaces "^2.0.0" zwitch "^2.0.0" +hast-util-to-text@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/hast-util-to-text/-/hast-util-to-text-4.0.0.tgz#7f33a45d0bf7981ead44e82d9d8d75f511b3642f" + integrity sha512-EWiE1FSArNBPUo1cKWtzqgnuRQwEeQbQtnFJRYV1hb1BWDgrAlBU0ExptvZMM/KSA82cDpm2sFGf3Dmc5Mza3w== + dependencies: + "@types/hast" "^3.0.0" + "@types/unist" "^3.0.0" + hast-util-is-element "^3.0.0" + unist-util-find-after "^5.0.0" + hast-util-whitespace@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz#7778ed9d3c92dd9e8c5c8f648a49c21fc51cb621" @@ -12870,6 +12912,15 @@ rehype-minify-whitespace@^6.0.0: hast-util-whitespace "^3.0.0" unist-util-is "^6.0.0" +rehype-parse@^9.0.0: + version "9.0.0" + resolved "https://registry.yarnpkg.com/rehype-parse/-/rehype-parse-9.0.0.tgz#3949faeec6f466ec57774215661e0d75469195d9" + integrity sha512-WG7nfvmWWkCR++KEkZevZb/uw41E8TsH4DsY9UxsTbIXCVGbAs4S+r8FrQ+OtH5EEQAs+5UxKC42VinkmpA1Yw== + dependencies: + "@types/hast" "^3.0.0" + hast-util-from-html "^2.0.0" + unified "^11.0.0" + rehype-raw@^7.0.0: version "7.0.0" resolved "https://registry.yarnpkg.com/rehype-raw/-/rehype-raw-7.0.0.tgz#59d7348fd5dbef3807bbaa1d443efd2dd85ecee4" @@ -12879,6 +12930,17 @@ rehype-raw@^7.0.0: hast-util-raw "^9.0.0" vfile "^6.0.0" +rehype-remark@^10.0.0: + version "10.0.0" + resolved "https://registry.yarnpkg.com/rehype-remark/-/rehype-remark-10.0.0.tgz#de15bf1f920ce519291848cd0d99aabaad44cf71" + integrity sha512-+aDXY/icqMFOafJQomVjxe3BAP7aR3lIsQ3GV6VIwpbCD2nvNFOXjGvotMe5p0Ny+Gt6L13DhEf/FjOOpTuUbQ== + dependencies: + "@types/hast" "^3.0.0" + "@types/mdast" "^4.0.0" + hast-util-to-mdast "^10.0.0" + unified "^11.0.0" + vfile "^6.0.0" + rehype-sanitize@^6.0.0: version "6.0.0" resolved "https://registry.yarnpkg.com/rehype-sanitize/-/rehype-sanitize-6.0.0.tgz#16e95f4a67a69cbf0f79e113c8e0df48203db73c" @@ -14686,6 +14748,11 @@ trim-repeated@^1.0.0: dependencies: escape-string-regexp "^1.0.2" +trim-trailing-lines@^2.0.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/trim-trailing-lines/-/trim-trailing-lines-2.1.0.tgz#9aac7e89b09cb35badf663de7133c6de164f86df" + integrity sha512-5UR5Biq4VlVOtzqkm2AZlgvSlDJtME46uV0br0gENbwN4l5+mMKT4b9gJKqWtuL2zAIqajGJGuvbCbcAJUZqBg== + triple-beam@^1.3.0: version "1.3.0" resolved "https://registry.yarnpkg.com/triple-beam/-/triple-beam-1.3.0.tgz#a595214c7298db8339eeeee083e4d10bd8cb8dd9" @@ -15002,6 +15069,14 @@ unist-builder@^4.0.0: dependencies: "@types/unist" "^3.0.0" +unist-util-find-after@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/unist-util-find-after/-/unist-util-find-after-5.0.0.tgz#3fccc1b086b56f34c8b798e1ff90b5c54468e896" + integrity sha512-amQa0Ep2m6hE2g72AugUItjbuM8X8cGQnFoHk0pGfrFeT9GZhzN5SW8nRsiGKK7Aif4CrACPENkA6P/Lw6fHGQ== + dependencies: + "@types/unist" "^3.0.0" + unist-util-is "^6.0.0" + unist-util-is@^5.0.0: version "5.1.1" resolved "https://registry.yarnpkg.com/unist-util-is/-/unist-util-is-5.1.1.tgz#e8aece0b102fa9bc097b0fef8f870c496d4a6236"