From 5ce3a65e320b6005973be804c597283b2ff002f2 Mon Sep 17 00:00:00 2001 From: Florian Dieminger Date: Mon, 29 Jan 2024 22:07:07 +0100 Subject: [PATCH] feat(scripts): use pg instead of supabase (#10337) * feat(scripts): use pg instead of supabase * fix error destructing * use pg uri to connect * log formatted updates --- .github/workflows/prod-build.yml | 3 +- .github/workflows/stage-build.yml | 3 +- libs/env/index.d.ts | 1 + libs/env/index.js | 1 + package.json | 2 + scripts/ai-help-macros.ts | 187 ++++++++++++++++++------------ yarn.lock | 100 ++++++++++++++++ 7 files changed, 218 insertions(+), 79 deletions(-) diff --git a/.github/workflows/prod-build.yml b/.github/workflows/prod-build.yml index c258e46cb44d..f21b85c18a56 100644 --- a/.github/workflows/prod-build.yml +++ b/.github/workflows/prod-build.yml @@ -384,8 +384,7 @@ jobs: run: yarn ai-help-macros update-index env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} - SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + PG_URI: ${{ secrets.PG_URI }} - name: Slack Notification if: failure() diff --git a/.github/workflows/stage-build.yml b/.github/workflows/stage-build.yml index 899061e6aae8..c74e6035d6c0 100644 --- a/.github/workflows/stage-build.yml +++ b/.github/workflows/stage-build.yml @@ -377,8 +377,7 @@ jobs: run: yarn ai-help-macros update-index env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} - SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + PG_URI: ${{ secrets.PG_URI }} - name: Slack Notification if: failure() diff --git a/libs/env/index.d.ts b/libs/env/index.d.ts index 540d10b8f90a..d8c5b3b57e4e 100644 --- a/libs/env/index.d.ts +++ b/libs/env/index.d.ts @@ -30,6 +30,7 @@ export const OFFLINE_CONTENT: boolean; export const FAKE_V1_API: boolean; export const SENTRY_DSN_BUILD: string; export const OPENAI_KEY: string; +export const PG_URI: string; export const SUPABASE_URL: string; export const SUPABASE_SERVICE_ROLE_KEY: string; export const SAMPLE_SIGN_KEY: Buffer; diff --git a/libs/env/index.js b/libs/env/index.js index accd2e289781..9593cd7327a5 100644 --- a/libs/env/index.js +++ b/libs/env/index.js @@ -168,6 +168,7 @@ export const FAKE_V1_API = JSON.parse(process.env.SERVER_FAKE_V1_API || false); // ---- export const OPENAI_KEY = process.env.OPENAI_KEY || ""; +export const PG_URI = process.env.PG_URI || ""; export const SUPABASE_URL = process.env.SUPABASE_URL || ""; export const SUPABASE_SERVICE_ROLE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY || ""; diff --git a/package.json b/package.json index ca53e5af15dd..a9928caacab8 100644 --- a/package.json +++ b/package.json @@ -118,6 +118,8 @@ "open": "^10.0.3", "open-editor": "^4.1.1", "openai": "^4.26.0", + "pg": "^8.11.3", + "pgvector": "^0.1.7", "prism-svelte": "^0.5.0", "prismjs": "^1.29.0", "react-markdown": "^9.0.1", diff --git a/scripts/ai-help-macros.ts b/scripts/ai-help-macros.ts index 3098df6d669b..0682e03b0149 100644 --- a/scripts/ai-help-macros.ts +++ b/scripts/ai-help-macros.ts @@ -2,18 +2,14 @@ import { createHash } from "node:crypto"; import { readFile } from "node:fs/promises"; import caporal from "@caporal/core"; -import { SupabaseClient, createClient } from "@supabase/supabase-js"; +import pg from "pg"; +import pgvector from "pgvector/pg"; import { fdir } from "fdir"; import OpenAI from "openai"; import { load as cheerio } from "cheerio"; import { DocMetadata } from "../libs/types/document.js"; -import { - BUILD_OUT_ROOT, - OPENAI_KEY, - SUPABASE_SERVICE_ROLE_KEY, - SUPABASE_URL, -} from "../libs/env/index.js"; +import { BUILD_OUT_ROOT, OPENAI_KEY, PG_URI } from "../libs/env/index.js"; import { getBCDDataForPath, SimpleSupportStatementExtended, @@ -49,14 +45,18 @@ export async function updateEmbeddings( directory: string, updateFormatting: boolean ) { - if (!OPENAI_KEY || !SUPABASE_URL || !SUPABASE_SERVICE_ROLE_KEY) { - throw Error( - "Please set these environment variables: OPENAI_KEY, SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY" - ); + if (!OPENAI_KEY || !PG_URI) { + throw Error("Please set these environment variables: OPENAI_KEY, PG_URI"); } - // Supabase. - const supabaseClient = createClient(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY); + // Postgres. + const pgClient = new pg.Client({ + connectionString: PG_URI, + }); + + await pgClient.connect(); + await pgClient.query("CREATE EXTENSION IF NOT EXISTS vector"); + await pgvector.registerType(pgClient); // Open AI. const openai = new OpenAI({ @@ -70,16 +70,9 @@ export async function updateEmbeddings( model: "text-embedding-ada-002", input, }); - } catch (e: any) { - const { - data: { - error: { message, type }, - }, - status, - statusText, - } = e.response; + } catch ({ error: { message, type }, status }: any) { console.error( - `[!] Failed to create embedding (${status} ${statusText}): ${type} - ${message}` + `[!] Failed to create embedding (${status}): ${type} - ${message}` ); // Try again with trimmed content. embeddingResponse = await openai.embeddings.create({ @@ -100,7 +93,7 @@ export async function updateEmbeddings( }; console.log(`Retrieving all indexed documents...`); - const existingDocs = await fetchAllExistingDocs(supabaseClient); + const existingDocs = await fetchAllExistingDocs(pgClient); console.log(`-> Done.`); const existingDocByUrl = new Map( @@ -143,7 +136,7 @@ export async function updateEmbeddings( } console.log( - `-> ${updates.length} of ${seenUrls.size} documents were changed (or added).` + `-> ${updates.length} (${formattingUpdates.length}) of ${seenUrls.size} documents were changed or added (or formatted).` ); const deletions: IndexedDoc[] = [...existingDocByUrl.entries()] .filter(([key]) => !seenUrls.has(key)) @@ -162,23 +155,41 @@ export async function updateEmbeddings( const { total_tokens, embedding } = await createEmbedding(text); // Create/update document record. - await supabaseClient - .from("mdn_doc_macro") - .upsert( - { - mdn_url, - title, - hash, - html, - token_count: total_tokens, - embedding, - text_hash, - }, - { onConflict: "mdn_url" } - ) - .select() - .single() - .throwOnError(); + const query = { + name: "upsert-embedding-doc", + text: ` + INSERT INTO mdn_doc_macro( + mdn_url, + title, + hash, + html, + token_count, + embedding, + text_hash + ) + VALUES($1, $2, $3, $4, $5, $6, $7) ON CONFLICT (mdn_url) DO + UPDATE + SET mdn_url = $1, + title = $2, + hash = $3, + html = $4, + token_count = $5, + embedding = $6, + text_hash = $7 + `, + values: [ + mdn_url, + title, + hash, + html, + total_tokens, + pgvector.toSql(embedding), + text_hash, + ], + rowMode: "array", + }; + + await pgClient.query(query); } catch (err: any) { console.error(`!> [${mdn_url}] Failed to update document.`); const context = err?.response?.data ?? err?.response ?? err; @@ -192,20 +203,22 @@ export async function updateEmbeddings( ); // Create/update document record. - await supabaseClient - .from("mdn_doc_macro") - .upsert( - { - mdn_url, - title, - hash, - html, - }, - { onConflict: "mdn_url" } - ) - .select() - .single() - .throwOnError(); + const query = { + name: "upsert-doc", + text: ` + INSERT INTO mdn_doc_macro(mdn_url, title, hash, html) + VALUES($1, $2, $3, $4) ON CONFLICT (mdn_url) DO + UPDATE + SET mdn_url = $1, + title = $2, + hash = $3, + html = $4 + `, + values: [mdn_url, title, hash, html], + rowMode: "array", + }; + + await pgClient.query(query); } catch (err: any) { console.error(`!> [${mdn_url}] Failed to update document.`); const context = err?.response?.data ?? err?.response ?? err; @@ -219,14 +232,18 @@ export async function updateEmbeddings( console.log(`Applying deletions...`); for (const { id, mdn_url } of deletions) { console.log(`-> [${mdn_url}] Deleting indexed document...`); - await supabaseClient - .from("mdn_doc_macro") - .delete() - .eq("id", id) - .throwOnError(); + const query = { + name: "delete-doc", + text: `DELETE from mdn_doc_macro WHERE id = $1`, + values: [id], + rowMode: "array", + }; + + await pgClient.query(query); } console.log(`-> Done.`); } + pgClient.end(); } async function formatDocs(directory: string) { @@ -449,24 +466,44 @@ export function isNotSupportedAtAll(support: SimpleSupportStatement) { return !support.version_added && !hasLimitation(support); } -async function fetchAllExistingDocs(supabase: SupabaseClient) { +async function fetchAllExistingDocs(pgClient) { const PAGE_SIZE = 1000; - const selectDocs = () => - supabase - .from("mdn_doc_macro") - .select("id, mdn_url, title, hash, token_count, text_hash") - .order("id") - .limit(PAGE_SIZE); - - let { data } = await selectDocs().throwOnError(); - let allData = data; - while (data.length === PAGE_SIZE) { - const lastItem = data[data.length - 1]; - ({ data } = await selectDocs().gt("id", lastItem.id).throwOnError()); - allData = [...allData, ...data]; + const selectDocs = async (lastId) => { + const query = { + name: "fetch-all-doc", + text: ` + SELECT id, + mdn_url, + title, + hash, + token_count, + text_hash + from mdn_doc_macro + WHERE id > $1 + ORDER BY id ASC + LIMIT $2 + `, + values: [lastId, PAGE_SIZE], + rowMode: "array", + }; + const result = await pgClient.query(query); + return result.rows.map( + ([id, mdn_url, title, hash, token_count, text_hash]) => { + return { id, mdn_url, title, hash, token_count, text_hash }; + } + ); + }; + + const allDocs = []; + let docs = await selectDocs(0); + allDocs.push(...docs); + while (docs.length === PAGE_SIZE) { + const lastItem = docs[docs.length - 1]; + docs = await selectDocs(lastItem.id); + allDocs.push(...docs); } - return allData; + return allDocs; } // CLI. diff --git a/yarn.lock b/yarn.lock index 822b08469e38..26a4e64b1b29 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4357,6 +4357,11 @@ buffer-from@^1.0.0: resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.2.tgz#2b146a6fd72e80b4f55d255f35ed59a3a9a41bd5" integrity sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ== +buffer-writer@2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/buffer-writer/-/buffer-writer-2.0.0.tgz#ce7eb81a38f7829db09c873f2fbb792c0c98ec04" + integrity sha512-a7ZpuTZU1TRtnwyCNW3I5dc0wWNC3VR9S++Ewyk2HHZdrO3CQJqSpd+95Us590V6AL7JqUAH2IwZ/398PmNFgw== + buffer@^5.2.1, buffer@^5.5.0: version "5.7.1" resolved "https://registry.yarnpkg.com/buffer/-/buffer-5.7.1.tgz#ba62e7c13133053582197160851a8f648e99eed0" @@ -11326,6 +11331,11 @@ p-try@^2.0.0: resolved "https://registry.yarnpkg.com/p-try/-/p-try-2.2.0.tgz#cb2868540e313d61de58fafbe35ce9004d5540e6" integrity sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ== +packet-reader@1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/packet-reader/-/packet-reader-1.0.0.tgz#9238e5480dedabacfe1fe3f2771063f164157d74" + integrity sha512-HAKu/fG3HpHFO0AA8WE8q2g+gBJaZ9MG7fcKk+IJPLTGAD6Psw4443l+9DGRbOIh3/aXr7Phy0TjilYivJo5XQ== + param-case@^3.0.4: version "3.0.4" resolved "https://registry.yarnpkg.com/param-case/-/param-case-3.0.4.tgz#7d17fe4aa12bde34d4a77d91acfb6219caad01c5" @@ -11473,6 +11483,69 @@ performance-now@^2.1.0: resolved "https://registry.yarnpkg.com/performance-now/-/performance-now-2.1.0.tgz#6309f4e0e5fa913ec1c69307ae364b4b377c9e7b" integrity sha512-7EAHlyLHI56VEIdK57uwHdHKIaAGbnXPiw0yWbarQZOKaKpvUIgW0jWRVLiatnM+XXlSwsanIBH/hzGMJulMow== +pg-cloudflare@^1.1.1: + version "1.1.1" + resolved "https://registry.yarnpkg.com/pg-cloudflare/-/pg-cloudflare-1.1.1.tgz#e6d5833015b170e23ae819e8c5d7eaedb472ca98" + integrity sha512-xWPagP/4B6BgFO+EKz3JONXv3YDgvkbVrGw2mTo3D6tVDQRh1e7cqVGvyR3BE+eQgAvx1XhW/iEASj4/jCWl3Q== + +pg-connection-string@^2.6.2: + version "2.6.2" + resolved "https://registry.yarnpkg.com/pg-connection-string/-/pg-connection-string-2.6.2.tgz#713d82053de4e2bd166fab70cd4f26ad36aab475" + integrity sha512-ch6OwaeaPYcova4kKZ15sbJ2hKb/VP48ZD2gE7i1J+L4MspCtBMAx8nMgz7bksc7IojCIIWuEhHibSMFH8m8oA== + +pg-int8@1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/pg-int8/-/pg-int8-1.0.1.tgz#943bd463bf5b71b4170115f80f8efc9a0c0eb78c" + integrity sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw== + +pg-pool@^3.6.1: + version "3.6.1" + resolved "https://registry.yarnpkg.com/pg-pool/-/pg-pool-3.6.1.tgz#5a902eda79a8d7e3c928b77abf776b3cb7d351f7" + integrity sha512-jizsIzhkIitxCGfPRzJn1ZdcosIt3pz9Sh3V01fm1vZnbnCMgmGl5wvGGdNN2EL9Rmb0EcFoCkixH4Pu+sP9Og== + +pg-protocol@^1.6.0: + version "1.6.0" + resolved "https://registry.yarnpkg.com/pg-protocol/-/pg-protocol-1.6.0.tgz#4c91613c0315349363af2084608db843502f8833" + integrity sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q== + +pg-types@^2.1.0: + version "2.2.0" + resolved "https://registry.yarnpkg.com/pg-types/-/pg-types-2.2.0.tgz#2d0250d636454f7cfa3b6ae0382fdfa8063254a3" + integrity sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA== + dependencies: + pg-int8 "1.0.1" + postgres-array "~2.0.0" + postgres-bytea "~1.0.0" + postgres-date "~1.0.4" + postgres-interval "^1.1.0" + +pg@^8.11.3: + version "8.11.3" + resolved "https://registry.yarnpkg.com/pg/-/pg-8.11.3.tgz#d7db6e3fe268fcedd65b8e4599cda0b8b4bf76cb" + integrity sha512-+9iuvG8QfaaUrrph+kpF24cXkH1YOOUeArRNYIxq1viYHZagBxrTno7cecY1Fa44tJeZvaoG+Djpkc3JwehN5g== + dependencies: + buffer-writer "2.0.0" + packet-reader "1.0.0" + pg-connection-string "^2.6.2" + pg-pool "^3.6.1" + pg-protocol "^1.6.0" + pg-types "^2.1.0" + pgpass "1.x" + optionalDependencies: + pg-cloudflare "^1.1.1" + +pgpass@1.x: + version "1.0.5" + resolved "https://registry.yarnpkg.com/pgpass/-/pgpass-1.0.5.tgz#9b873e4a564bb10fa7a7dbd55312728d422a223d" + integrity sha512-FdW9r/jQZhSeohs1Z3sI1yxFQNFvMcnmfuj4WBMUTxOrAyLMaTcE1aAMBiTlbMNaXvBCQuVi0R7hd8udDSP7ug== + dependencies: + split2 "^4.1.0" + +pgvector@^0.1.7: + version "0.1.7" + resolved "https://registry.yarnpkg.com/pgvector/-/pgvector-0.1.7.tgz#0a170c85da8bae79d79cb09c1968d42d021489f5" + integrity sha512-hl1/Rvvu8iENi/0x4QcZh6o3bF2GWyyKIEu5GfXIzRhzhbOo7aQmoTPNWedRG1pziif+gOMTKKzHaJorgr8F0A== + picocolors@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-1.0.0.tgz#cb5bdc74ff3f51892236eaf79d68bc44564ab81c" @@ -12158,6 +12231,28 @@ postcss@^8.2.14, postcss@^8.4.23, postcss@^8.4.28, postcss@^8.4.32, postcss@^8.4 picocolors "^1.0.0" source-map-js "^1.0.2" +postgres-array@~2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/postgres-array/-/postgres-array-2.0.0.tgz#48f8fce054fbc69671999329b8834b772652d82e" + integrity sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA== + +postgres-bytea@~1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/postgres-bytea/-/postgres-bytea-1.0.0.tgz#027b533c0aa890e26d172d47cf9ccecc521acd35" + integrity sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w== + +postgres-date@~1.0.4: + version "1.0.7" + resolved "https://registry.yarnpkg.com/postgres-date/-/postgres-date-1.0.7.tgz#51bc086006005e5061c591cee727f2531bf641a8" + integrity sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q== + +postgres-interval@^1.1.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/postgres-interval/-/postgres-interval-1.2.0.tgz#b460c82cb1587507788819a06aa0fffdb3544695" + integrity sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ== + dependencies: + xtend "^4.0.0" + prelude-ls@^1.2.1: version "1.2.1" resolved "https://registry.yarnpkg.com/prelude-ls/-/prelude-ls-1.2.1.tgz#debc6489d7a6e6b0e7611888cec880337d316396" @@ -13650,6 +13745,11 @@ spdy@^4.0.2: select-hose "^2.0.0" spdy-transport "^3.0.0" +split2@^4.1.0: + version "4.2.0" + resolved "https://registry.yarnpkg.com/split2/-/split2-4.2.0.tgz#c9c5920904d148bab0b9f67145f245a86aadbfa4" + integrity sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg== + sprintf-js@~1.0.2: version "1.0.3" resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.0.3.tgz#04e6926f662895354f3dd015203633b857297e2c"