From 3ff6b3192c7e1c8533607194ae26892f3c7b1a92 Mon Sep 17 00:00:00 2001 From: Vladimir Aleksiev Date: Tue, 8 Oct 2024 09:45:06 +0300 Subject: [PATCH 1/8] @WIP generate document summaries --- package-lock.json | 303 +++++++++++++++++++++--- package.json | 12 +- src/callout.handlers/base.ts | 25 +- src/callout.handlers/link.collection.ts | 4 +- src/embed.ts | 14 ++ src/space.embed/embed.space.ts | 1 + src/summarize.ts | 13 + 7 files changed, 327 insertions(+), 45 deletions(-) create mode 100644 src/summarize.ts diff --git a/package-lock.json b/package-lock.json index 30c1cd7..1f216bf 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@alkemio/space-ingest", - "version": "0.9.0", + "version": "0.9.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@alkemio/space-ingest", - "version": "0.9.0", + "version": "0.9.1", "license": "EUPL-1.2", "dependencies": { "@alkemio/client-lib": "^0.31.0", @@ -15,6 +15,8 @@ "@graphql-codegen/typescript-graphql-request": "^4.5.3", "@graphql-codegen/typescript-operations": "^2.5.3", "@langchain/community": "^0.2.4", + "@langchain/langgraph": "^0.2.8", + "@langchain/mistralai": "^0.1.1", "@types/graphql-upload": "^8.0.11", "amqplib": "^0.10.4", "chromadb": "^1.8.1", @@ -3202,6 +3204,39 @@ } } }, + "node_modules/@langchain/community/node_modules/@langchain/core": { + "version": "0.2.34", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.34.tgz", + "integrity": "sha512-Hkveq1UcOjUj1DVn5erbqElyRj1t04NORSuSIZAJCtPO7EDkIqomjAarJ5+I5NUpQeIONgbOdnY9TkJ6cKUSVA==", + "dependencies": { + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.1.56-rc.1", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/community/node_modules/@langchain/core/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@langchain/community/node_modules/langchain": { "version": "0.2.3", "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.2.3.tgz", @@ -3447,20 +3482,20 @@ } }, "node_modules/@langchain/core": { - "version": "0.2.7", - "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.7.tgz", - "integrity": "sha512-FdFiNWhszFuUyAhYdY+l5DtPnAnWCAjXMnkLmUJ1J54NeUiUm7gy26Hnd4bkvaOQJ8ddHH/EX03ZwdoYfLv1jw==", + "version": "0.3.3", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.3.3.tgz", + "integrity": "sha512-WAtkmhbdl2T41qzimTzhb3pXCHQxO4onqxzPxgdf3KftQdTwLq0YYBDhozRMZLNAd/+cfH0ymZGaZSsnc9Ogsg==", + "peer": true, "dependencies": { "ansi-styles": "^5.0.0", "camelcase": "6", "decamelize": "1.2.0", "js-tiktoken": "^1.0.12", - "langsmith": "~0.1.30", - "ml-distance": "^4.0.0", + "langsmith": "^0.1.56", "mustache": "^4.2.0", "p-queue": "^6.6.2", "p-retry": "4", - "uuid": "^9.0.0", + "uuid": "^10.0.0", "zod": "^3.22.4", "zod-to-json-schema": "^3.22.3" }, @@ -3468,6 +3503,103 @@ "node": ">=18" } }, + "node_modules/@langchain/core/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "peer": true, + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@langchain/langgraph": { + "version": "0.2.8", + "resolved": "https://registry.npmjs.org/@langchain/langgraph/-/langgraph-0.2.8.tgz", + "integrity": "sha512-sQ3NqwZzdvILeiYQQCDCBFj+FLd3oBfg2sxMo3e5g7vd5+zd/hpK5+JRTHbsMZte0PTAlTbQ5YbfCC2D6K9AVw==", + "dependencies": { + "@langchain/langgraph-checkpoint": "~0.0.6", + "double-ended-queue": "^2.1.0-0", + "uuid": "^10.0.0", + "zod": "^3.23.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.31 <0.4.0" + } + }, + "node_modules/@langchain/langgraph-checkpoint": { + "version": "0.0.7", + "resolved": "https://registry.npmjs.org/@langchain/langgraph-checkpoint/-/langgraph-checkpoint-0.0.7.tgz", + "integrity": "sha512-D11m8143yn8O8FwinCxwxNF+1XFK/Au5rhp7ERBTJmaaojJk1N39TvSF/bvly7nNieKYh4hd0fqE6pnFGc228Q==", + "dependencies": { + "uuid": "^10.0.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.31 <0.4.0" + } + }, + "node_modules/@langchain/langgraph-checkpoint/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@langchain/langgraph/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@langchain/mistralai": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/@langchain/mistralai/-/mistralai-0.1.1.tgz", + "integrity": "sha512-gnHdQRfn+iBReKD0u1nydGqHgVOjnKHpd0Q2qEN61ZuxiqFOOauWYkrbyml7tzcOdMv2vUAr5+pjpXip+ez59w==", + "dependencies": { + "@mistralai/mistralai": "^0.4.0", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.4" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.21 <0.4.0" + } + }, + "node_modules/@langchain/mistralai/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@langchain/openai": { "version": "0.1.3", "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.1.3.tgz", @@ -3483,6 +3615,39 @@ "node": ">=18" } }, + "node_modules/@langchain/openai/node_modules/@langchain/core": { + "version": "0.2.34", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.34.tgz", + "integrity": "sha512-Hkveq1UcOjUj1DVn5erbqElyRj1t04NORSuSIZAJCtPO7EDkIqomjAarJ5+I5NUpQeIONgbOdnY9TkJ6cKUSVA==", + "dependencies": { + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.1.56-rc.1", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/openai/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@langchain/textsplitters": { "version": "0.0.3", "resolved": "https://registry.npmjs.org/@langchain/textsplitters/-/textsplitters-0.0.3.tgz", @@ -3495,6 +3660,47 @@ "node": ">=18" } }, + "node_modules/@langchain/textsplitters/node_modules/@langchain/core": { + "version": "0.2.34", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.34.tgz", + "integrity": "sha512-Hkveq1UcOjUj1DVn5erbqElyRj1t04NORSuSIZAJCtPO7EDkIqomjAarJ5+I5NUpQeIONgbOdnY9TkJ6cKUSVA==", + "dependencies": { + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.1.56-rc.1", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/textsplitters/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@mistralai/mistralai": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/@mistralai/mistralai/-/mistralai-0.4.0.tgz", + "integrity": "sha512-KmFzNro1RKxIFh19J3osmUQhucefBBauMXN5fa9doG6dT9OHR/moBvvn+riVlR7c0AVfuxO8Dfa03AyLYYzbyg==", + "dependencies": { + "node-fetch": "^2.6.7" + } + }, "node_modules/@nodelib/fs.scandir": { "version": "2.1.5", "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", @@ -3907,9 +4113,9 @@ "dev": true }, "node_modules/@types/uuid": { - "version": "9.0.8", - "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz", - "integrity": "sha512-jg+97EGIcY9AGHJJRaaPVgetKDsrTgbRjQ5Msgjh/DQKEFl0DtyRr/VCOyD1T2R1MNeWPK/u7JoGhlDZnKBAfA==" + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==" }, "node_modules/@types/wrap-ansi": { "version": "3.0.0", @@ -5639,6 +5845,11 @@ "url": "https://dotenvx.com" } }, + "node_modules/double-ended-queue": { + "version": "2.1.0-0", + "resolved": "https://registry.npmjs.org/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz", + "integrity": "sha512-+BNfZ+deCo8hMNpDqDnvT+c0XpJ5cUa6mqYq89bho2Ifze4URTqRkcwR399hWoTrTkbZ/XJYDgP6rc7pRgffEQ==" + }, "node_modules/dset": { "version": "3.1.3", "resolved": "https://registry.npmjs.org/dset/-/dset-3.1.3.tgz", @@ -7810,34 +8021,60 @@ } } }, + "node_modules/langchain/node_modules/@langchain/core": { + "version": "0.2.34", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.34.tgz", + "integrity": "sha512-Hkveq1UcOjUj1DVn5erbqElyRj1t04NORSuSIZAJCtPO7EDkIqomjAarJ5+I5NUpQeIONgbOdnY9TkJ6cKUSVA==", + "dependencies": { + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.1.56-rc.1", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/langchain/node_modules/@langchain/core/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/langchainhub": { "version": "0.0.11", "resolved": "https://registry.npmjs.org/langchainhub/-/langchainhub-0.0.11.tgz", "integrity": "sha512-WnKI4g9kU2bHQP136orXr2bcRdgz9iiTBpTN0jWt9IlScUKnJBoD0aa2HOzHURQKeQDnt2JwqVmQ6Depf5uDLQ==" }, "node_modules/langsmith": { - "version": "0.1.32", - "resolved": "https://registry.npmjs.org/langsmith/-/langsmith-0.1.32.tgz", - "integrity": "sha512-EUWHIH6fiOCGRYdzgwGoXwJxCMyUrL+bmUcxoVmkXoXoAGDOVinz8bqJLKbxotsQWqM64NKKsW85OTIutgNaMQ==", + "version": "0.1.60", + "resolved": "https://registry.npmjs.org/langsmith/-/langsmith-0.1.60.tgz", + "integrity": "sha512-xchy/7PynZTkYXhismEYc+0XuDNDTzreKIyc/V3ohq4vnG79Iu+nPjDifvtICLHPCXTU8KSVno+PJX39XwhSjg==", "dependencies": { - "@types/uuid": "^9.0.1", + "@types/uuid": "^10.0.0", "commander": "^10.0.1", "p-queue": "^6.6.2", "p-retry": "4", - "uuid": "^9.0.0" + "semver": "^7.6.3", + "uuid": "^10.0.0" }, "peerDependencies": { - "@langchain/core": "*", - "langchain": "*", "openai": "*" }, "peerDependenciesMeta": { - "@langchain/core": { - "optional": true - }, - "langchain": { - "optional": true - }, "openai": { "optional": true } @@ -7851,6 +8088,18 @@ "node": ">=14" } }, + "node_modules/langsmith/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/levn": { "version": "0.4.1", "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", @@ -9590,9 +9839,9 @@ "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==" }, "node_modules/semver": { - "version": "7.6.2", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.2.tgz", - "integrity": "sha512-FNAIBWCx9qcRhoHcgcJ0gvU7SN1lYU2ZXuSfl04bSC5OpvDHFyJCjdNHomPXxjQlCBU67YW64PzY7/VIEH7F2w==", + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", "bin": { "semver": "bin/semver.js" }, diff --git a/package.json b/package.json index f760f01..c1510ec 100755 --- a/package.json +++ b/package.json @@ -45,17 +45,19 @@ }, "dependencies": { "@alkemio/client-lib": "^0.31.0", - "@graphql-codegen/typescript-graphql-request": "^4.5.3", - "@graphql-codegen/typescript-operations": "^2.5.3", - "@types/graphql-upload": "^8.0.11", - "graphql": "^16.6.0", - "graphql-upload": "^16.0.1", "@azure/openai": "^1.0.0-beta.12", "@dotenvx/dotenvx": "^0.35.1", + "@graphql-codegen/typescript-graphql-request": "^4.5.3", + "@graphql-codegen/typescript-operations": "^2.5.3", "@langchain/community": "^0.2.4", + "@langchain/langgraph": "^0.2.8", + "@langchain/mistralai": "^0.1.1", + "@types/graphql-upload": "^8.0.11", "amqplib": "^0.10.4", "chromadb": "^1.8.1", "file-type": "^19.0.0", + "graphql": "^16.6.0", + "graphql-upload": "^16.0.1", "langchain": "^0.2.2", "mammoth": "^1.7.2", "officeparser": "^4.1.1", diff --git a/src/callout.handlers/base.ts b/src/callout.handlers/base.ts index a7abc4f..612687c 100644 --- a/src/callout.handlers/base.ts +++ b/src/callout.handlers/base.ts @@ -1,5 +1,5 @@ import { Logger } from 'winston'; -import { Callout, CalloutContribution } from '../generated/graphql'; +import { Callout } from '../generated/graphql'; import { Document } from 'langchain/document'; import generateDocument from '../generate.document'; @@ -49,15 +49,16 @@ export const baseHandler = async ( ]; logger.info(`Generating documents for Callout (${documentId}) contributions`); - // extra loop but will do for now - callout.contributions - ?.map((contribution: Partial) => { - let docLike; - if (!!contribution.link) { - docLike = contribution.link; - } else if (!!contribution.post) { - docLike = contribution.post; - } + + for (const contribution of callout.contributions || []) { + let docLike; + if (!!contribution.link) { + docLike = contribution.link; + } else if (!!contribution.post) { + docLike = contribution.post; + } + + if (docLike) { const { pageContent, documentId, source, type, title } = generateDocument(docLike); result.push( @@ -71,8 +72,8 @@ export const baseHandler = async ( }, }) ); - }) - .join('\n'); + } + } logger.info( `Documents for Callout (${documentId}) generated. # of documents ${result.length}` diff --git a/src/callout.handlers/link.collection.ts b/src/callout.handlers/link.collection.ts index 609f115..d282622 100644 --- a/src/callout.handlers/link.collection.ts +++ b/src/callout.handlers/link.collection.ts @@ -56,7 +56,9 @@ const downloadDocument = async ( }; const fileLoaderFactories: { - [key in MimeType]?: (path: string) => BaseDocumentLoader; + [key in MimeType]?: ( + path: string + ) => BaseDocumentLoader | PDFLoader | DocxLoader; } = { [MimeType.Pdf]: (path: string) => new PDFLoader(path, { splitPages: false }), diff --git a/src/embed.ts b/src/embed.ts index ae4079b..9400a55 100644 --- a/src/embed.ts +++ b/src/embed.ts @@ -7,6 +7,7 @@ import { dbConnect } from './db.connect'; import { Metadata } from 'chromadb'; import { DocumentType } from './document.type'; import { BATCH_SIZE, CHUNK_OVERLAP, CHUNK_SIZE } from './constants'; +import { summarize } from './summarize'; const batch = (arr: T[], size: number): Array> => Array.from({ length: Math.ceil(arr.length / size) }, (_, i) => @@ -63,6 +64,19 @@ export default async ( doc.metadata.documentId }) of type ${doc.metadata.type}; # of chunks: ${splitted.length}` ); + if (doc.metadata.documentId === 'd6d95bc5-a0a0-4c1c-b8bd-c5c04d5ee17b') { + try { + await summarize(doc, splitted); + } catch (err) { + console.log(err); + return false; + throw err; + } + + console.log('\n\n\n'); + console.log(doc.pageContent); + console.log('\n\n\n'); + } splitted.forEach((chunk, chunkIndex) => { ids.push( diff --git a/src/space.embed/embed.space.ts b/src/space.embed/embed.space.ts index d329e7a..d22f469 100644 --- a/src/space.embed/embed.space.ts +++ b/src/space.embed/embed.space.ts @@ -71,6 +71,7 @@ export const embedSpace = async (event: IngestSpace) => { try { embeddingResult = await embed(space.id, documents, purpose); } catch (error) { + console.log(error); return setResultError( resultEvent, 'Failed to insert embeddings.', diff --git a/src/summarize.ts b/src/summarize.ts new file mode 100644 index 0000000..6805881 --- /dev/null +++ b/src/summarize.ts @@ -0,0 +1,13 @@ +import { ChatMistralAI, MistralAI } from '@langchain/mistralai'; +import { Document } from 'langchain/document'; + +export const summarize = async (_doc: Document, _chunks: Document[]) => { + const model = new ChatMistralAI({ + apiKey: 'TaecSvOH6awhFBJ5vjg0kSYiZ11f2dhN', + endpoint: + 'https://mistral-small-alkemio-serverless.swedencentral.inference.ai.azure.com', + maxRetries: 1, + }); + const resp = await model.invoke('tell me about Levski Sofia'); + console.log(resp); +}; From fe4d1d09fc8a2c0aafe4e75f54d06b3922a07585 Mon Sep 17 00:00:00 2001 From: Valentin Yanakiev Date: Mon, 21 Oct 2024 10:26:16 +0300 Subject: [PATCH 2/8] Remove redundant builds --- .../workflows/build-deploy-k8s-dev-azure.yml | 61 ------------------- .../build-deploy-k8s-sandbox-azure.yml | 59 ------------------ .../workflows/build-deploy-k8s-test-azure.yml | 60 ------------------ 3 files changed, 180 deletions(-) delete mode 100644 .github/workflows/build-deploy-k8s-dev-azure.yml delete mode 100644 .github/workflows/build-deploy-k8s-sandbox-azure.yml delete mode 100644 .github/workflows/build-deploy-k8s-test-azure.yml diff --git a/.github/workflows/build-deploy-k8s-dev-azure.yml b/.github/workflows/build-deploy-k8s-dev-azure.yml deleted file mode 100644 index 60b58b2..0000000 --- a/.github/workflows/build-deploy-k8s-dev-azure.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: Build, Migrate & Deploy to Dev - -on: - push: - branches: [develop] - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login into ACR" - uses: azure/docker-login@v1.0.1 - with: - login-server: ${{ secrets.REGISTRY_LOGIN_SERVER }} - username: ${{ secrets.REGISTRY_USERNAME }} - password: ${{ secrets.REGISTRY_PASSWORD }} - - - name: "Build & Push image" - run: | - docker build -f Dockerfile . -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:latest - docker push ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - - deploy: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login via Azure CLI" - uses: azure/login@v1.4.7 - with: - creds: ${{ secrets.AZURE_CRED_K8S_NEW }} - - - uses: Azure/aks-set-context@v3.2 - with: - cluster-name: ${{ secrets.CLUSTER_NAME }} - resource-group: ${{ secrets.RESOURCE_GROUP_K8S }} - - - uses: Azure/k8s-create-secret@v4.0 - with: - container-registry-url: ${{ secrets.REGISTRY_LOGIN_SERVER }} - container-registry-username: ${{ secrets.REGISTRY_USERNAME }} - container-registry-password: ${{ secrets.REGISTRY_PASSWORD }} - secret-name: alkemio-virtual-contributor-ingest-space-secret - - - uses: azure/setup-kubectl@v3.2 - with: - version: "v1.22.0" # default is latest stable, fixing it to a compatible version - id: install - - - uses: Azure/k8s-deploy@v4.10 - with: - manifests: | - manifests/25-virtual-contributor-ingest-space-deployment-dev.yml - images: | - ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - imagepullsecrets: | - alkemio-virtual-contributor-ingest-space-secret diff --git a/.github/workflows/build-deploy-k8s-sandbox-azure.yml b/.github/workflows/build-deploy-k8s-sandbox-azure.yml deleted file mode 100644 index 338a3c2..0000000 --- a/.github/workflows/build-deploy-k8s-sandbox-azure.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Build, Migrate & Deploy to Sandbox on Azure - -on: - workflow_dispatch: - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login into ACR" - uses: azure/docker-login@v1.0.1 - with: - login-server: ${{ secrets.REGISTRY_LOGIN_SERVER }} - username: ${{ secrets.REGISTRY_USERNAME }} - password: ${{ secrets.REGISTRY_PASSWORD }} - - - name: "Build & Push image" - run: | - docker build -f Dockerfile . -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:latest - docker push ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - deploy: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login via Azure CLI" - uses: azure/login@v1.4.7 - with: - creds: ${{ secrets.AZURE_CRED_K8S_NEW }} - - - uses: Azure/aks-set-context@v3.2 - with: - cluster-name: k8s-sandbox - resource-group: res-grp-k8s-sandbox - - - uses: Azure/k8s-create-secret@v4.0 - with: - container-registry-url: ${{ secrets.REGISTRY_LOGIN_SERVER }} - container-registry-username: ${{ secrets.REGISTRY_USERNAME }} - container-registry-password: ${{ secrets.REGISTRY_PASSWORD }} - secret-name: alkemio-virtual-contributor-ingest-space-secret - - - uses: azure/setup-kubectl@v3.2 - with: - version: "v1.22.0" # default is latest stable, fixing it to a compatible version - id: install - - - uses: Azure/k8s-deploy@v4.10 - with: - manifests: | - manifests/25-genai-deployment-dev.yaml - images: | - ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - imagepullsecrets: | - alkemio-virtual-contributor-ingest-space-secret diff --git a/.github/workflows/build-deploy-k8s-test-azure.yml b/.github/workflows/build-deploy-k8s-test-azure.yml deleted file mode 100644 index 448bedd..0000000 --- a/.github/workflows/build-deploy-k8s-test-azure.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: Build, Migrate & Deploy to Test on Azure - -on: - workflow_dispatch: - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login into ACR" - uses: azure/docker-login@v1.0.1 - with: - login-server: ${{ secrets.REGISTRY_LOGIN_SERVER }} - username: ${{ secrets.REGISTRY_USERNAME }} - password: ${{ secrets.REGISTRY_PASSWORD }} - - - name: "Build & Push image" - run: | - docker build -f Dockerfile . -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:latest - docker push ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - - deploy: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login via Azure CLI" - uses: azure/login@v1.4.7 - with: - creds: ${{ secrets.AZURE_CRED_K8S_NEW }} - - - uses: Azure/aks-set-context@v3.2 - with: - cluster-name: k8s-test - resource-group: res-grp-k8s-test - - - uses: Azure/k8s-create-secret@v4.0 - with: - container-registry-url: ${{ secrets.REGISTRY_LOGIN_SERVER }} - container-registry-username: ${{ secrets.REGISTRY_USERNAME }} - container-registry-password: ${{ secrets.REGISTRY_PASSWORD }} - secret-name: alkemio-virtual-contributor-ingest-space-secret - - - uses: azure/setup-kubectl@v3.2 - with: - version: "v1.22.0" # default is latest stable, fixing it to a compatible version - id: install - - - uses: Azure/k8s-deploy@v4.10 - with: - manifests: | - manifests/25-genai-deployment-dev.yaml - images: | - ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - imagepullsecrets: | - alkemio-virtual-contributor-ingest-space-secret From 7edb1401cb0f57acf3541524161a0ef4407858d5 Mon Sep 17 00:00:00 2001 From: Vladimir Aleksiev Date: Tue, 22 Oct 2024 22:28:09 +0300 Subject: [PATCH 3/8] implement document and BoK summaries --- .env.default | 2 + src/embed.ts | 48 ++++++++++++++------- src/logger.ts | 5 ++- src/summarize.ts | 13 ------ src/summarize/body.of.knowledge.ts | 39 +++++++++++++++++ src/summarize/document.ts | 39 +++++++++++++++++ src/summarize/graph.ts | 67 ++++++++++++++++++++++++++++++ 7 files changed, 183 insertions(+), 30 deletions(-) delete mode 100644 src/summarize.ts create mode 100644 src/summarize/body.of.knowledge.ts create mode 100644 src/summarize/document.ts create mode 100644 src/summarize/graph.ts diff --git a/.env.default b/.env.default index dac4925..103e9b8 100644 --- a/.env.default +++ b/.env.default @@ -24,4 +24,6 @@ VECTOR_DB_PORT=8765 CHUNK_SIZE=1000 CHUNK_OVERLAP=100 +SUMMARY_LENGTH=10000 + BATCH_SIZE=20 diff --git a/src/embed.ts b/src/embed.ts index 32dad43..06aa066 100644 --- a/src/embed.ts +++ b/src/embed.ts @@ -7,7 +7,9 @@ import { dbConnect } from './db.connect'; import { Metadata } from 'chromadb'; import { DocumentType } from './document.type'; import { BATCH_SIZE, CHUNK_OVERLAP, CHUNK_SIZE } from './constants'; -import { summarize } from './summarize'; +import { summarizeDocument } from './summarize/document'; +import { summariseBodyOfKnowledge } from './summarize/body.of.knowledge'; +import { summaryLength } from './summarize/graph'; const batch = (arr: T[], size: number): Array> => Array.from({ length: Math.ceil(arr.length / size) }, (_, i) => @@ -47,6 +49,8 @@ export default async ( const documents: string[] = []; const metadatas: Array = []; + const summaries: string[] = []; + logger.info(`Splitting documents for space: ${spaceID}`); for (let docIndex = 0; docIndex < docs.length; docIndex++) { @@ -64,29 +68,43 @@ export default async ( doc.metadata.documentId }) of type ${doc.metadata.type}; # of chunks: ${splitted.length}` ); - if (doc.metadata.documentId === 'd6d95bc5-a0a0-4c1c-b8bd-c5c04d5ee17b') { - try { - await summarize(doc, splitted); - } catch (err) { - console.log(err); - return false; - throw err; - } - - console.log('\n\n\n'); - console.log(doc.pageContent); - console.log('\n\n\n'); - } splitted.forEach((chunk, chunkIndex) => { ids.push( `${chunk.metadata.documentId}-${chunk.metadata.type}-chunk${chunkIndex}` ); documents.push(chunk.pageContent); - metadatas.push({ ...chunk.metadata, chunkIndex }); + metadatas.push({ ...chunk.metadata, embeddingType: 'chunk', chunkIndex }); }); + + if (doc.pageContent.length > summaryLength) { + try { + const documentSummary = await summarizeDocument(splitted); + ids.push(`${doc.metadata.documentId}-${doc.metadata.type}-summary`); + documents.push(documentSummary); + metadatas.push({ ...doc.metadata, embeddingType: 'summary' }); + + summaries.push(documentSummary); + } catch (err) { + console.log(err); + return false; + } + } } + const bokDescriptions = new Document({ pageContent: summaries.join('\n') }); + const bokChunks = await splitter.splitDocuments([bokDescriptions]); + + const bokSummary = await summariseBodyOfKnowledge(bokChunks); + ids.push('body-of-knowledge-summary'); + documents.push(bokSummary); + metadatas.push({ + documentId: spaceID, + soruce: 'spaceurl', + type: 'bodyOfKnowledgeSummary', + title: 'space name', + }); + logger.info('Connecting to Chroma...'); const client = dbConnect(); const heartbeat = await client.heartbeat(); diff --git a/src/logger.ts b/src/logger.ts index 02f5c0f..1bc96b2 100644 --- a/src/logger.ts +++ b/src/logger.ts @@ -5,8 +5,9 @@ const logger = winston.createLogger({ // format: winston.format.json(), format: winston.format.combine( winston.format.errors({ stack: true }), - winston.format.json(), - winston.format.prettyPrint() + // winston.format.json(), + winston.format.simple() + // winston.format.prettyPrint() ), defaultMeta: { service: 'space-ingest' }, diff --git a/src/summarize.ts b/src/summarize.ts deleted file mode 100644 index 6805881..0000000 --- a/src/summarize.ts +++ /dev/null @@ -1,13 +0,0 @@ -import { ChatMistralAI, MistralAI } from '@langchain/mistralai'; -import { Document } from 'langchain/document'; - -export const summarize = async (_doc: Document, _chunks: Document[]) => { - const model = new ChatMistralAI({ - apiKey: 'TaecSvOH6awhFBJ5vjg0kSYiZ11f2dhN', - endpoint: - 'https://mistral-small-alkemio-serverless.swedencentral.inference.ai.azure.com', - maxRetries: 1, - }); - const resp = await model.invoke('tell me about Levski Sofia'); - console.log(resp); -}; diff --git a/src/summarize/body.of.knowledge.ts b/src/summarize/body.of.knowledge.ts new file mode 100644 index 0000000..76dd411 --- /dev/null +++ b/src/summarize/body.of.knowledge.ts @@ -0,0 +1,39 @@ +import { ChatPromptTemplate } from '@langchain/core/prompts'; +import { Document } from 'langchain/document'; +import { buildGraph } from './graph'; + +const summarizePrompt = ChatPromptTemplate.fromMessages([ + [ + 'system', + `You are tasked with concising summaries send by a user based entierly on the user input. While doing so + preserve as much information as possible like names, references titles, dates, etc.`, + ], + [ + 'human', + 'Write a detailed summary, no more than {summaryLength} characters of the following: {context}', + ], +]); +const refinePrompt = ChatPromptTemplate.fromMessages([ + [ + 'system', + `You are tasked with concising summaries send by a user based entierly on the user input. While doing so + preserve as much information as possible like names, references titles, dates, etc.`, + ], + [ + 'human', + `Produce a final detailed summary, no more than {summaryLength} characters. + Existing summary up to this point: + + {currentSummary} + + New context: {context} + + Given the new context, refine the original summary.`, + ], +]); + +export const summariseBodyOfKnowledge = async (chunks: Document[]) => { + const graph = buildGraph(summarizePrompt, refinePrompt); + const final = await graph.invoke({ chunks }); + return final.summary; +}; diff --git a/src/summarize/document.ts b/src/summarize/document.ts new file mode 100644 index 0000000..5f3e1e7 --- /dev/null +++ b/src/summarize/document.ts @@ -0,0 +1,39 @@ +import { ChatPromptTemplate } from '@langchain/core/prompts'; +import { Document } from 'langchain/document'; +import { buildGraph } from './graph'; + +const summarizePrompt = ChatPromptTemplate.fromMessages([ + // the system message is the same for both prompts but putting it into a variable and typing it is too hard... + [ + 'system', + `You are tasked with summarizing text documents that might include conversation transcripts, articles, novels and other. + In your summary preserve as much information as possible, including refereces, names of the participants, titles, dates, etc.`, + ], + [ + 'human', + 'Write a detailed summary, no more than {summaryLength} characters of the following: {context}', + ], +]); +const refinePrompt = ChatPromptTemplate.fromMessages([ + [ + 'system', + `You are tasked with summarizing text documents that might include conversation transcripts, articles, novels and other. + In your summary preserve as much information as possible, including refereces, names of the participants, titles, etc.`, + ], + [ + 'human', + `Produce a final detailed summary, no more than {summaryLength} characters. + Existing summary up to this point: + {currentSummary} + + New context: {context} + + Given the new context, refine the original summary.`, + ], +]); + +export const summarizeDocument = async (chunks: Document[]) => { + const graph = buildGraph(summarizePrompt, refinePrompt); + const final = await graph.invoke({ chunks }); + return final.summary; +}; diff --git a/src/summarize/graph.ts b/src/summarize/graph.ts new file mode 100644 index 0000000..889fe8c --- /dev/null +++ b/src/summarize/graph.ts @@ -0,0 +1,67 @@ +import { ChatMistralAI } from '@langchain/mistralai'; +import { Annotation, END, START, StateGraph } from '@langchain/langgraph'; +import { Document } from 'langchain/document'; +import { ChatPromptTemplate } from '@langchain/core/prompts'; + +export const summaryLength = parseInt(process.env.SUMMARY_LENGTH || '10000'); + +const apiKey = process.env.AZURE_MISTRAL_API_KEY; +const endpoint = process.env.AZURE_MISTRAL_ENDPOINT; + +const model = new ChatMistralAI({ + apiKey, + endpoint, + maxRetries: 1, +}); + +export const buildGraph = ( + summarizePrompt: ChatPromptTemplate, + refinePrompt: ChatPromptTemplate +) => { + const summaryChain = summarizePrompt.pipe(model); + const refineChain = refinePrompt.pipe(model); + + const SummarizeAnnotation = Annotation.Root({ + chunks: Annotation(), + index: Annotation(), + summary: Annotation(), + }); + + const initialSummary = async (input: typeof SummarizeAnnotation.State) => { + const context = input.chunks[0].pageContent; + const summary = await summaryChain.invoke({ context, summaryLength }); + return { summary: summary.content, index: 1 }; + }; + + const refineSummary = async (input: typeof SummarizeAnnotation.State) => { + const context = input.chunks[input.index].pageContent; + const currentSummary = input.summary; + const summary = await refineChain.invoke({ + currentSummary, + context, + summaryLength, + }); + + return { + summary: summary.content, + index: input.index + 1, + }; + }; + + const shouldRefine = (input: typeof SummarizeAnnotation.State) => { + if (input.index >= input.chunks.length) { + return END; + } + return 'refineSummary'; + }; + + const graph = new StateGraph(SummarizeAnnotation) + .addNode('initialSummary', initialSummary) + .addNode('refineSummary', refineSummary) + .addEdge(START, 'initialSummary') + .addConditionalEdges('initialSummary', shouldRefine, ['refineSummary', END]) + .addConditionalEdges('refineSummary', shouldRefine, ['refineSummary', END]) + .compile(); + + return graph; +}; From c17855e840eaacb8ea51f38f5655721a66846631 Mon Sep 17 00:00:00 2001 From: Vladimir Aleksiev Date: Tue, 22 Oct 2024 22:30:19 +0300 Subject: [PATCH 4/8] revert logger changes --- src/logger.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/logger.ts b/src/logger.ts index 1bc96b2..02f5c0f 100644 --- a/src/logger.ts +++ b/src/logger.ts @@ -5,9 +5,8 @@ const logger = winston.createLogger({ // format: winston.format.json(), format: winston.format.combine( winston.format.errors({ stack: true }), - // winston.format.json(), - winston.format.simple() - // winston.format.prettyPrint() + winston.format.json(), + winston.format.prettyPrint() ), defaultMeta: { service: 'space-ingest' }, From fea4a0a3555f904634dd86dfa35afb4fb6257d61 Mon Sep 17 00:00:00 2001 From: Vladimir Aleksiev Date: Tue, 22 Oct 2024 22:31:19 +0300 Subject: [PATCH 5/8] remove console.log --- src/space.embed/embed.space.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/space.embed/embed.space.ts b/src/space.embed/embed.space.ts index d22f469..c64ca26 100644 --- a/src/space.embed/embed.space.ts +++ b/src/space.embed/embed.space.ts @@ -71,7 +71,7 @@ export const embedSpace = async (event: IngestSpace) => { try { embeddingResult = await embed(space.id, documents, purpose); } catch (error) { - console.log(error); + logger.error(error); return setResultError( resultEvent, 'Failed to insert embeddings.', From 8084918c05a8e60dd2d02ebbfaee892a641ffe8d Mon Sep 17 00:00:00 2001 From: Vladimir Aleksiev Date: Wed, 23 Oct 2024 11:31:56 +0300 Subject: [PATCH 6/8] address coderabbit comments --- src/callout.handlers/base.ts | 4 ++-- src/embed.ts | 15 +++++++----- src/space.embed/embed.space.ts | 3 ++- src/summarize/body.of.knowledge.ts | 36 ++++++++++++++--------------- src/summarize/document.ts | 37 ++++++++++++++---------------- src/summarize/graph.ts | 12 +++++++++- 6 files changed, 58 insertions(+), 49 deletions(-) diff --git a/src/callout.handlers/base.ts b/src/callout.handlers/base.ts index 561e2d2..fcba0cc 100644 --- a/src/callout.handlers/base.ts +++ b/src/callout.handlers/base.ts @@ -52,9 +52,9 @@ export const baseHandler = async ( for (const contribution of callout.contributions || []) { let docLike; - if (!!contribution.link) { + if (contribution.link) { docLike = contribution.link; - } else if (!!contribution.post) { + } else if (contribution.post) { docLike = contribution.post; } diff --git a/src/embed.ts b/src/embed.ts index 06aa066..90b7a48 100644 --- a/src/embed.ts +++ b/src/embed.ts @@ -10,6 +10,7 @@ import { BATCH_SIZE, CHUNK_OVERLAP, CHUNK_SIZE } from './constants'; import { summarizeDocument } from './summarize/document'; import { summariseBodyOfKnowledge } from './summarize/body.of.knowledge'; import { summaryLength } from './summarize/graph'; +import { Space, Profile } from '@alkemio/client-lib'; const batch = (arr: T[], size: number): Array> => Array.from({ length: Math.ceil(arr.length / size) }, (_, i) => @@ -17,10 +18,11 @@ const batch = (arr: T[], size: number): Array> => ); export default async ( - spaceID: string, + space: Pick & { profile: Pick }, docs: Document[], purpose: SpaceIngestionPurpose ) => { + const spaceID = space.id; logger.defaultMeta.spaceId = spaceID; const endpoint = process.env.AZURE_OPENAI_ENDPOINT; @@ -86,23 +88,24 @@ export default async ( summaries.push(documentSummary); } catch (err) { - console.log(err); - return false; + logger.error(err); } + } else { + summaries.push(doc.pageContent); } } const bokDescriptions = new Document({ pageContent: summaries.join('\n') }); const bokChunks = await splitter.splitDocuments([bokDescriptions]); - const bokSummary = await summariseBodyOfKnowledge(bokChunks); ids.push('body-of-knowledge-summary'); documents.push(bokSummary); + metadatas.push({ documentId: spaceID, - soruce: 'spaceurl', + soruce: space.profile.url, type: 'bodyOfKnowledgeSummary', - title: 'space name', + title: space.profile?.displayName, }); logger.info('Connecting to Chroma...'); diff --git a/src/space.embed/embed.space.ts b/src/space.embed/embed.space.ts index c64ca26..a141aa0 100644 --- a/src/space.embed/embed.space.ts +++ b/src/space.embed/embed.space.ts @@ -26,6 +26,7 @@ const setResultError = ( ).getTime(); return result; }; + export const embedSpace = async (event: IngestSpace) => { const resultEvent = new IngestSpaceResult( event.spaceId, @@ -69,7 +70,7 @@ export const embedSpace = async (event: IngestSpace) => { ); let embeddingResult = false; try { - embeddingResult = await embed(space.id, documents, purpose); + embeddingResult = await embed(space, documents, purpose); } catch (error) { logger.error(error); return setResultError( diff --git a/src/summarize/body.of.knowledge.ts b/src/summarize/body.of.knowledge.ts index 76dd411..41d022c 100644 --- a/src/summarize/body.of.knowledge.ts +++ b/src/summarize/body.of.knowledge.ts @@ -1,26 +1,24 @@ -import { ChatPromptTemplate } from '@langchain/core/prompts'; +import { + SystemMessagePromptTemplate, + HumanMessagePromptTemplate, + ChatPromptTemplate, +} from '@langchain/core/prompts'; import { Document } from 'langchain/document'; import { buildGraph } from './graph'; +const systemMessage = SystemMessagePromptTemplate.fromTemplate( + 'You are tasked with concising summaries based entirely on the user input. While doing so preserve as much information as possible like names, references titles, dates, etc.' +); + const summarizePrompt = ChatPromptTemplate.fromMessages([ - [ - 'system', - `You are tasked with concising summaries send by a user based entierly on the user input. While doing so - preserve as much information as possible like names, references titles, dates, etc.`, - ], - [ - 'human', - 'Write a detailed summary, no more than {summaryLength} characters of the following: {context}', - ], + systemMessage, + HumanMessagePromptTemplate.fromTemplate( + 'Write a detailed summary, no more than {summaryLength} characters of the following: {context}' + ), ]); const refinePrompt = ChatPromptTemplate.fromMessages([ - [ - 'system', - `You are tasked with concising summaries send by a user based entierly on the user input. While doing so - preserve as much information as possible like names, references titles, dates, etc.`, - ], - [ - 'human', + systemMessage, + HumanMessagePromptTemplate.fromTemplate( `Produce a final detailed summary, no more than {summaryLength} characters. Existing summary up to this point: @@ -28,8 +26,8 @@ const refinePrompt = ChatPromptTemplate.fromMessages([ New context: {context} - Given the new context, refine the original summary.`, - ], + Given the new context, refine the original summary.` + ), ]); export const summariseBodyOfKnowledge = async (chunks: Document[]) => { diff --git a/src/summarize/document.ts b/src/summarize/document.ts index 5f3e1e7..4db8137 100644 --- a/src/summarize/document.ts +++ b/src/summarize/document.ts @@ -1,35 +1,32 @@ -import { ChatPromptTemplate } from '@langchain/core/prompts'; +import { + SystemMessagePromptTemplate, + HumanMessagePromptTemplate, + ChatPromptTemplate, +} from '@langchain/core/prompts'; import { Document } from 'langchain/document'; import { buildGraph } from './graph'; +const systemMessage = SystemMessagePromptTemplate.fromTemplate( + `You are tasked with summarizing text documents that might include conversation transcripts, articles, novels and other. + In your summary preserve as much information as possible, including references, names of the participants, titles, dates, etc.` +); const summarizePrompt = ChatPromptTemplate.fromMessages([ - // the system message is the same for both prompts but putting it into a variable and typing it is too hard... - [ - 'system', - `You are tasked with summarizing text documents that might include conversation transcripts, articles, novels and other. - In your summary preserve as much information as possible, including refereces, names of the participants, titles, dates, etc.`, - ], - [ - 'human', - 'Write a detailed summary, no more than {summaryLength} characters of the following: {context}', - ], + systemMessage, + HumanMessagePromptTemplate.fromTemplate( + 'Write a detailed summary, no more than {summaryLength} characters of the following: {context}' + ), ]); const refinePrompt = ChatPromptTemplate.fromMessages([ - [ - 'system', - `You are tasked with summarizing text documents that might include conversation transcripts, articles, novels and other. - In your summary preserve as much information as possible, including refereces, names of the participants, titles, etc.`, - ], - [ - 'human', + systemMessage, + HumanMessagePromptTemplate.fromTemplate( `Produce a final detailed summary, no more than {summaryLength} characters. Existing summary up to this point: {currentSummary} New context: {context} - Given the new context, refine the original summary.`, - ], + Given the new context, refine the original summary.` + ), ]); export const summarizeDocument = async (chunks: Document[]) => { diff --git a/src/summarize/graph.ts b/src/summarize/graph.ts index 889fe8c..552046e 100644 --- a/src/summarize/graph.ts +++ b/src/summarize/graph.ts @@ -3,11 +3,21 @@ import { Annotation, END, START, StateGraph } from '@langchain/langgraph'; import { Document } from 'langchain/document'; import { ChatPromptTemplate } from '@langchain/core/prompts'; -export const summaryLength = parseInt(process.env.SUMMARY_LENGTH || '10000'); +export const summaryLength = parseInt( + process.env.SUMMARY_LENGTH || '10000', + 10 +); const apiKey = process.env.AZURE_MISTRAL_API_KEY; const endpoint = process.env.AZURE_MISTRAL_ENDPOINT; +if (!apiKey) { + throw new Error('AZURE_MISTRAL_API_KEY environment variable is not set.'); +} +if (!endpoint) { + throw new Error('AZURE_MISTRAL_ENDPOINT environment variable is not set.'); +} + const model = new ChatMistralAI({ apiKey, endpoint, From 85ceb2f7529aa1adddf03ed8808b14bd2b617a59 Mon Sep 17 00:00:00 2001 From: Vladimir Aleksiev Date: Wed, 23 Oct 2024 11:32:40 +0300 Subject: [PATCH 7/8] bump version to 0.10.0 --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index 97689f7..d106481 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@alkemio/space-ingest", - "version": "0.9.2", + "version": "0.10.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@alkemio/space-ingest", - "version": "0.9.2", + "version": "0.10.0", "license": "EUPL-1.2", "dependencies": { "@alkemio/client-lib": "^0.31.0", diff --git a/package.json b/package.json index 533e014..c5ba1f7 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@alkemio/space-ingest", - "version": "0.9.2", + "version": "0.10.0", "description": "", "author": "Alkemio Foundation", "private": true, From 3051143cc392eed92e7c1c9a8c2b527cef4f569a Mon Sep 17 00:00:00 2001 From: Vladimir Aleksiev Date: Wed, 23 Oct 2024 12:22:21 +0300 Subject: [PATCH 8/8] fix typos and improve prompts --- src/embed.ts | 2 +- src/summarize/document.ts | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/embed.ts b/src/embed.ts index 90b7a48..29da975 100644 --- a/src/embed.ts +++ b/src/embed.ts @@ -103,7 +103,7 @@ export default async ( metadatas.push({ documentId: spaceID, - soruce: space.profile.url, + source: space.profile.url, type: 'bodyOfKnowledgeSummary', title: space.profile?.displayName, }); diff --git a/src/summarize/document.ts b/src/summarize/document.ts index 4db8137..d7d3bf5 100644 --- a/src/summarize/document.ts +++ b/src/summarize/document.ts @@ -6,8 +6,12 @@ import { import { Document } from 'langchain/document'; import { buildGraph } from './graph'; const systemMessage = SystemMessagePromptTemplate.fromTemplate( - `You are tasked with summarizing text documents that might include conversation transcripts, articles, novels and other. - In your summary preserve as much information as possible, including references, names of the participants, titles, dates, etc.` + `In your summary preserve as much information as possible, including: + - References and connections between documents + - Names of participants and their roles + - Titles, dates, and temporal relationships + - Key concepts and their relationships within the body of knowledge + Focus on maintaining the coherence of information across document boundaries.` ); const summarizePrompt = ChatPromptTemplate.fromMessages([