diff --git a/.env.default b/.env.default index dac4925..103e9b8 100644 --- a/.env.default +++ b/.env.default @@ -24,4 +24,6 @@ VECTOR_DB_PORT=8765 CHUNK_SIZE=1000 CHUNK_OVERLAP=100 +SUMMARY_LENGTH=10000 + BATCH_SIZE=20 diff --git a/.github/workflows/build-deploy-k8s-dev-azure.yml b/.github/workflows/build-deploy-k8s-dev-azure.yml deleted file mode 100644 index 60b58b2..0000000 --- a/.github/workflows/build-deploy-k8s-dev-azure.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: Build, Migrate & Deploy to Dev - -on: - push: - branches: [develop] - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login into ACR" - uses: azure/docker-login@v1.0.1 - with: - login-server: ${{ secrets.REGISTRY_LOGIN_SERVER }} - username: ${{ secrets.REGISTRY_USERNAME }} - password: ${{ secrets.REGISTRY_PASSWORD }} - - - name: "Build & Push image" - run: | - docker build -f Dockerfile . -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:latest - docker push ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - - deploy: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login via Azure CLI" - uses: azure/login@v1.4.7 - with: - creds: ${{ secrets.AZURE_CRED_K8S_NEW }} - - - uses: Azure/aks-set-context@v3.2 - with: - cluster-name: ${{ secrets.CLUSTER_NAME }} - resource-group: ${{ secrets.RESOURCE_GROUP_K8S }} - - - uses: Azure/k8s-create-secret@v4.0 - with: - container-registry-url: ${{ secrets.REGISTRY_LOGIN_SERVER }} - container-registry-username: ${{ secrets.REGISTRY_USERNAME }} - container-registry-password: ${{ secrets.REGISTRY_PASSWORD }} - secret-name: alkemio-virtual-contributor-ingest-space-secret - - - uses: azure/setup-kubectl@v3.2 - with: - version: "v1.22.0" # default is latest stable, fixing it to a compatible version - id: install - - - uses: Azure/k8s-deploy@v4.10 - with: - manifests: | - manifests/25-virtual-contributor-ingest-space-deployment-dev.yml - images: | - ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - imagepullsecrets: | - alkemio-virtual-contributor-ingest-space-secret diff --git a/.github/workflows/build-deploy-k8s-sandbox-azure.yml b/.github/workflows/build-deploy-k8s-sandbox-azure.yml deleted file mode 100644 index 338a3c2..0000000 --- a/.github/workflows/build-deploy-k8s-sandbox-azure.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Build, Migrate & Deploy to Sandbox on Azure - -on: - workflow_dispatch: - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login into ACR" - uses: azure/docker-login@v1.0.1 - with: - login-server: ${{ secrets.REGISTRY_LOGIN_SERVER }} - username: ${{ secrets.REGISTRY_USERNAME }} - password: ${{ secrets.REGISTRY_PASSWORD }} - - - name: "Build & Push image" - run: | - docker build -f Dockerfile . -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:latest - docker push ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - deploy: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login via Azure CLI" - uses: azure/login@v1.4.7 - with: - creds: ${{ secrets.AZURE_CRED_K8S_NEW }} - - - uses: Azure/aks-set-context@v3.2 - with: - cluster-name: k8s-sandbox - resource-group: res-grp-k8s-sandbox - - - uses: Azure/k8s-create-secret@v4.0 - with: - container-registry-url: ${{ secrets.REGISTRY_LOGIN_SERVER }} - container-registry-username: ${{ secrets.REGISTRY_USERNAME }} - container-registry-password: ${{ secrets.REGISTRY_PASSWORD }} - secret-name: alkemio-virtual-contributor-ingest-space-secret - - - uses: azure/setup-kubectl@v3.2 - with: - version: "v1.22.0" # default is latest stable, fixing it to a compatible version - id: install - - - uses: Azure/k8s-deploy@v4.10 - with: - manifests: | - manifests/25-genai-deployment-dev.yaml - images: | - ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - imagepullsecrets: | - alkemio-virtual-contributor-ingest-space-secret diff --git a/.github/workflows/build-deploy-k8s-test-azure.yml b/.github/workflows/build-deploy-k8s-test-azure.yml deleted file mode 100644 index 448bedd..0000000 --- a/.github/workflows/build-deploy-k8s-test-azure.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: Build, Migrate & Deploy to Test on Azure - -on: - workflow_dispatch: - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login into ACR" - uses: azure/docker-login@v1.0.1 - with: - login-server: ${{ secrets.REGISTRY_LOGIN_SERVER }} - username: ${{ secrets.REGISTRY_USERNAME }} - password: ${{ secrets.REGISTRY_PASSWORD }} - - - name: "Build & Push image" - run: | - docker build -f Dockerfile . -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} -t ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:latest - docker push ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - - deploy: - runs-on: ubuntu-latest - steps: - - name: "Checkout GitHub Action" - uses: actions/checkout@v3.0.2 - - - name: "Login via Azure CLI" - uses: azure/login@v1.4.7 - with: - creds: ${{ secrets.AZURE_CRED_K8S_NEW }} - - - uses: Azure/aks-set-context@v3.2 - with: - cluster-name: k8s-test - resource-group: res-grp-k8s-test - - - uses: Azure/k8s-create-secret@v4.0 - with: - container-registry-url: ${{ secrets.REGISTRY_LOGIN_SERVER }} - container-registry-username: ${{ secrets.REGISTRY_USERNAME }} - container-registry-password: ${{ secrets.REGISTRY_PASSWORD }} - secret-name: alkemio-virtual-contributor-ingest-space-secret - - - uses: azure/setup-kubectl@v3.2 - with: - version: "v1.22.0" # default is latest stable, fixing it to a compatible version - id: install - - - uses: Azure/k8s-deploy@v4.10 - with: - manifests: | - manifests/25-genai-deployment-dev.yaml - images: | - ${{ secrets.REGISTRY_LOGIN_SERVER }}/alkemio-virtual-contributor-ingest-space:${{ github.sha }} - imagepullsecrets: | - alkemio-virtual-contributor-ingest-space-secret diff --git a/package-lock.json b/package-lock.json index 95a99cf..d106481 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@alkemio/space-ingest", - "version": "0.9.2", + "version": "0.10.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@alkemio/space-ingest", - "version": "0.9.2", + "version": "0.10.0", "license": "EUPL-1.2", "dependencies": { "@alkemio/client-lib": "^0.31.0", @@ -15,6 +15,8 @@ "@graphql-codegen/typescript-graphql-request": "^4.5.3", "@graphql-codegen/typescript-operations": "^2.5.3", "@langchain/community": "^0.2.4", + "@langchain/langgraph": "^0.2.8", + "@langchain/mistralai": "^0.1.1", "@types/graphql-upload": "^8.0.11", "amqplib": "^0.10.4", "chromadb": "^1.8.1", @@ -3202,6 +3204,39 @@ } } }, + "node_modules/@langchain/community/node_modules/@langchain/core": { + "version": "0.2.34", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.34.tgz", + "integrity": "sha512-Hkveq1UcOjUj1DVn5erbqElyRj1t04NORSuSIZAJCtPO7EDkIqomjAarJ5+I5NUpQeIONgbOdnY9TkJ6cKUSVA==", + "dependencies": { + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.1.56-rc.1", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/community/node_modules/@langchain/core/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@langchain/community/node_modules/langchain": { "version": "0.2.3", "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.2.3.tgz", @@ -3447,20 +3482,20 @@ } }, "node_modules/@langchain/core": { - "version": "0.2.7", - "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.7.tgz", - "integrity": "sha512-FdFiNWhszFuUyAhYdY+l5DtPnAnWCAjXMnkLmUJ1J54NeUiUm7gy26Hnd4bkvaOQJ8ddHH/EX03ZwdoYfLv1jw==", + "version": "0.3.3", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.3.3.tgz", + "integrity": "sha512-WAtkmhbdl2T41qzimTzhb3pXCHQxO4onqxzPxgdf3KftQdTwLq0YYBDhozRMZLNAd/+cfH0ymZGaZSsnc9Ogsg==", + "peer": true, "dependencies": { "ansi-styles": "^5.0.0", "camelcase": "6", "decamelize": "1.2.0", "js-tiktoken": "^1.0.12", - "langsmith": "~0.1.30", - "ml-distance": "^4.0.0", + "langsmith": "^0.1.56", "mustache": "^4.2.0", "p-queue": "^6.6.2", "p-retry": "4", - "uuid": "^9.0.0", + "uuid": "^10.0.0", "zod": "^3.22.4", "zod-to-json-schema": "^3.22.3" }, @@ -3468,6 +3503,103 @@ "node": ">=18" } }, + "node_modules/@langchain/core/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "peer": true, + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@langchain/langgraph": { + "version": "0.2.8", + "resolved": "https://registry.npmjs.org/@langchain/langgraph/-/langgraph-0.2.8.tgz", + "integrity": "sha512-sQ3NqwZzdvILeiYQQCDCBFj+FLd3oBfg2sxMo3e5g7vd5+zd/hpK5+JRTHbsMZte0PTAlTbQ5YbfCC2D6K9AVw==", + "dependencies": { + "@langchain/langgraph-checkpoint": "~0.0.6", + "double-ended-queue": "^2.1.0-0", + "uuid": "^10.0.0", + "zod": "^3.23.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.31 <0.4.0" + } + }, + "node_modules/@langchain/langgraph-checkpoint": { + "version": "0.0.7", + "resolved": "https://registry.npmjs.org/@langchain/langgraph-checkpoint/-/langgraph-checkpoint-0.0.7.tgz", + "integrity": "sha512-D11m8143yn8O8FwinCxwxNF+1XFK/Au5rhp7ERBTJmaaojJk1N39TvSF/bvly7nNieKYh4hd0fqE6pnFGc228Q==", + "dependencies": { + "uuid": "^10.0.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.31 <0.4.0" + } + }, + "node_modules/@langchain/langgraph-checkpoint/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@langchain/langgraph/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@langchain/mistralai": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/@langchain/mistralai/-/mistralai-0.1.1.tgz", + "integrity": "sha512-gnHdQRfn+iBReKD0u1nydGqHgVOjnKHpd0Q2qEN61ZuxiqFOOauWYkrbyml7tzcOdMv2vUAr5+pjpXip+ez59w==", + "dependencies": { + "@mistralai/mistralai": "^0.4.0", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.4" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.21 <0.4.0" + } + }, + "node_modules/@langchain/mistralai/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@langchain/openai": { "version": "0.1.3", "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.1.3.tgz", @@ -3483,6 +3615,39 @@ "node": ">=18" } }, + "node_modules/@langchain/openai/node_modules/@langchain/core": { + "version": "0.2.34", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.34.tgz", + "integrity": "sha512-Hkveq1UcOjUj1DVn5erbqElyRj1t04NORSuSIZAJCtPO7EDkIqomjAarJ5+I5NUpQeIONgbOdnY9TkJ6cKUSVA==", + "dependencies": { + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.1.56-rc.1", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/openai/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@langchain/textsplitters": { "version": "0.0.3", "resolved": "https://registry.npmjs.org/@langchain/textsplitters/-/textsplitters-0.0.3.tgz", @@ -3495,6 +3660,47 @@ "node": ">=18" } }, + "node_modules/@langchain/textsplitters/node_modules/@langchain/core": { + "version": "0.2.34", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.34.tgz", + "integrity": "sha512-Hkveq1UcOjUj1DVn5erbqElyRj1t04NORSuSIZAJCtPO7EDkIqomjAarJ5+I5NUpQeIONgbOdnY9TkJ6cKUSVA==", + "dependencies": { + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.1.56-rc.1", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/textsplitters/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@mistralai/mistralai": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/@mistralai/mistralai/-/mistralai-0.4.0.tgz", + "integrity": "sha512-KmFzNro1RKxIFh19J3osmUQhucefBBauMXN5fa9doG6dT9OHR/moBvvn+riVlR7c0AVfuxO8Dfa03AyLYYzbyg==", + "dependencies": { + "node-fetch": "^2.6.7" + } + }, "node_modules/@nodelib/fs.scandir": { "version": "2.1.5", "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", @@ -3907,9 +4113,9 @@ "dev": true }, "node_modules/@types/uuid": { - "version": "9.0.8", - "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-9.0.8.tgz", - "integrity": "sha512-jg+97EGIcY9AGHJJRaaPVgetKDsrTgbRjQ5Msgjh/DQKEFl0DtyRr/VCOyD1T2R1MNeWPK/u7JoGhlDZnKBAfA==" + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==" }, "node_modules/@types/wrap-ansi": { "version": "3.0.0", @@ -5639,6 +5845,11 @@ "url": "https://dotenvx.com" } }, + "node_modules/double-ended-queue": { + "version": "2.1.0-0", + "resolved": "https://registry.npmjs.org/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz", + "integrity": "sha512-+BNfZ+deCo8hMNpDqDnvT+c0XpJ5cUa6mqYq89bho2Ifze4URTqRkcwR399hWoTrTkbZ/XJYDgP6rc7pRgffEQ==" + }, "node_modules/dset": { "version": "3.1.3", "resolved": "https://registry.npmjs.org/dset/-/dset-3.1.3.tgz", @@ -7810,34 +8021,60 @@ } } }, + "node_modules/langchain/node_modules/@langchain/core": { + "version": "0.2.34", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.34.tgz", + "integrity": "sha512-Hkveq1UcOjUj1DVn5erbqElyRj1t04NORSuSIZAJCtPO7EDkIqomjAarJ5+I5NUpQeIONgbOdnY9TkJ6cKUSVA==", + "dependencies": { + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.1.56-rc.1", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/langchain/node_modules/@langchain/core/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/langchainhub": { "version": "0.0.11", "resolved": "https://registry.npmjs.org/langchainhub/-/langchainhub-0.0.11.tgz", "integrity": "sha512-WnKI4g9kU2bHQP136orXr2bcRdgz9iiTBpTN0jWt9IlScUKnJBoD0aa2HOzHURQKeQDnt2JwqVmQ6Depf5uDLQ==" }, "node_modules/langsmith": { - "version": "0.1.32", - "resolved": "https://registry.npmjs.org/langsmith/-/langsmith-0.1.32.tgz", - "integrity": "sha512-EUWHIH6fiOCGRYdzgwGoXwJxCMyUrL+bmUcxoVmkXoXoAGDOVinz8bqJLKbxotsQWqM64NKKsW85OTIutgNaMQ==", + "version": "0.1.60", + "resolved": "https://registry.npmjs.org/langsmith/-/langsmith-0.1.60.tgz", + "integrity": "sha512-xchy/7PynZTkYXhismEYc+0XuDNDTzreKIyc/V3ohq4vnG79Iu+nPjDifvtICLHPCXTU8KSVno+PJX39XwhSjg==", "dependencies": { - "@types/uuid": "^9.0.1", + "@types/uuid": "^10.0.0", "commander": "^10.0.1", "p-queue": "^6.6.2", "p-retry": "4", - "uuid": "^9.0.0" + "semver": "^7.6.3", + "uuid": "^10.0.0" }, "peerDependencies": { - "@langchain/core": "*", - "langchain": "*", "openai": "*" }, "peerDependenciesMeta": { - "@langchain/core": { - "optional": true - }, - "langchain": { - "optional": true - }, "openai": { "optional": true } @@ -7851,6 +8088,18 @@ "node": ">=14" } }, + "node_modules/langsmith/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/levn": { "version": "0.4.1", "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", @@ -9590,9 +9839,9 @@ "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==" }, "node_modules/semver": { - "version": "7.6.2", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.2.tgz", - "integrity": "sha512-FNAIBWCx9qcRhoHcgcJ0gvU7SN1lYU2ZXuSfl04bSC5OpvDHFyJCjdNHomPXxjQlCBU67YW64PzY7/VIEH7F2w==", + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", "bin": { "semver": "bin/semver.js" }, diff --git a/package.json b/package.json index 6fe7636..c5ba1f7 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@alkemio/space-ingest", - "version": "0.9.2", + "version": "0.10.0", "description": "", "author": "Alkemio Foundation", "private": true, @@ -45,17 +45,19 @@ }, "dependencies": { "@alkemio/client-lib": "^0.31.0", - "@graphql-codegen/typescript-graphql-request": "^4.5.3", - "@graphql-codegen/typescript-operations": "^2.5.3", - "@types/graphql-upload": "^8.0.11", - "graphql": "^16.6.0", - "graphql-upload": "^16.0.1", "@azure/openai": "^1.0.0-beta.12", "@dotenvx/dotenvx": "^0.35.1", + "@graphql-codegen/typescript-graphql-request": "^4.5.3", + "@graphql-codegen/typescript-operations": "^2.5.3", "@langchain/community": "^0.2.4", + "@langchain/langgraph": "^0.2.8", + "@langchain/mistralai": "^0.1.1", + "@types/graphql-upload": "^8.0.11", "amqplib": "^0.10.4", "chromadb": "^1.8.1", "file-type": "^19.0.0", + "graphql": "^16.6.0", + "graphql-upload": "^16.0.1", "langchain": "^0.2.2", "mammoth": "^1.7.2", "officeparser": "^4.1.1", diff --git a/src/callout.handlers/base.ts b/src/callout.handlers/base.ts index 64e4846..fcba0cc 100644 --- a/src/callout.handlers/base.ts +++ b/src/callout.handlers/base.ts @@ -49,6 +49,7 @@ export const baseHandler = async ( ]; logger.info(`Generating documents for Callout (${documentId}) contributions`); + for (const contribution of callout.contributions || []) { let docLike; if (contribution.link) { @@ -56,6 +57,7 @@ export const baseHandler = async ( } else if (contribution.post) { docLike = contribution.post; } + if (docLike) { const { pageContent, documentId, source, type, title } = generateDocument(docLike); diff --git a/src/callout.handlers/link.collection.ts b/src/callout.handlers/link.collection.ts index 609f115..d282622 100644 --- a/src/callout.handlers/link.collection.ts +++ b/src/callout.handlers/link.collection.ts @@ -56,7 +56,9 @@ const downloadDocument = async ( }; const fileLoaderFactories: { - [key in MimeType]?: (path: string) => BaseDocumentLoader; + [key in MimeType]?: ( + path: string + ) => BaseDocumentLoader | PDFLoader | DocxLoader; } = { [MimeType.Pdf]: (path: string) => new PDFLoader(path, { splitPages: false }), diff --git a/src/embed.ts b/src/embed.ts index 43bcd6f..29da975 100644 --- a/src/embed.ts +++ b/src/embed.ts @@ -7,6 +7,10 @@ import { dbConnect } from './db.connect'; import { Metadata } from 'chromadb'; import { DocumentType } from './document.type'; import { BATCH_SIZE, CHUNK_OVERLAP, CHUNK_SIZE } from './constants'; +import { summarizeDocument } from './summarize/document'; +import { summariseBodyOfKnowledge } from './summarize/body.of.knowledge'; +import { summaryLength } from './summarize/graph'; +import { Space, Profile } from '@alkemio/client-lib'; const batch = (arr: T[], size: number): Array> => Array.from({ length: Math.ceil(arr.length / size) }, (_, i) => @@ -14,10 +18,11 @@ const batch = (arr: T[], size: number): Array> => ); export default async ( - spaceID: string, + space: Pick & { profile: Pick }, docs: Document[], purpose: SpaceIngestionPurpose ) => { + const spaceID = space.id; logger.defaultMeta.spaceId = spaceID; const endpoint = process.env.AZURE_OPENAI_ENDPOINT; @@ -46,6 +51,8 @@ export default async ( const documents: string[] = []; const metadatas: Array = []; + const summaries: string[] = []; + logger.info(`Splitting documents for space: ${spaceID}`); for (let docIndex = 0; docIndex < docs.length; docIndex++) { @@ -69,10 +76,38 @@ export default async ( `${chunk.metadata.documentId}-${chunk.metadata.type}-chunk${chunkIndex}` ); documents.push(chunk.pageContent); - metadatas.push({ ...chunk.metadata, chunkIndex }); + metadatas.push({ ...chunk.metadata, embeddingType: 'chunk', chunkIndex }); }); + + if (doc.pageContent.length > summaryLength) { + try { + const documentSummary = await summarizeDocument(splitted); + ids.push(`${doc.metadata.documentId}-${doc.metadata.type}-summary`); + documents.push(documentSummary); + metadatas.push({ ...doc.metadata, embeddingType: 'summary' }); + + summaries.push(documentSummary); + } catch (err) { + logger.error(err); + } + } else { + summaries.push(doc.pageContent); + } } + const bokDescriptions = new Document({ pageContent: summaries.join('\n') }); + const bokChunks = await splitter.splitDocuments([bokDescriptions]); + const bokSummary = await summariseBodyOfKnowledge(bokChunks); + ids.push('body-of-knowledge-summary'); + documents.push(bokSummary); + + metadatas.push({ + documentId: spaceID, + source: space.profile.url, + type: 'bodyOfKnowledgeSummary', + title: space.profile?.displayName, + }); + logger.info('Connecting to Chroma...'); const client = dbConnect(); const heartbeat = await client.heartbeat(); diff --git a/src/space.embed/embed.space.ts b/src/space.embed/embed.space.ts index d329e7a..a141aa0 100644 --- a/src/space.embed/embed.space.ts +++ b/src/space.embed/embed.space.ts @@ -26,6 +26,7 @@ const setResultError = ( ).getTime(); return result; }; + export const embedSpace = async (event: IngestSpace) => { const resultEvent = new IngestSpaceResult( event.spaceId, @@ -69,8 +70,9 @@ export const embedSpace = async (event: IngestSpace) => { ); let embeddingResult = false; try { - embeddingResult = await embed(space.id, documents, purpose); + embeddingResult = await embed(space, documents, purpose); } catch (error) { + logger.error(error); return setResultError( resultEvent, 'Failed to insert embeddings.', diff --git a/src/summarize/body.of.knowledge.ts b/src/summarize/body.of.knowledge.ts new file mode 100644 index 0000000..41d022c --- /dev/null +++ b/src/summarize/body.of.knowledge.ts @@ -0,0 +1,37 @@ +import { + SystemMessagePromptTemplate, + HumanMessagePromptTemplate, + ChatPromptTemplate, +} from '@langchain/core/prompts'; +import { Document } from 'langchain/document'; +import { buildGraph } from './graph'; + +const systemMessage = SystemMessagePromptTemplate.fromTemplate( + 'You are tasked with concising summaries based entirely on the user input. While doing so preserve as much information as possible like names, references titles, dates, etc.' +); + +const summarizePrompt = ChatPromptTemplate.fromMessages([ + systemMessage, + HumanMessagePromptTemplate.fromTemplate( + 'Write a detailed summary, no more than {summaryLength} characters of the following: {context}' + ), +]); +const refinePrompt = ChatPromptTemplate.fromMessages([ + systemMessage, + HumanMessagePromptTemplate.fromTemplate( + `Produce a final detailed summary, no more than {summaryLength} characters. + Existing summary up to this point: + + {currentSummary} + + New context: {context} + + Given the new context, refine the original summary.` + ), +]); + +export const summariseBodyOfKnowledge = async (chunks: Document[]) => { + const graph = buildGraph(summarizePrompt, refinePrompt); + const final = await graph.invoke({ chunks }); + return final.summary; +}; diff --git a/src/summarize/document.ts b/src/summarize/document.ts new file mode 100644 index 0000000..d7d3bf5 --- /dev/null +++ b/src/summarize/document.ts @@ -0,0 +1,40 @@ +import { + SystemMessagePromptTemplate, + HumanMessagePromptTemplate, + ChatPromptTemplate, +} from '@langchain/core/prompts'; +import { Document } from 'langchain/document'; +import { buildGraph } from './graph'; +const systemMessage = SystemMessagePromptTemplate.fromTemplate( + `In your summary preserve as much information as possible, including: + - References and connections between documents + - Names of participants and their roles + - Titles, dates, and temporal relationships + - Key concepts and their relationships within the body of knowledge + Focus on maintaining the coherence of information across document boundaries.` +); + +const summarizePrompt = ChatPromptTemplate.fromMessages([ + systemMessage, + HumanMessagePromptTemplate.fromTemplate( + 'Write a detailed summary, no more than {summaryLength} characters of the following: {context}' + ), +]); +const refinePrompt = ChatPromptTemplate.fromMessages([ + systemMessage, + HumanMessagePromptTemplate.fromTemplate( + `Produce a final detailed summary, no more than {summaryLength} characters. + Existing summary up to this point: + {currentSummary} + + New context: {context} + + Given the new context, refine the original summary.` + ), +]); + +export const summarizeDocument = async (chunks: Document[]) => { + const graph = buildGraph(summarizePrompt, refinePrompt); + const final = await graph.invoke({ chunks }); + return final.summary; +}; diff --git a/src/summarize/graph.ts b/src/summarize/graph.ts new file mode 100644 index 0000000..552046e --- /dev/null +++ b/src/summarize/graph.ts @@ -0,0 +1,77 @@ +import { ChatMistralAI } from '@langchain/mistralai'; +import { Annotation, END, START, StateGraph } from '@langchain/langgraph'; +import { Document } from 'langchain/document'; +import { ChatPromptTemplate } from '@langchain/core/prompts'; + +export const summaryLength = parseInt( + process.env.SUMMARY_LENGTH || '10000', + 10 +); + +const apiKey = process.env.AZURE_MISTRAL_API_KEY; +const endpoint = process.env.AZURE_MISTRAL_ENDPOINT; + +if (!apiKey) { + throw new Error('AZURE_MISTRAL_API_KEY environment variable is not set.'); +} +if (!endpoint) { + throw new Error('AZURE_MISTRAL_ENDPOINT environment variable is not set.'); +} + +const model = new ChatMistralAI({ + apiKey, + endpoint, + maxRetries: 1, +}); + +export const buildGraph = ( + summarizePrompt: ChatPromptTemplate, + refinePrompt: ChatPromptTemplate +) => { + const summaryChain = summarizePrompt.pipe(model); + const refineChain = refinePrompt.pipe(model); + + const SummarizeAnnotation = Annotation.Root({ + chunks: Annotation(), + index: Annotation(), + summary: Annotation(), + }); + + const initialSummary = async (input: typeof SummarizeAnnotation.State) => { + const context = input.chunks[0].pageContent; + const summary = await summaryChain.invoke({ context, summaryLength }); + return { summary: summary.content, index: 1 }; + }; + + const refineSummary = async (input: typeof SummarizeAnnotation.State) => { + const context = input.chunks[input.index].pageContent; + const currentSummary = input.summary; + const summary = await refineChain.invoke({ + currentSummary, + context, + summaryLength, + }); + + return { + summary: summary.content, + index: input.index + 1, + }; + }; + + const shouldRefine = (input: typeof SummarizeAnnotation.State) => { + if (input.index >= input.chunks.length) { + return END; + } + return 'refineSummary'; + }; + + const graph = new StateGraph(SummarizeAnnotation) + .addNode('initialSummary', initialSummary) + .addNode('refineSummary', refineSummary) + .addEdge(START, 'initialSummary') + .addConditionalEdges('initialSummary', shouldRefine, ['refineSummary', END]) + .addConditionalEdges('refineSummary', shouldRefine, ['refineSummary', END]) + .compile(); + + return graph; +};