Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ingest space 63/ingest knowledge base #64

Merged
merged 4 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .env.default
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ AUTH_ADMIN_PASSWORD=master-password
VECTOR_DB_HOST=localhost
VECTOR_DB_PORT=8765

CHUNK_SIZE=1000
CHUNK_OVERLAP=100
CHUNK_SIZE=9000
CHUNK_OVERLAP=500

SUMMARY_LENGTH=10000

Expand Down
50 changes: 50 additions & 0 deletions graphql/fragments/callout.graphql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Shared selection set for a Callout: identifiers, type and visibility, the
# comment thread, the framing profile, and post/link contributions.
# Relies on the ProfileFields and ProfileNoTagsetFields fragments, which are
# defined outside this fragment.
fragment CalloutFields on Callout {
id
nameID
type
visibility
comments {
messagesCount
messages {
# sender is selected through inline fragments for User and
# VirtualContributor; both select the same profile fields (url, displayName)
sender {
... on User {
profile {
url
displayName
}
}
... on VirtualContributor {
profile {
url
displayName
}
}
}
message
timestamp
}
}
framing {
id
profile {
...ProfileFields
}
}
# each contribution selects a post and a link
contributions {
post {
id
nameID
profile {
...ProfileFields
}
}
link {
id
uri
# links use ProfileNoTagsetFields rather than ProfileFields
profile {
...ProfileNoTagsetFields
}

}
}
}
6 changes: 6 additions & 0 deletions graphql/fragments/profile.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ fragment ProfileFields on Profile {
tagline
url
type
location {
city
country
postalCode
}

tagset {
tags
}
Expand Down
73 changes: 4 additions & 69 deletions graphql/fragments/space-ingest.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,7 @@ fragment SpaceIngest on Space {
nameID
type
profile {
description
displayName
tagline
url
location {
city
country
postalCode
}
tagset {
tags
}
references {
description
name
uri
}
visuals {
uri
name
}
...ProfileFields
}
context {
vision
Expand All @@ -32,54 +12,9 @@ fragment SpaceIngest on Space {
}

collaboration {
callouts {
id
nameID
type
visibility
comments {
messagesCount
messages {
sender {
... on User {
profile {
url
displayName
}
}
... on VirtualContributor {
profile {
url
displayName
}
}
}
message
timestamp
}
}
framing {
id
profile {
...ProfileFields
}
}
contributions {
post {
id
nameID
profile {
...ProfileFields
}
}
link {
id
uri
profile {
...ProfileNoTagsetFields
}

}
calloutsSet {
callouts {
...CalloutFields
}
}
}
Expand Down
15 changes: 15 additions & 0 deletions graphql/queries/knowledge-base-ingest.graphql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Looks up a single knowledge base by ID and selects its id, its profile and
# the callouts of its callouts set.
# $knowledgeBaseID is a required UUID.
# Uses the ProfileFields and CalloutFields fragments, defined outside this query.
query knowledgeBaseIngest($knowledgeBaseID: UUID!){
lookup {
knowledgeBase(ID: $knowledgeBaseID) {
id
profile {
...ProfileFields
}
calloutsSet {
callouts {
...CalloutFields
}
}
}
}
}
15 changes: 8 additions & 7 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@alkemio/space-ingest",
"version": "0.10.0",
"name": "@alkemio/ingest-alkemio-data",
"version": "0.11.0",
"description": "",
"author": "Alkemio Foundation",
"private": true,
Expand Down Expand Up @@ -59,6 +59,7 @@
"graphql": "^16.6.0",
"graphql-upload": "^16.0.1",
"langchain": "^0.2.2",
"langsmith": "^0.1.66",
"mammoth": "^1.7.2",
"officeparser": "^4.1.1",
"pdf-parse": "^1.1.1",
Expand Down
2 changes: 2 additions & 0 deletions src/callout.handlers/base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ export const baseHandler = async (

const result: Document[] = [
new Document({
id: documentId,
pageContent,
metadata: {
documentId,
Expand All @@ -63,6 +64,7 @@ export const baseHandler = async (
generateDocument(docLike);
result.push(
new Document({
id: documentId,
pageContent,
metadata: {
documentId,
Expand Down
2 changes: 1 addition & 1 deletion src/callout.handlers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { Callout, CalloutType } from '../generated/graphql';
import { Document } from 'langchain/document';
import { baseHandler } from './base';
import { linkCollectionHandler } from './link.collection';
import { AlkemioCliClient } from 'src/graphql.client/AlkemioCliClient';
import { AlkemioCliClient } from '../graphql.client/AlkemioCliClient';

const handlersMap: Record<
CalloutType,
Expand Down
41 changes: 41 additions & 0 deletions src/data.readers/knowledge.base.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import generateDocument from '../generate.document';
import { Document } from 'langchain/document';
import { IngestBodyOfKnowledge } from '../event.bus/events/ingest.body.of.knowledge';
import { AlkemioCliClient } from '../graphql.client/AlkemioCliClient';
import { processCallouts } from '../process.callouts';
import { Callout } from '../generated/graphql';

/**
 * Reads a knowledge base and turns it into a list of documents.
 *
 * The knowledge base identified by `event.bodyOfKnowledgeId` is fetched through
 * the client, converted into one document via `generateDocument`, and followed
 * by whatever documents `processCallouts` produces for its callouts.
 *
 * @param event - ingest event carrying the knowledge base ID
 * @param alkemioClient - client used to fetch the knowledge base and passed on
 *   to `processCallouts`
 * @returns the fetched knowledge base as `bodyOfKnowledge` together with the
 *   collected `documents` (knowledge base document first, callout documents after)
 */
export const embedKnowledgeBase = async (
  event: IngestBodyOfKnowledge,
  alkemioClient: AlkemioCliClient
) => {
  // make sure the service user has sufficient privileges
  const knowledgeBase = await alkemioClient.ingestKnowledgeBase(
    event.bodyOfKnowledgeId
  );

  // document describing the knowledge base itself
  const { documentId, source, pageContent, type, title } =
    generateDocument(knowledgeBase);
  const knowledgeBaseDoc = new Document({
    id: documentId,
    pageContent,
    metadata: { documentId, source, type, title },
  });

  // documents derived from the callouts; a missing callouts set means no callouts
  const callouts = (knowledgeBase.calloutsSet?.callouts || []) as Partial<Callout>[];
  const calloutDocs = await processCallouts(callouts, alkemioClient);

  const documents: Document[] = [knowledgeBaseDoc, ...calloutDocs];
  return { bodyOfKnowledge: knowledgeBase, documents };
};
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import { Document } from 'langchain/document';

import { CalloutVisibility, Callout, Space } from '../generated/graphql';
import { Space } from '../generated/graphql';

import logger from '../logger';
import generateDocument from '../generate.document';
import { handleCallout } from '../callout.handlers';
import { AlkemioCliClient } from '../graphql.client/AlkemioCliClient';
import { processCallouts } from '../process.callouts';

// recursive function
// first invocation is with [rootSpace]
// second invocation is with rootSpace.subspaces
Expand All @@ -21,6 +21,7 @@ export const processSpaceTree = async (
generateDocument(subspace);
documents.push(
new Document({
id: documentId,
pageContent,
metadata: {
documentId,
Expand All @@ -30,21 +31,11 @@ export const processSpaceTree = async (
},
})
);

for (let j = 0; j < (subspace.collaboration?.callouts || []).length; j++) {
const callout = (subspace.collaboration?.callouts || [])[j];
if (callout && callout.visibility === CalloutVisibility.Published) {
const document = await handleCallout(
callout as Partial<Callout>,
logger,
alkemioClient
);
// empty doc - nothing to do here
if (document) {
documents.push(...document);
}
}
}
const calloutDocs = await processCallouts(
subspace.collaboration?.calloutsSet?.callouts || [],
alkemioClient
);
documents.push(...calloutDocs);

// invoke recursively for the subspaces of the rootSpace
const subspacesDocs = await processSpaceTree(
Expand Down
23 changes: 23 additions & 0 deletions src/data.readers/space.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { Document } from 'langchain/document';

import { Space } from '../generated/graphql';

import { AlkemioCliClient } from '../graphql.client/AlkemioCliClient';
import { processSpaceTree } from './process.space.tree';
import { IngestBodyOfKnowledge } from 'src/event.bus/events/ingest.body.of.knowledge';
import { ReadResult } from './types';

/**
 * Reads a space and turns it into a list of documents.
 *
 * The space identified by `event.bodyOfKnowledgeId` is fetched through the
 * client and handed to `processSpaceTree` as a single-element list.
 *
 * @param event - ingest event carrying the space ID
 * @param alkemioClient - client used to fetch the space and passed on to
 *   `processSpaceTree`
 * @returns the fetched space as `bodyOfKnowledge` together with the `documents`
 *   returned by `processSpaceTree`
 */
export const embedSpace = async (
event: IngestBodyOfKnowledge,
alkemioClient: AlkemioCliClient
): Promise<ReadResult> => {
const spaceId = event.bodyOfKnowledgeId;
// make sure the service user has sufficient privileges
const space = await alkemioClient.ingestSpace(spaceId);
valeksiev marked this conversation as resolved.
Show resolved Hide resolved
const documents: Document[] = await processSpaceTree(
[space as Partial<Space>],
alkemioClient
);
valeksiev marked this conversation as resolved.
Show resolved Hide resolved

return { bodyOfKnowledge: space, documents };
};
9 changes: 9 additions & 0 deletions src/data.readers/types.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/**
 * Minimal shape of a body of knowledge as returned by a reader:
 * its id plus the display name and URL from its profile.
 */
export declare type BodyOfKnowledgeReadResult = {
id: string;
profile: { displayName: string; url: string };
};

/**
 * Result of reading a body of knowledge: the documents produced from it and
 * the body of knowledge they were produced from. Both fields are optional.
 */
export declare type ReadResult = {
  // Inline import type: this declaration file does not import `Document`, so a
  // bare `Document` would not refer to the LangChain class the readers return.
  documents?: import('langchain/document').Document[];
  bodyOfKnowledge?: BodyOfKnowledgeReadResult;
};
Loading