Skip to content

Commit 557a0eb

Browse files
committed
implemented functionality to optionally provide docs or splitDocs to data retriever
1 parent 6cd321f commit 557a0eb

File tree

1 file changed

+44
-36
lines changed

1 file changed

+44
-36
lines changed

src/rag/data-retrievers/data-retrievers.ts

+44-36
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,28 @@
1-
import { MemoryVectorStore } from "langchain/vectorstores/memory";
2-
import { GoogleGenerativeAIEmbeddings } from "@langchain/google-genai";
3-
import { formatDocumentsAsString } from "langchain/util/document";
4-
import { Runnable, RunnableConfig } from "@langchain/core/runnables";
5-
import { EmbeddingsInterface } from "@langchain/core/embeddings";
1+
import {MemoryVectorStore} from 'langchain/vectorstores/memory';
2+
import {GoogleGenerativeAIEmbeddings} from '@langchain/google-genai';
3+
import {formatDocumentsAsString} from 'langchain/util/document';
4+
import {Runnable, RunnableConfig} from '@langchain/core/runnables';
5+
import {EmbeddingsInterface} from '@langchain/core/embeddings';
66
import {
77
VectorStore,
88
VectorStoreRetrieverInput,
9-
} from "@langchain/core/vectorstores";
9+
} from '@langchain/core/vectorstores';
10+
import {Document} from 'langchain/document';
1011
import {
1112
CSVLoaderOptions,
1213
JSONLoaderKeysToInclude,
1314
PDFLoaderOptions,
1415
SupportedDataLoaderTypes,
1516
getDocs,
16-
} from "../data-loaders/data-loaders";
17+
} from '../data-loaders/data-loaders';
1718
import {
1819
ChunkingConfig,
1920
DataSplitterConfig,
2021
SupportedDataSplitterTypes,
2122
runDataSplitter,
22-
} from "../data-splitters/data-splitters";
23-
import { GOOGLE_GENAI_EMBEDDING_MODELS } from "../data-embeddings/embedding-models";
24-
import { getEnvironmentVariable } from "../../utils/utils";
23+
} from '../data-splitters/data-splitters';
24+
import {GOOGLE_GENAI_EMBEDDING_MODELS} from '../data-embeddings/embedding-models';
25+
import {getEnvironmentVariable} from '../../utils/utils';
2526

2627
/**
2728
* Type denoting a retriever that retrieves text data.
@@ -41,6 +42,8 @@ export type RetrievalOptions =
4142
* Represents the configuration for the retriever when generating embeddings.
4243
* @property {SupportedDataLoaderTypes} dataType - The type of data loader to use.
4344
* @property {string} filePath - The path to the file containing the data.
45+
* @property {Document<Record<string, string>>[]} [docs] - Optional. Pre-loaded LangChain documents; when provided, they are used instead of loading documents from filePath.
46+
* @property {Document<Record<string, unknown>>[]} [splitDocs] - Optional. Pre-split LangChain documents; when provided, they are used instead of running the data splitter.
4447
* @property {JSONLoaderKeysToInclude} [jsonLoaderKeysToInclude] - The keys to include when loading JSON data.
4548
* @property {CSVLoaderOptions} [csvLoaderOptions] - The options for loading CSV data.
4649
* @property {PDFLoaderOptions} [pdfLoaderOptions] - The options for loading PDF data.
@@ -55,6 +58,8 @@ export type RetrievalOptions =
5558
export type RetrieverConfigGeneratingEmbeddings = {
5659
dataType: SupportedDataLoaderTypes;
5760
filePath: string;
61+
docs?: Document<Record<string, string>>[];
62+
splitDocs?: Document<Record<string, unknown>>[];
5863
jsonLoaderKeysToInclude?: JSONLoaderKeysToInclude;
5964
csvLoaderOptions?: CSVLoaderOptions;
6065
pdfLoaderOptions?: PDFLoaderOptions;
@@ -91,12 +96,12 @@ export type RetrieverConfig =
9196
* Task type for embedding content.
9297
*/
9398
export enum TaskType {
94-
TASK_TYPE_UNSPECIFIED = "TASK_TYPE_UNSPECIFIED",
95-
RETRIEVAL_QUERY = "RETRIEVAL_QUERY",
96-
RETRIEVAL_DOCUMENT = "RETRIEVAL_DOCUMENT",
97-
SEMANTIC_SIMILARITY = "SEMANTIC_SIMILARITY",
98-
CLASSIFICATION = "CLASSIFICATION",
99-
CLUSTERING = "CLUSTERING",
99+
TASK_TYPE_UNSPECIFIED = 'TASK_TYPE_UNSPECIFIED',
100+
RETRIEVAL_QUERY = 'RETRIEVAL_QUERY',
101+
RETRIEVAL_DOCUMENT = 'RETRIEVAL_DOCUMENT',
102+
SEMANTIC_SIMILARITY = 'SEMANTIC_SIMILARITY',
103+
CLASSIFICATION = 'CLASSIFICATION',
104+
CLUSTERING = 'CLUSTERING',
100105
}
101106

102107
/**
@@ -113,19 +118,19 @@ export const getAppropriateDataSplitter = (
113118
defaultSplitterConfig?: DataSplitterConfig;
114119
} => {
115120
switch (dataType) {
116-
case "csv":
117-
case "json":
121+
case 'csv':
122+
case 'json':
118123
return {
119-
defaultDataSplitterType: "character",
124+
defaultDataSplitterType: 'character',
120125
defaultSplitterConfig: {
121126
textSplitterConfig: {
122-
separators: ["\n"],
127+
separators: ['\n'],
123128
},
124129
},
125130
};
126131
default:
127132
return {
128-
defaultDataSplitterType: "text",
133+
defaultDataSplitterType: 'text',
129134
};
130135
}
131136
};
@@ -152,7 +157,7 @@ export const getDataRetriever = async (
152157
// vector store must be provided
153158
if (!config.vectorStore)
154159
throw new Error(
155-
"Vector store must be provided when not generating embeddings"
160+
'Vector store must be provided when not generating embeddings'
156161
);
157162
// return retriever
158163
else
@@ -164,36 +169,39 @@ export const getDataRetriever = async (
164169
// if generating embeddings, data type must be provided
165170
if (!config.dataType) {
166171
throw new Error(
167-
"Data type and file path must be provided when generating embeddings"
172+
'Data type and file path must be provided when generating embeddings'
168173
);
169174
}
170175

171176
// if generating embeddings, file path must be provided
172-
if (!config.filePath || config.filePath === "") {
173-
throw new Error("Invalid file path. File path must be provided");
177+
if (!config.filePath || config.filePath === '') {
178+
throw new Error('Invalid file path. File path must be provided');
174179
}
175180

176181
try {
177182
// Use the documents supplied in the config if present; otherwise load them from the specified file path
178-
const docs = await getDocs(config.dataType, config.filePath);
183+
const docs: Document<Record<string, string>>[] =
184+
config.docs ?? (await getDocs(config.dataType, config.filePath));
179185

180-
const { defaultDataSplitterType, defaultSplitterConfig } =
186+
const {defaultDataSplitterType, defaultSplitterConfig} =
181187
getAppropriateDataSplitter(config.dataType);
182188

183189
// Use the split documents supplied in the config if present; otherwise split the documents into chunks using the data splitter
184-
const splitDocs = await runDataSplitter({
185-
docs,
186-
dataSplitterType: config.dataSplitterType ?? defaultDataSplitterType,
187-
chunkingConfig: config.chunkingConfig ?? defaultChunkingConfig,
188-
splitterConfig: config.splitterConfig ?? defaultSplitterConfig,
189-
});
190+
const splitDocs: Document<Record<string, unknown>>[] =
191+
config.splitDocs ??
192+
(await runDataSplitter({
193+
docs,
194+
dataSplitterType: config.dataSplitterType ?? defaultDataSplitterType,
195+
chunkingConfig: config.chunkingConfig ?? defaultChunkingConfig,
196+
splitterConfig: config.splitterConfig ?? defaultSplitterConfig,
197+
}));
190198

191199
// embedding model - if not provided, use the default Google Generative AI Embeddings model
192200
const embeddings: EmbeddingsInterface =
193201
config.embeddingModel ??
194202
new GoogleGenerativeAIEmbeddings({
195-
apiKey: getEnvironmentVariable("GOOGLE_GENAI_API_KEY"),
196-
model: GOOGLE_GENAI_EMBEDDING_MODELS["text-embedding-004"].name,
203+
apiKey: getEnvironmentVariable('GOOGLE_GENAI_API_KEY'),
204+
model: GOOGLE_GENAI_EMBEDDING_MODELS['text-embedding-004'].name,
197205
taskType: TaskType.RETRIEVAL_DOCUMENT,
198206
});
199207

@@ -207,7 +215,7 @@ export const getDataRetriever = async (
207215
config.embeddingModel ??
208216
new GoogleGenerativeAIEmbeddings({
209217
apiKey: process.env.GOOGLE_GENAI_API_KEY,
210-
model: GOOGLE_GENAI_EMBEDDING_MODELS["text-embedding-004"].name,
218+
model: GOOGLE_GENAI_EMBEDDING_MODELS['text-embedding-004'].name,
211219
taskType: TaskType.RETRIEVAL_QUERY,
212220
});
213221

0 commit comments

Comments
 (0)