1
- import { MemoryVectorStore } from " langchain/vectorstores/memory" ;
2
- import { GoogleGenerativeAIEmbeddings } from " @langchain/google-genai" ;
3
- import { formatDocumentsAsString } from " langchain/util/document" ;
4
- import { Runnable , RunnableConfig } from " @langchain/core/runnables" ;
5
- import { EmbeddingsInterface } from " @langchain/core/embeddings" ;
1
+ import { MemoryVectorStore } from ' langchain/vectorstores/memory' ;
2
+ import { GoogleGenerativeAIEmbeddings } from ' @langchain/google-genai' ;
3
+ import { formatDocumentsAsString } from ' langchain/util/document' ;
4
+ import { Runnable , RunnableConfig } from ' @langchain/core/runnables' ;
5
+ import { EmbeddingsInterface } from ' @langchain/core/embeddings' ;
6
6
import {
7
7
VectorStore ,
8
8
VectorStoreRetrieverInput ,
9
- } from "@langchain/core/vectorstores" ;
9
+ } from '@langchain/core/vectorstores' ;
10
+ import { Document } from 'langchain/document' ;
10
11
import {
11
12
CSVLoaderOptions ,
12
13
JSONLoaderKeysToInclude ,
13
14
PDFLoaderOptions ,
14
15
SupportedDataLoaderTypes ,
15
16
getDocs ,
16
- } from " ../data-loaders/data-loaders" ;
17
+ } from ' ../data-loaders/data-loaders' ;
17
18
import {
18
19
ChunkingConfig ,
19
20
DataSplitterConfig ,
20
21
SupportedDataSplitterTypes ,
21
22
runDataSplitter ,
22
- } from " ../data-splitters/data-splitters" ;
23
- import { GOOGLE_GENAI_EMBEDDING_MODELS } from " ../data-embeddings/embedding-models" ;
24
- import { getEnvironmentVariable } from " ../../utils/utils" ;
23
+ } from ' ../data-splitters/data-splitters' ;
24
+ import { GOOGLE_GENAI_EMBEDDING_MODELS } from ' ../data-embeddings/embedding-models' ;
25
+ import { getEnvironmentVariable } from ' ../../utils/utils' ;
25
26
26
27
/**
27
28
* Type denoting a retriever that retrieves text data.
@@ -41,6 +42,8 @@ export type RetrievalOptions =
41
42
* Represents the configuration for the retriever when generating embeddings.
42
43
* @property {SupportedDataLoaderTypes } dataType - The type of data loader to use.
43
44
* @property {string } filePath - The path to the file containing the data.
45
+ * @property {Document<Record<string, string>>[] } [docs] - Optional: pre-loaded LangChain documents. When provided, they are used instead of loading documents from filePath.
46
+ * @property {Document<Record<string, unknown>>[] } [splitDocs] - Optional: pre-split LangChain documents. When provided, they are used as-is and the data splitter is not run.
44
47
* @property {JSONLoaderKeysToInclude } [jsonLoaderKeysToInclude] - The keys to include when loading JSON data.
45
48
* @property {CSVLoaderOptions } [csvLoaderOptions] - The options for loading CSV data.
46
49
* @property {PDFLoaderOptions } [pdfLoaderOptions] - The options for loading PDF data.
@@ -55,6 +58,8 @@ export type RetrievalOptions =
55
58
export type RetrieverConfigGeneratingEmbeddings = {
56
59
dataType : SupportedDataLoaderTypes ;
57
60
filePath : string ;
61
+ docs ?: Document < Record < string , string > > [ ] ;
62
+ splitDocs ?: Document < Record < string , unknown > > [ ] ;
58
63
jsonLoaderKeysToInclude ?: JSONLoaderKeysToInclude ;
59
64
csvLoaderOptions ?: CSVLoaderOptions ;
60
65
pdfLoaderOptions ?: PDFLoaderOptions ;
@@ -91,12 +96,12 @@ export type RetrieverConfig =
91
96
* Task type for embedding content.
92
97
*/
93
98
export enum TaskType {
  // String enum: every member's runtime value is exactly its own name
  // (no surrounding whitespace), so a member can be passed wherever the
  // task type is expected as a plain string.
  TASK_TYPE_UNSPECIFIED = 'TASK_TYPE_UNSPECIFIED',
  RETRIEVAL_QUERY = 'RETRIEVAL_QUERY',
  RETRIEVAL_DOCUMENT = 'RETRIEVAL_DOCUMENT',
  SEMANTIC_SIMILARITY = 'SEMANTIC_SIMILARITY',
  CLASSIFICATION = 'CLASSIFICATION',
  CLUSTERING = 'CLUSTERING',
}
101
106
102
107
/**
@@ -113,19 +118,19 @@ export const getAppropriateDataSplitter = (
113
118
defaultSplitterConfig ?: DataSplitterConfig ;
114
119
} => {
115
120
switch ( dataType ) {
116
- case " csv" :
117
- case " json" :
121
+ case ' csv' :
122
+ case ' json' :
118
123
return {
119
- defaultDataSplitterType : " character" ,
124
+ defaultDataSplitterType : ' character' ,
120
125
defaultSplitterConfig : {
121
126
textSplitterConfig : {
122
- separators : [ "\n" ] ,
127
+ separators : [ '\n' ] ,
123
128
} ,
124
129
} ,
125
130
} ;
126
131
default :
127
132
return {
128
- defaultDataSplitterType : " text" ,
133
+ defaultDataSplitterType : ' text' ,
129
134
} ;
130
135
}
131
136
} ;
@@ -152,7 +157,7 @@ export const getDataRetriever = async (
152
157
// vector store must be provided
153
158
if ( ! config . vectorStore )
154
159
throw new Error (
155
- " Vector store must be provided when not generating embeddings"
160
+ ' Vector store must be provided when not generating embeddings'
156
161
) ;
157
162
// return retriever
158
163
else
@@ -164,36 +169,39 @@ export const getDataRetriever = async (
164
169
// if generating embeddings, data type must be provided
165
170
if ( ! config . dataType ) {
166
171
throw new Error (
167
- " Data type and file path must be provided when generating embeddings"
172
+ ' Data type and file path must be provided when generating embeddings'
168
173
) ;
169
174
}
170
175
171
176
// if generating embeddings, file path must be provided
172
- if ( ! config . filePath || config . filePath === "" ) {
173
- throw new Error ( " Invalid file path. File path must be provided" ) ;
177
+ if ( ! config . filePath || config . filePath === '' ) {
178
+ throw new Error ( ' Invalid file path. File path must be provided' ) ;
174
179
}
175
180
176
181
try {
177
182
// Retrieve the documents from the specified file path
178
- const docs = await getDocs ( config . dataType , config . filePath ) ;
183
+ const docs : Document < Record < string , string > > [ ] =
184
+ config . docs ?? ( await getDocs ( config . dataType , config . filePath ) ) ;
179
185
180
- const { defaultDataSplitterType, defaultSplitterConfig } =
186
+ const { defaultDataSplitterType, defaultSplitterConfig} =
181
187
getAppropriateDataSplitter ( config . dataType ) ;
182
188
183
189
// Split the retrieved documents into chunks using the data splitter
184
- const splitDocs = await runDataSplitter ( {
185
- docs,
186
- dataSplitterType : config . dataSplitterType ?? defaultDataSplitterType ,
187
- chunkingConfig : config . chunkingConfig ?? defaultChunkingConfig ,
188
- splitterConfig : config . splitterConfig ?? defaultSplitterConfig ,
189
- } ) ;
190
+ const splitDocs : Document < Record < string , unknown > > [ ] =
191
+ config . splitDocs ??
192
+ ( await runDataSplitter ( {
193
+ docs,
194
+ dataSplitterType : config . dataSplitterType ?? defaultDataSplitterType ,
195
+ chunkingConfig : config . chunkingConfig ?? defaultChunkingConfig ,
196
+ splitterConfig : config . splitterConfig ?? defaultSplitterConfig ,
197
+ } ) ) ;
190
198
191
199
// embedding model - if not provided, use the default Google Generative AI Embeddings model
192
200
const embeddings : EmbeddingsInterface =
193
201
config . embeddingModel ??
194
202
new GoogleGenerativeAIEmbeddings ( {
195
- apiKey : getEnvironmentVariable ( " GOOGLE_GENAI_API_KEY" ) ,
196
- model : GOOGLE_GENAI_EMBEDDING_MODELS [ " text-embedding-004" ] . name ,
203
+ apiKey : getEnvironmentVariable ( ' GOOGLE_GENAI_API_KEY' ) ,
204
+ model : GOOGLE_GENAI_EMBEDDING_MODELS [ ' text-embedding-004' ] . name ,
197
205
taskType : TaskType . RETRIEVAL_DOCUMENT ,
198
206
} ) ;
199
207
@@ -207,7 +215,7 @@ export const getDataRetriever = async (
207
215
config . embeddingModel ??
208
216
new GoogleGenerativeAIEmbeddings ( {
209
217
apiKey : process . env . GOOGLE_GENAI_API_KEY ,
210
- model : GOOGLE_GENAI_EMBEDDING_MODELS [ " text-embedding-004" ] . name ,
218
+ model : GOOGLE_GENAI_EMBEDDING_MODELS [ ' text-embedding-004' ] . name ,
211
219
taskType : TaskType . RETRIEVAL_QUERY ,
212
220
} ) ;
213
221
0 commit comments