Refactor sentence splitting and replace Map() with lru-cache for embedding cache

jparkerweb committed Dec 15, 2024
1 parent 77b4a07 commit 000401a
Showing 4 changed files with 516 additions and 36 deletions.
19 changes: 4 additions & 15 deletions chunkit.js
@@ -8,7 +8,7 @@
 // == github repo: https://github.com/jparkerweb/semantic-chunking ==
 // ==================================================================
 
-import { splitBySentence } from "string-segmenter"
+import sentencize from '@stdlib/nlp-sentencize';
 import { DEFAULT_CONFIG } from './config.js';
 import { initializeEmbeddingUtils, tokenizer, createEmbedding } from './embeddingUtils.js';
 import { computeAdvancedSimilarities, adjustThreshold } from './similarityUtils.js';
@@ -77,10 +77,7 @@ export async function chunkit(
     doc.document_text = normalizedText;
 
     // Split the text into sentences
-    const sentences = [];
-    for (const { segment } of splitBySentence(doc.document_text)) {
-        sentences.push(segment.trim());
-    }
+    const sentences = sentencize(doc.document_text);
 
     // Compute similarities and create chunks
     const { similarities, average, variance } = await computeAdvancedSimilarities(
@@ -220,10 +217,7 @@ export async function cramit(
     }
 
     // Split the text into sentences
-    const sentences = [];
-    for (const { segment } of splitBySentence(doc.document_text)) {
-        sentences.push(segment.trim());
-    }
+    const sentences = sentencize(doc.document_text);
 
     // Create chunks without considering similarities
     const chunks = createChunks(sentences, null, maxTokenSize, 0, logging);
@@ -331,12 +325,7 @@ export async function sentenceit(
     }
 
     // Split the text into sentences
-    const chunks = [];
-    for (const { segment } of splitBySentence(doc.document_text)) {
-        if (segment.trim().length > 0) {
-            chunks.push(segment.trim());
-        }
-    }
+    const chunks = sentencize(doc.document_text);
 
     if (logging) {
         console.log('\nSENTENCEIT');
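
Why this change works: string-segmenter's splitBySentence is a generator that yields { segment } objects, so each call site needed a loop to collect and trim segments. @stdlib/nlp-sentencize exposes a single function that takes a string and returns an array of sentence strings, so the three loops (in chunkit, cramit, and sentenceit) each collapse to one call. A minimal sketch of the new call; the sample text is illustrative:

    import sentencize from '@stdlib/nlp-sentencize';

    const text = 'Semantic chunking groups related sentences. It keeps each chunk coherent!';

    // sentencize returns an array of sentence strings in one call,
    // replacing the manual loop over splitBySentence's { segment } objects.
    const sentences = sentencize(text);

    console.log(sentences);
    // e.g. [ 'Semantic chunking groups related sentences.', 'It keeps each chunk coherent!' ]

Note that the old loops trimmed each segment (and sentenceit also filtered out empty ones); the new code relies on sentencize's own segmentation to produce clean sentences.
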
15 changes: 12 additions & 3 deletions embeddingUtils.js
@@ -1,8 +1,16 @@
 import { env, pipeline, AutoTokenizer } from '@huggingface/transformers';
+import { LRUCache } from 'lru-cache';
 
 let tokenizer;
 let generateEmbedding;
-const embeddingCache = new Map();
+const embeddingCache = new LRUCache({
+    max: 500,
+    maxSize: 50_000_000,
+    sizeCalculation: (value, key) => {
+        return (value.length * 4) + key.length;
+    },
+    ttl: 1000 * 60 * 60,
+});
 
 // --------------------------------------------
 // -- Initialize embedding model and tokenizer --
@@ -35,8 +43,9 @@ export async function initializeEmbeddingUtils(
 // -- Function to generate embeddings --
 // -------------------------------------
 export async function createEmbedding(text) {
-    if (embeddingCache.has(text)) {
-        return embeddingCache.get(text);
+    const cached = embeddingCache.get(text);
+    if (cached) {
+        return cached;
     }
 
     const embeddings = await generateEmbedding(text, { pooling: 'mean', normalize: true });
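
The cache change bounds what was previously an unbounded Map. The LRUCache options cap the cache three ways: max (at most 500 entries), maxSize (a ~50 MB budget, with sizeCalculation estimating each entry's cost as 4 bytes per element of the embedding vector plus the key's character count), and ttl (entries expire after one hour). The read path also collapses the has()/get() pair into a single get(), which does one lookup instead of two and behaves correctly when an entry has just expired. A self-contained sketch of the same pattern, assuming an illustrative 384-element Float32Array in place of a real embedding:

    import { LRUCache } from 'lru-cache';

    // Bound the cache by entry count, estimated bytes, and age.
    const cache = new LRUCache({
        max: 500,                                                          // at most 500 cached embeddings
        maxSize: 50_000_000,                                               // ~50 MB budget
        sizeCalculation: (value, key) => (value.length * 4) + key.length,  // ~4 bytes per element + key chars
        ttl: 1000 * 60 * 60,                                               // evict entries older than one hour
    });

    const key = 'some input sentence';
    const fakeEmbedding = new Float32Array(384); // stand-in for a real embedding vector

    cache.set(key, fakeEmbedding);

    // Single get() instead of has() + get(): one lookup, and an entry
    // that has expired simply comes back undefined.
    const cached = cache.get(key);
    if (cached) {
        // use the cached embedding
    }
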
(The remaining 2 changed files are not shown here.)
