Refactor sentence splitting and replace Map() with lru-cache for embedding cache

jparkerweb committed Dec 15, 2024
1 parent 77b4a07 commit 000401a
Showing 4 changed files with 516 additions and 36 deletions.
19 changes: 4 additions & 15 deletions chunkit.js
@@ -8,7 +8,7 @@
 // == github repo: https://github.com/jparkerweb/semantic-chunking ==
 // ==================================================================
 
-import { splitBySentence } from "string-segmenter"
+import sentencize from '@stdlib/nlp-sentencize';
 import { DEFAULT_CONFIG } from './config.js';
 import { initializeEmbeddingUtils, tokenizer, createEmbedding } from './embeddingUtils.js';
 import { computeAdvancedSimilarities, adjustThreshold } from './similarityUtils.js';
@@ -77,10 +77,7 @@ export async function chunkit(
     doc.document_text = normalizedText;
 
     // Split the text into sentences
-    const sentences = [];
-    for (const { segment } of splitBySentence(doc.document_text)) {
-        sentences.push(segment.trim());
-    }
+    const sentences = sentencize(doc.document_text);
 
     // Compute similarities and create chunks
     const { similarities, average, variance } = await computeAdvancedSimilarities(
@@ -220,10 +217,7 @@ export async function cramit(
     }
 
     // Split the text into sentences
-    const sentences = [];
-    for (const { segment } of splitBySentence(doc.document_text)) {
-        sentences.push(segment.trim());
-    }
+    const sentences = sentencize(doc.document_text);
 
     // Create chunks without considering similarities
     const chunks = createChunks(sentences, null, maxTokenSize, 0, logging);
@@ -331,12 +325,7 @@ export async function sentenceit(
     }
 
     // Split the text into sentences
-    const chunks = [];
-    for (const { segment } of splitBySentence(doc.document_text)) {
-        if (segment.trim().length > 0) {
-            chunks.push(segment.trim());
-        }
-    }
+    const chunks = sentencize(doc.document_text);
 
     if (logging) {
         console.log('\nSENTENCEIT');
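
Why this change works: string-segmenter's splitBySentence is a generator that yields { segment } objects, so each call site needed a loop to collect and trim segments. @stdlib/nlp-sentencize exposes a single function that takes a string and returns an array of sentence strings, so the three loops (in chunkit, cramit, and sentenceit) each collapse to one call. A minimal sketch of the new call; the sample text is illustrative:

    import sentencize from '@stdlib/nlp-sentencize';

    const text = 'Semantic chunking groups related sentences. It keeps each chunk coherent!';

    // sentencize returns an array of sentence strings in one call,
    // replacing the manual loop over splitBySentence's { segment } objects.
    const sentences = sentencize(text);

    console.log(sentences);
    // e.g. [ 'Semantic chunking groups related sentences.', 'It keeps each chunk coherent!' ]

Note that the old loops trimmed each segment (and sentenceit also filtered out empty ones); the new code relies on sentencize's own segmentation to produce clean sentences.
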
15 changes: 12 additions & 3 deletions embeddingUtils.js
@@ -1,8 +1,16 @@
 import { env, pipeline, AutoTokenizer } from '@huggingface/transformers';
+import { LRUCache } from 'lru-cache';
 
 let tokenizer;
 let generateEmbedding;
-const embeddingCache = new Map();
+const embeddingCache = new LRUCache({
+    max: 500,
+    maxSize: 50_000_000,
+    sizeCalculation: (value, key) => {
+        return (value.length * 4) + key.length;
+    },
+    ttl: 1000 * 60 * 60,
+});
 
 // --------------------------------------------
 // -- Initialize embedding model and tokenizer --
@@ -35,8 +43,9 @@ export async function initializeEmbeddingUtils(
 // -- Function to generate embeddings --
 // -------------------------------------
 export async function createEmbedding(text) {
-    if (embeddingCache.has(text)) {
-        return embeddingCache.get(text);
+    const cached = embeddingCache.get(text);
+    if (cached) {
+        return cached;
     }
 
     const embeddings = await generateEmbedding(text, { pooling: 'mean', normalize: true });
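
The cache change bounds what was previously an unbounded Map. The LRUCache options cap the cache three ways: max (at most 500 entries), maxSize (a ~50 MB budget, with sizeCalculation estimating each entry's cost as 4 bytes per element of the embedding vector plus the key's character count), and ttl (entries expire after one hour). The read path also collapses the has()/get() pair into a single get(), which does one lookup instead of two and behaves correctly when an entry has just expired. A self-contained sketch of the same pattern, assuming an illustrative 384-element Float32Array in place of a real embedding:

    import { LRUCache } from 'lru-cache';

    // Bound the cache by entry count, estimated bytes, and age.
    const cache = new LRUCache({
        max: 500,                                                          // at most 500 cached embeddings
        maxSize: 50_000_000,                                               // ~50 MB budget
        sizeCalculation: (value, key) => (value.length * 4) + key.length,  // ~4 bytes per element + key chars
        ttl: 1000 * 60 * 60,                                               // evict entries older than one hour
    });

    const key = 'some input sentence';
    const fakeEmbedding = new Float32Array(384); // stand-in for a real embedding vector

    cache.set(key, fakeEmbedding);

    // Single get() instead of has() + get(): one lookup, and an entry
    // that has expired simply comes back undefined.
    const cached = cache.get(key);
    if (cached) {
        // use the cached embedding
    }
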
(The remaining 2 changed files are not shown here.)
