diff --git a/chunkit.js b/chunkit.js index 04c0483..5fa20f6 100644 --- a/chunkit.js +++ b/chunkit.js @@ -8,7 +8,7 @@ // == github repo: https://github.com/jparkerweb/semantic-chunking == // ================================================================== -import { splitBySentence } from "string-segmenter" +import sentencize from '@stdlib/nlp-sentencize'; import { DEFAULT_CONFIG } from './config.js'; import { initializeEmbeddingUtils, tokenizer, createEmbedding } from './embeddingUtils.js'; import { computeAdvancedSimilarities, adjustThreshold } from './similarityUtils.js'; @@ -77,10 +77,7 @@ export async function chunkit( doc.document_text = normalizedText; // Split the text into sentences - const sentences = []; - for (const { segment } of splitBySentence(doc.document_text)) { - sentences.push(segment.trim()); - } + const sentences = sentencize(doc.document_text); // Compute similarities and create chunks const { similarities, average, variance } = await computeAdvancedSimilarities( @@ -220,10 +217,7 @@ export async function cramit( } // Split the text into sentences - const sentences = []; - for (const { segment } of splitBySentence(doc.document_text)) { - sentences.push(segment.trim()); - } + const sentences = sentencize(doc.document_text); // Create chunks without considering similarities const chunks = createChunks(sentences, null, maxTokenSize, 0, logging); @@ -331,12 +325,7 @@ export async function sentenceit( } // Split the text into sentences - const chunks = []; - for (const { segment } of splitBySentence(doc.document_text)) { - if (segment.trim().length > 0) { - chunks.push(segment.trim()); - } - } + const chunks = sentencize(doc.document_text); if (logging) { console.log('\nSENTENCEIT'); diff --git a/embeddingUtils.js b/embeddingUtils.js index 8322d20..aa93d9c 100644 --- a/embeddingUtils.js +++ b/embeddingUtils.js @@ -1,8 +1,16 @@ import { env, pipeline, AutoTokenizer } from '@huggingface/transformers'; +import { LRUCache } from 'lru-cache'; let tokenizer; let generateEmbedding; -const embeddingCache = new Map(); +const embeddingCache = new LRUCache({ + max: 500, + maxSize: 50_000_000, + sizeCalculation: (value, key) => { + return (value.length * 4) + key.length; + }, + ttl: 1000 * 60 * 60, +}); // -------------------------------------------- // -- Initialize embedding model and tokenizer -- @@ -35,8 +43,9 @@ export async function initializeEmbeddingUtils( // -- Function to generate embeddings -- // ------------------------------------- export async function createEmbedding(text) { - if (embeddingCache.has(text)) { - return embeddingCache.get(text); + const cached = embeddingCache.get(text); + if (cached) { + return cached; } const embeddings = await generateEmbedding(text, { pooling: 'mean', normalize: true }); diff --git a/package-lock.json b/package-lock.json index 22a477a..8b25983 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,18 +1,19 @@ { "name": "semantic-chunking", - "version": "2.4.0", + "version": "2.4.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "semantic-chunking", - "version": "2.4.0", + "version": "2.4.1", "license": "ISC", "dependencies": { - "@huggingface/transformers": "^3.1.2", + "@huggingface/transformers": "^3.2.0", + "@stdlib/nlp-sentencize": "^0.2.2", "cli-progress": "^3.12.0", "fs": "^0.0.1-security", - "string-segmenter": "^1.2.0" + "lru-cache": "^11.0.2" } }, "node_modules/@emnapi/runtime": { @@ -33,9 +34,9 @@ } }, "node_modules/@huggingface/transformers": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/@huggingface/transformers/-/transformers-3.1.2.tgz", - "integrity": "sha512-+YKHashFDkZbjqTKPyhfRHiIxe1fl5/KqOr3zIHzXmUKhDSH4pwQ2Q58iXT5CUL6iSaZd93x+ep4VmvW6dLAlw==", + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/@huggingface/transformers/-/transformers-3.2.0.tgz", + "integrity": "sha512-1LEvKB/Gyw1xrnLHOCN1lPZNpIHGdoG3PitneTLNlIYYJAnVX9/2Sjs+Jt+YnqSUOOqfwPJ39eMclUZ8P186xA==", "dependencies": { "@huggingface/jinja": "^0.3.2", "onnxruntime-node": "1.20.1", @@ -521,6 +522,483 @@ "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz", "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==" }, + "node_modules/@stdlib/assert-has-own-property": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/assert-has-own-property/-/assert-has-own-property-0.2.2.tgz", + "integrity": "sha512-m5rV4Z2/iNkwx2vRsNheM6sQZMzc8rQQOo90LieICXovXZy8wA5jNld4kRKjMNcRt/TjrNP7i2Rhh8hruRDlHg==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/assert-has-symbol-support": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/assert-has-symbol-support/-/assert-has-symbol-support-0.2.2.tgz", + "integrity": "sha512-vCsGGmDZz5dikGgdF26rIL0y0nHvH7qaVf89HLLTybceuZijAqFSJEqcB3Gpl5uaeueLNAWExHi2EkoUVqKHGg==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/assert-has-tostringtag-support": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/assert-has-tostringtag-support/-/assert-has-tostringtag-support-0.2.2.tgz", + "integrity": "sha512-bSHGqku11VH0swPEzO4Y2Dr+lTYEtjSWjamwqCTC8udOiOIOHKoxuU4uaMGKJjVfXG1L+XefLHqzuO5azxdRaA==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "dependencies": { + "@stdlib/assert-has-symbol-support": "^0.2.1" + }, + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/assert-is-boolean": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/assert-is-boolean/-/assert-is-boolean-0.2.2.tgz", + "integrity": "sha512-3KFLRTYZpX6u95baZ6PubBvjehJs2xBU6+zrenR0jx8KToUYCnJPxqqj7JXRhSD+cOURmcjj9rocVaG9Nz18Pg==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "dependencies": { + "@stdlib/assert-has-tostringtag-support": "^0.2.2", + "@stdlib/boolean-ctor": "^0.2.2", + "@stdlib/utils-define-nonenumerable-read-only-property": "^0.2.2", + "@stdlib/utils-native-class": "^0.2.1" + }, + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/assert-is-string": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/assert-is-string/-/assert-is-string-0.2.2.tgz", + "integrity": "sha512-SOkFg4Hq443hkadM4tzcwTHWvTyKP9ULOZ8MSnnqmU0nBX1zLVFLFGY8jnF6Cary0dL0V7QQBCfuxqKFM6u2PQ==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "dependencies": { + "@stdlib/assert-has-tostringtag-support": "^0.2.2", + "@stdlib/utils-define-nonenumerable-read-only-property": "^0.2.2", + "@stdlib/utils-native-class": "^0.2.1" + }, + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/boolean-ctor": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/boolean-ctor/-/boolean-ctor-0.2.2.tgz", + "integrity": "sha512-qIkHzmfxDvGzQ3XI9R7sZG97QSaWG5TvWVlrvcysOGT1cs6HtQgnf4D//SRzZ52VLm8oICP+6OKtd8Hpm6G7Ww==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/error-tools-fmtprodmsg": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/error-tools-fmtprodmsg/-/error-tools-fmtprodmsg-0.2.2.tgz", + "integrity": "sha512-2IliQfTes4WV5odPidZFGD5eYDswZrPXob7oOu95Q69ERqImo8WzSwnG2EDbHPyOyYCewuMfM5Ha6Ggf+u944Q==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/nlp-sentencize": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/nlp-sentencize/-/nlp-sentencize-0.2.2.tgz", + "integrity": "sha512-dWDkcyPoLNvuQA59OxGHorcEQ79Rk6ghJPGwuS8kt9c3Rp0YjSE1cs/+2C9crm8uQXRJr6dcp25hLGLrSiqgsw==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "dependencies": { + "@stdlib/assert-is-string": "^0.2.1", + "@stdlib/nlp-tokenize": "^0.2.1", + "@stdlib/string-base-trim": "^0.2.2" + }, + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/nlp-tokenize": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/nlp-tokenize/-/nlp-tokenize-0.2.2.tgz", + "integrity": "sha512-qoKCiOHrV/PuDQpgxY69ROyJcA+uxNORR6JWlLPL4K53rweCGwLf+htDXXuhRcTx/g7mPKNWqoBlGm0WHd8WWg==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "dependencies": { + "@stdlib/assert-has-own-property": "^0.2.1", + "@stdlib/assert-is-boolean": "^0.2.1", + "@stdlib/assert-is-string": "^0.2.1", + "@stdlib/error-tools-fmtprodmsg": "^0.2.2", + "@stdlib/string-format": "^0.2.2" + }, + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/string-base-format-interpolate": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/string-base-format-interpolate/-/string-base-format-interpolate-0.2.2.tgz", + "integrity": "sha512-i9nU9rAB2+o/RR66TS9iQ8x+YzeUDL1SGiAo6GY3hP6Umz5Dx9Qp/v8T69gWVsb4a1YSclz5+YeCWaFgwvPjKA==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/string-base-format-tokenize": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/string-base-format-tokenize/-/string-base-format-tokenize-0.2.2.tgz", + "integrity": "sha512-kXq2015i+LJjqth5dN+hYnvJXBSzRm8w0ABWB5tYAsIuQTpQK+mSo2muM8JBEFEnqUHAwpUsu2qNTK/9o8lsJg==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/string-base-replace": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/string-base-replace/-/string-base-replace-0.2.2.tgz", + "integrity": "sha512-Y4jZwRV4Uertw7AlA/lwaYl1HjTefSriN5+ztRcQQyDYmoVN3gzoVKLJ123HPiggZ89vROfC+sk/6AKvly+0CA==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/string-base-trim": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/string-base-trim/-/string-base-trim-0.2.2.tgz", + "integrity": "sha512-1BVoM1XYxPbZ7xQcf4+TNP4dya5KphlkRnj0e0FBrmlYacJITvPgmsywOREivHqF8osH1oQQBZ47J/J/hQ79Eg==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "dependencies": { + "@stdlib/string-base-replace": "^0.2.1" + }, + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/string-format": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/string-format/-/string-format-0.2.2.tgz", + "integrity": "sha512-GUa50uxgMAtoItsxTbMmwkyhIwrCxCrsjzk3nAbLnt/1Kt1EWOWMwsALqZdD6K4V/xSJ4ns6PZur3W6w+vKk9g==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "dependencies": { + "@stdlib/string-base-format-interpolate": "^0.2.1", + "@stdlib/string-base-format-tokenize": "^0.2.2" + }, + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/symbol-ctor": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/symbol-ctor/-/symbol-ctor-0.2.2.tgz", + "integrity": "sha512-XsmiTfHnTb9jSPf2SoK3O0wrNOXMxqzukvDvtzVur1XBKfim9+seaAS4akmV1H3+AroAXQWVtde885e1B6jz1w==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/utils-define-nonenumerable-read-only-property": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/utils-define-nonenumerable-read-only-property/-/utils-define-nonenumerable-read-only-property-0.2.2.tgz", + "integrity": "sha512-V3mpAesJemLYDKG376CsmoczWPE/4LKsp8xBvUxCt5CLNAx3J/1W39iZQyA5q6nY1RStGinGn1/dYZwa8ig0Uw==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "dependencies": { + "@stdlib/utils-define-property": "^0.2.3" + }, + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/utils-define-property": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/@stdlib/utils-define-property/-/utils-define-property-0.2.4.tgz", + "integrity": "sha512-XlMdz7xwuw/sqXc9LbsV8XunCzZXjbZPC+OAdf4t4PBw4ZRwGzlTI6WED+f4PYR5Tp9F1cHgLPyMYCIBfA2zRg==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "dependencies": { + "@stdlib/error-tools-fmtprodmsg": "^0.2.1", + "@stdlib/string-format": "^0.2.1" + }, + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, + "node_modules/@stdlib/utils-native-class": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@stdlib/utils-native-class/-/utils-native-class-0.2.2.tgz", + "integrity": "sha512-cSn/FozbEpfR/FlJAoceQtZ8yUJFhZ8RFkbEsxW/7+H4o09yln3lBS0HSfUJISYNtpTNN/2/Fup88vmvwspvwA==", + "os": [ + "aix", + "darwin", + "freebsd", + "linux", + "macos", + "openbsd", + "sunos", + "win32", + "windows" + ], + "dependencies": { + "@stdlib/assert-has-own-property": "^0.2.1", + "@stdlib/assert-has-tostringtag-support": "^0.2.2", + "@stdlib/symbol-ctor": "^0.2.2" + }, + "engines": { + "node": ">=0.10.0", + "npm": ">2.7.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/stdlib" + } + }, "node_modules/@types/node": { "version": "22.10.2", "resolved": "https://registry.npmjs.org/@types/node/-/node-22.10.2.tgz", @@ -735,9 +1213,12 @@ "integrity": "sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==" }, "node_modules/lru-cache": { - "version": "10.4.3", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", - "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==" + "version": "11.0.2", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.0.2.tgz", + "integrity": "sha512-123qHRfJBmo2jXDbo/a5YOQrJoHF/GNQTLzQ5+IdK5pWpceK17yRc6ozlWd25FxvGKQbIUs91fDFkXmDHTKcyA==", + "engines": { + "node": "20 || >=22" + } }, "node_modules/minimatch": { "version": "9.0.5", @@ -853,6 +1334,11 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/path-scurry/node_modules/lru-cache": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", + "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==" + }, "node_modules/platform": { "version": "1.3.6", "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz", @@ -982,11 +1468,6 @@ "is-arrayish": "^0.3.1" } }, - "node_modules/string-segmenter": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/string-segmenter/-/string-segmenter-1.2.0.tgz", - "integrity": "sha512-xJoXUcoQaMLs3vyFbdHadegLg1apJb7aPivFhxbH88y/1vJvRiKs+tTdB3QvlYau7whEZqXpWBkhKV8llFx03g==" - }, "node_modules/string-width": { "version": "4.2.3", "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", diff --git a/package.json b/package.json index 81890c3..663806e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "semantic-chunking", - "version": "2.4.0", + "version": "2.4.1", "description": "Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).", "homepage": "https://www.equilllabs.com/projects/semantic-chunking", "repository": { @@ -41,9 +41,10 @@ "download-models": "node ./tools/download-models.js" }, "dependencies": { - "@huggingface/transformers": "^3.1.2", + "@huggingface/transformers": "^3.2.0", + "@stdlib/nlp-sentencize": "^0.2.2", "cli-progress": "^3.12.0", "fs": "^0.0.1-security", - "string-segmenter": "^1.2.0" + "lru-cache": "^11.0.2" } }