fix: Reduce the cost of regexp exclusions

Fix #1775
streetsidesoftware · Sep 28, 2021 · a252ffe · a252ffe
1 parent 3f9e8c6
commit a252ffe
Show file tree

Hide file tree

Showing 7 changed files with 10,042 additions and 125 deletions.
diff --git a/packages/cspell-lib/src/spellCheckFile.test.ts b/packages/cspell-lib/src/spellCheckFile.test.ts
@@ -14,6 +14,7 @@ import {
 } from './spellCheckFile';
 
 const samples = Path.resolve(__dirname, '../samples');
+const testFixtures = Path.resolve(__dirname, '../../../test-fixtures');
 const isWindows = process.platform === 'win32';
 const hasDriveLetter = /^[A-Z]:\\/;
 
@@ -175,6 +176,16 @@ describe('Validate Spell Checking Documents', () => {
             expect(r).toEqual(oc(expected));
         }
     );
+    test.each`
+        uri                                              | text  | settings | options | expected
+        ${tf('issues/issue-1775/hunspell/utf_info.hxx')} | ${''} | ${{}}    | ${{}}   | ${{ checked: true, errors: undefined }}
+    `(
+        'spellCheckFile fixtures $uri $settings $options',
+        async ({ uri, text, settings, options, expected }: TestSpellCheckFile) => {
+            const r = await spellCheckDocument(d(uri, text || undefined), options, settings);
+            expect(r).toEqual(oc(expected));
+        }
+    );
 });
 
 describe('Validate Uri assumptions', () => {
@@ -244,3 +255,7 @@ function s(file: string) {
     // Force lowercase drive letter if windows
     return isWindows ? fixDriveLetter(p) : p;
 }
+
+function tf(file: string) {
+    return Path.resolve(testFixtures, file);
+}
diff --git a/packages/cspell-lib/src/textValidator.ts b/packages/cspell-lib/src/textValidator.ts
@@ -257,21 +257,31 @@ function mapTextOffsetsAgainstRanges(includeRanges: TextRange.MatchRange[]): (wo
             rangePos -= 1;
         }
 
+        const cur = includeRanges[rangePos];
+        if (wordEndPos <= cur.endPos && wordStartPos >= cur.startPos) {
+            return [textOffset];
+        }
+
         while (wordStartPos < wordEndPos) {
             while (includeRanges[rangePos] && includeRanges[rangePos].endPos <= wordStartPos) {
                 rangePos += 1;
             }
-            if (!includeRanges[rangePos] || wordEndPos < includeRanges[rangePos].startPos) {
+            if (!includeRanges[rangePos]) {
                 break;
             }
             const { startPos, endPos } = includeRanges[rangePos];
+            if (wordEndPos < startPos) {
+                break;
+            }
             const a = Math.max(wordStartPos, startPos);
             const b = Math.min(wordEndPos, endPos);
-            parts.push({ offset: a, text: text.slice(a - offset, b - offset) });
+            if (a !== b) {
+                parts.push({ offset: a, text: text.slice(a - offset, b - offset) });
+            }
             wordStartPos = b;
         }
 
-        return parts.filter((wo) => !!wo.text);
+        return parts;
     };
 
     return mapper;

diff --git a/packages/cspell-lib/src/util/TextRange.test.ts b/packages/cspell-lib/src/util/TextRange.test.ts
@@ -1,33 +1,18 @@
 import * as TextRange from './TextRange';
 
+const { makeSortedMatchRangeArray } = TextRange.__testing__;
+
 describe('Util Text', () => {
-    test('tests unionRanges', () => {
-        const result1 = TextRange.unionRanges([]);
-        expect(result1).toEqual([]);
-        const result2 = TextRange.unionRanges([{ startPos: 0, endPos: 10 }]);
-        expect(result2).toEqual([{ startPos: 0, endPos: 10 }]);
-        const result3 = TextRange.unionRanges([
-            { startPos: 0, endPos: 10 },
-            { startPos: 0, endPos: 10 },
-        ]);
-        expect(result3).toEqual([{ startPos: 0, endPos: 10 }]);
-        const result4 = TextRange.unionRanges([
-            { startPos: 5, endPos: 15 },
-            { startPos: 0, endPos: 10 },
-        ]);
-        expect(result4).toEqual([{ startPos: 0, endPos: 15 }]);
-        const result5 = TextRange.unionRanges([
-            { startPos: 11, endPos: 15 },
-            { startPos: 0, endPos: 10 },
-        ]);
-        expect(result5).toEqual([
-            { startPos: 0, endPos: 10 },
-            { startPos: 11, endPos: 15 },
-        ]);
-        const result6 = TextRange.unionRanges([
-            { startPos: 10, endPos: 15 },
-            { startPos: 0, endPos: 10 },
-        ]);
-        expect(result6).toEqual([{ startPos: 0, endPos: 15 }]);
+    test.each`
+        ranges                                                         | expected
+        ${[]}                                                          | ${[]}
+        ${[{ startPos: 0, endPos: 10 }]}                               | ${[{ startPos: 0, endPos: 10 }]}
+        ${[{ startPos: 0, endPos: 10 }, { startPos: 0, endPos: 10 }]}  | ${[{ startPos: 0, endPos: 10 }]}
+        ${[{ startPos: 5, endPos: 15 }, { startPos: 0, endPos: 10 }]}  | ${[{ startPos: 0, endPos: 15 }]}
+        ${[{ startPos: 11, endPos: 15 }, { startPos: 0, endPos: 10 }]} | ${[{ startPos: 0, endPos: 10 }, { startPos: 11, endPos: 15 }]}
+        ${[{ startPos: 10, endPos: 15 }, { startPos: 0, endPos: 10 }]} | ${[{ startPos: 0, endPos: 15 }]}
+    `('unionRanges $ranges', ({ ranges, expected }) => {
+        const r = TextRange.unionRanges(ranges);
+        expect(r).toEqual(makeSortedMatchRangeArray(expected));
     });
 });
diff --git a/packages/cspell-lib/src/util/TextRange.ts b/packages/cspell-lib/src/util/TextRange.ts
@@ -14,139 +14,111 @@ export interface MatchRangeOptionalText extends MatchRange {
     text?: string;
 }
 
+function toMatchRangeWithText(m: RegExpMatchArray): MatchRangeWithText {
+    const index = m.index || 0;
+    const _text = m[0];
+    return {
+        startPos: index,
+        endPos: index + _text.length,
+        text: _text,
+    };
+}
+
 export function findMatchingRanges(pattern: string | RegExp, text: string): MatchRangeOptionalText[] {
     if (pattern === '.*') {
         return [{ startPos: 0, endPos: text.length }];
     }
-    const ranges: MatchRangeWithText[] = [];
 
     try {
         const regex = pattern instanceof RegExp ? new RegExp(pattern) : Text.stringToRegExp(pattern, 'gim', 'g');
         if (regex) {
-            for (const found of GS.sequenceFromRegExpMatch(regex, text)) {
-                ranges.push({
-                    startPos: found.index,
-                    endPos: found.index + found[0].length,
-                    text: found[0],
-                });
-                if (!regex.global) {
-                    break;
-                }
+            if (!regex.global) {
+                const m = text.match(regex);
+                if (!m) return [];
+                return [toMatchRangeWithText(m)];
             }
+            return [...text.matchAll(regex)].map(toMatchRangeWithText);
         }
     } catch (e) {
         // ignore any malformed regexp from the user.
         // console.log(e.message);
     }
 
-    return ranges;
+    return [];
 }
 
-function fnSortRanges(a: MatchRange, b: MatchRange) {
+function compareRanges(a: MatchRange, b: MatchRange) {
     return a.startPos - b.startPos || a.endPos - b.endPos;
 }
 
 export function unionRanges(ranges: MatchRange[]): MatchRange[] {
-    const sortedRanges = ranges.sort(fnSortRanges);
-    const result = sortedRanges.slice(1).reduce((acc: MatchRange[], next) => {
-        const last = acc[acc.length - 1];
-        if (next.startPos > last.endPos) {
-            acc.push(next);
-        } else if (next.endPos > last.endPos) {
-            acc[acc.length - 1] = {
-                startPos: last.startPos,
-                endPos: Math.max(last.endPos, next.endPos),
-            };
-        }
-        return acc;
-    }, sortedRanges.slice(0, 1));
-
-    return result;
-}
-
-export function findMatchingRangesForPatterns(patterns: (string | RegExp)[], text: string): MatchRange[] {
-    const matchedPatterns = GS.genSequence(patterns).concatMap((pattern) => findMatchingRanges(pattern, text));
-    return unionRanges(matchedPatterns.toArray());
+    return makeSortedMatchRangeArray([..._unionRanges(ranges)]);
 }
 
-/**
- * Exclude range b from a
- */
-function excludeRange(a: MatchRange, b: MatchRange) {
-    // non-intersection
-    if (b.endPos <= a.startPos || b.startPos >= a.endPos) {
-        return [a];
-    }
+function* _unionRanges(ranges: MatchRange[]): Generator<MatchRange> {
+    const sortedRanges = sortMatchRangeArray(ranges);
 
-    // fully excluded
-    if (b.startPos <= a.startPos && b.endPos >= a.endPos) {
-        return [];
-    }
+    if (!sortedRanges.length) return;
 
-    const result: MatchRange[] = [];
+    let { startPos, endPos } = sortedRanges[0];
 
-    if (a.startPos < b.startPos) {
-        result.push({ startPos: a.startPos, endPos: b.startPos });
+    for (const r of ranges) {
+        if (r.startPos > endPos) {
+            yield { startPos, endPos };
+            startPos = r.startPos;
+            endPos = r.endPos;
+            continue;
+        }
+        endPos = Math.max(endPos, r.endPos);
     }
-
-    if (a.endPos > b.endPos) {
-        result.push({ startPos: b.endPos, endPos: a.endPos });
+    if (startPos < endPos) {
+        yield { startPos, endPos };
     }
-    return result;
+}
+
+export function findMatchingRangesForPatterns(patterns: (string | RegExp)[], text: string): MatchRange[] {
+    const matchedPatterns = GS.genSequence(patterns).concatMap((pattern) => findMatchingRanges(pattern, text));
+    return unionRanges(matchedPatterns.toArray());
 }
 
 /**
  * Create a new set of positions that have the excluded position ranges removed.
  */
 export function excludeRanges(includeRanges: MatchRange[], excludeRanges: MatchRange[]): MatchRange[] {
-    type TInclude = 'i';
-    type TExclude = 'e';
+    return [..._excludeRanges(sortMatchRangeArray(includeRanges), sortMatchRangeArray(excludeRanges))];
+}
 
-    interface MatchRangeWithType extends MatchRange {
-        type: TInclude | TExclude;
-    }
-    interface Result {
-        ranges: MatchRange[];
-        lastExclude?: MatchRange;
+function* _excludeRanges(
+    includeRanges: SortedMatchRangeArray,
+    excludeRanges: SortedMatchRangeArray
+): Generator<MatchRange, undefined, undefined> {
+    if (!includeRanges.length) return;
+    if (!excludeRanges.length) {
+        yield* includeRanges;
+        return;
     }
-    const tInclude: TInclude = 'i';
-    const tExclude: TExclude = 'e';
-
-    const sortedRanges: MatchRangeWithType[] = [
-        ...includeRanges.map((r) => ({ ...r, type: tInclude })),
-        ...excludeRanges.map((r) => ({ ...r, type: tExclude })),
-    ].sort(fnSortRanges);
-
-    const result = sortedRanges.reduce(
-        (acc: Result, range: MatchRangeWithType) => {
-            const { ranges, lastExclude } = acc;
-            const lastInclude = ranges.length ? ranges[ranges.length - 1] : undefined;
-            if (range.type === tExclude) {
-                if (!lastInclude || lastInclude.endPos <= range.startPos) {
-                    // if the exclude is beyond the current include, save it for later
-                    return { ranges, lastExclude: range };
-                }
-                // we need to split the current include.
-                return {
-                    ranges: [...ranges.slice(0, -1), ...excludeRange(ranges[ranges.length - 1], range)],
-                    lastExclude: range,
-                };
-            }
 
-            // The range is an include, we need to check it against the last exclude
-            if (!lastExclude) {
-                return { ranges: ranges.concat([range]) };
+    let exIndex = 0;
+    const limit = excludeRanges.length;
+
+    for (const incRange of includeRanges) {
+        const endPos = incRange.endPos;
+        let startPos = incRange.startPos;
+
+        for (; exIndex < limit; ++exIndex) {
+            const ex = excludeRanges[exIndex];
+            if (ex.startPos >= endPos) break;
+            if (ex.startPos > startPos) {
+                yield { startPos, endPos: ex.startPos };
             }
-            const nextExclude = lastExclude.endPos > range.endPos ? lastExclude : undefined;
-            return {
-                ranges: [...ranges, ...excludeRange(range, lastExclude)],
-                lastExclude: nextExclude,
-            };
-        },
-        { ranges: [] }
-    );
+            startPos = ex.endPos;
+            if (startPos >= endPos) break;
+        }
 
-    return result.ranges;
+        if (startPos < endPos) {
+            yield { startPos, endPos };
+        }
+    }
 }
 
 export function extractRangeText(text: string, ranges: MatchRange[]): MatchRangeWithText[] {
@@ -156,3 +128,30 @@ export function extractRangeText(text: string, ranges: MatchRange[]): MatchRange
         text: text.slice(startPos, endPos),
     }));
 }
+
+const SymSortedMatchRangeArray = Symbol('SortedMatchRangeArray');
+
+interface SortedMatchRangeArray extends Array<MatchRange> {
+    [SymSortedMatchRangeArray]: true;
+}
+
+function sortMatchRangeArray(values: MatchRange[]): SortedMatchRangeArray {
+    if (isSortedMatchRangeArray(values)) return values;
+
+    return makeSortedMatchRangeArray(values.sort(compareRanges));
+}
+
+function isSortedMatchRangeArray(a: MatchRange[] | SortedMatchRangeArray): a is SortedMatchRangeArray {
+    return (<SortedMatchRangeArray>a)[SymSortedMatchRangeArray] === true;
+}
+
+function makeSortedMatchRangeArray(sortedValues: MatchRange[]): SortedMatchRangeArray {
+    const sorted: SortedMatchRangeArray = sortedValues as SortedMatchRangeArray;
+    sorted[SymSortedMatchRangeArray] = true;
+    Object.freeze(sorted);
+    return sorted;
+}
+
+export const __testing__ = {
+    makeSortedMatchRangeArray,
+};
diff --git a/test-fixtures/issues/issue-1775/README.md b/test-fixtures/issues/issue-1775/README.md
@@ -0,0 +1,21 @@
+# Performance bug
+
+[Performance - took to long to spell check file - `utf_info.hxx` · Issue #1775 · streetsidesoftware/cspell](https://github.com/streetsidesoftware/cspell/issues/1775)
+
+## Info
+
+It took nearly 30 seconds to spell check a single file:
+[nuspell/utf_info.hxx at master · nuspell/nuspell](https://github.com/nuspell/nuspell/blob/master/external/hunspell/hunspell/utf_info.hxx)
+
+**Kind of Issue**
+
+- [X] runtime - command-line tools
+
+**Which Tool or library**
+
+- [X] cspell -- the command-line spelling tool
+- [X] cspell-lib -- library that does the actual spell checking.
+
+**Which Version**
+
+Version: 5.10.1
diff --git a/test-fixtures/issues/issue-1775/cspell.json b/test-fixtures/issues/issue-1775/cspell.json
@@ -0,0 +1,5 @@
+{
+    "words": [
+        "nuspell"
+    ]
+}