Skip to content

Commit

Permalink
fix: Reduce the cost of regexp exclusions
Browse files Browse the repository at this point in the history
Fix #1775
  • Loading branch information
Jason3S committed Sep 28, 2021
1 parent 3f9e8c6 commit a252ffe
Show file tree
Hide file tree
Showing 7 changed files with 10,042 additions and 125 deletions.
15 changes: 15 additions & 0 deletions packages/cspell-lib/src/spellCheckFile.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import {
} from './spellCheckFile';

const samples = Path.resolve(__dirname, '../samples');
const testFixtures = Path.resolve(__dirname, '../../../test-fixtures');
const isWindows = process.platform === 'win32';
const hasDriveLetter = /^[A-Z]:\\/;

Expand Down Expand Up @@ -175,6 +176,16 @@ describe('Validate Spell Checking Documents', () => {
expect(r).toEqual(oc(expected));
}
);
test.each`
uri | text | settings | options | expected
${tf('issues/issue-1775/hunspell/utf_info.hxx')} | ${''} | ${{}} | ${{}} | ${{ checked: true, errors: undefined }}
`(
'spellCheckFile fixtures $uri $settings $options',
async ({ uri, text, settings, options, expected }: TestSpellCheckFile) => {
const r = await spellCheckDocument(d(uri, text || undefined), options, settings);
expect(r).toEqual(oc(expected));
}
);
});

describe('Validate Uri assumptions', () => {
Expand Down Expand Up @@ -244,3 +255,7 @@ function s(file: string) {
// Force lowercase drive letter if windows
return isWindows ? fixDriveLetter(p) : p;
}

function tf(file: string) {
return Path.resolve(testFixtures, file);
}
16 changes: 13 additions & 3 deletions packages/cspell-lib/src/textValidator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -257,21 +257,31 @@ function mapTextOffsetsAgainstRanges(includeRanges: TextRange.MatchRange[]): (wo
rangePos -= 1;
}

const cur = includeRanges[rangePos];
if (wordEndPos <= cur.endPos && wordStartPos >= cur.startPos) {
return [textOffset];
}

while (wordStartPos < wordEndPos) {
while (includeRanges[rangePos] && includeRanges[rangePos].endPos <= wordStartPos) {
rangePos += 1;
}
if (!includeRanges[rangePos] || wordEndPos < includeRanges[rangePos].startPos) {
if (!includeRanges[rangePos]) {
break;
}
const { startPos, endPos } = includeRanges[rangePos];
if (wordEndPos < startPos) {
break;
}
const a = Math.max(wordStartPos, startPos);
const b = Math.min(wordEndPos, endPos);
parts.push({ offset: a, text: text.slice(a - offset, b - offset) });
if (a !== b) {
parts.push({ offset: a, text: text.slice(a - offset, b - offset) });
}
wordStartPos = b;
}

return parts.filter((wo) => !!wo.text);
return parts;
};

return mapper;
Expand Down
41 changes: 13 additions & 28 deletions packages/cspell-lib/src/util/TextRange.test.ts
Original file line number Diff line number Diff line change
@@ -1,33 +1,18 @@
import * as TextRange from './TextRange';

const { makeSortedMatchRangeArray } = TextRange.__testing__;

describe('Util Text', () => {
test('tests unionRanges', () => {
const result1 = TextRange.unionRanges([]);
expect(result1).toEqual([]);
const result2 = TextRange.unionRanges([{ startPos: 0, endPos: 10 }]);
expect(result2).toEqual([{ startPos: 0, endPos: 10 }]);
const result3 = TextRange.unionRanges([
{ startPos: 0, endPos: 10 },
{ startPos: 0, endPos: 10 },
]);
expect(result3).toEqual([{ startPos: 0, endPos: 10 }]);
const result4 = TextRange.unionRanges([
{ startPos: 5, endPos: 15 },
{ startPos: 0, endPos: 10 },
]);
expect(result4).toEqual([{ startPos: 0, endPos: 15 }]);
const result5 = TextRange.unionRanges([
{ startPos: 11, endPos: 15 },
{ startPos: 0, endPos: 10 },
]);
expect(result5).toEqual([
{ startPos: 0, endPos: 10 },
{ startPos: 11, endPos: 15 },
]);
const result6 = TextRange.unionRanges([
{ startPos: 10, endPos: 15 },
{ startPos: 0, endPos: 10 },
]);
expect(result6).toEqual([{ startPos: 0, endPos: 15 }]);
test.each`
ranges | expected
${[]} | ${[]}
${[{ startPos: 0, endPos: 10 }]} | ${[{ startPos: 0, endPos: 10 }]}
${[{ startPos: 0, endPos: 10 }, { startPos: 0, endPos: 10 }]} | ${[{ startPos: 0, endPos: 10 }]}
${[{ startPos: 5, endPos: 15 }, { startPos: 0, endPos: 10 }]} | ${[{ startPos: 0, endPos: 15 }]}
${[{ startPos: 11, endPos: 15 }, { startPos: 0, endPos: 10 }]} | ${[{ startPos: 0, endPos: 10 }, { startPos: 11, endPos: 15 }]}
${[{ startPos: 10, endPos: 15 }, { startPos: 0, endPos: 10 }]} | ${[{ startPos: 0, endPos: 15 }]}
`('unionRanges $ranges', ({ ranges, expected }) => {
const r = TextRange.unionRanges(ranges);
expect(r).toEqual(makeSortedMatchRangeArray(expected));
});
});
187 changes: 93 additions & 94 deletions packages/cspell-lib/src/util/TextRange.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,139 +14,111 @@ export interface MatchRangeOptionalText extends MatchRange {
text?: string;
}

function toMatchRangeWithText(m: RegExpMatchArray): MatchRangeWithText {
const index = m.index || 0;
const _text = m[0];
return {
startPos: index,
endPos: index + _text.length,
text: _text,
};
}

export function findMatchingRanges(pattern: string | RegExp, text: string): MatchRangeOptionalText[] {
if (pattern === '.*') {
return [{ startPos: 0, endPos: text.length }];
}
const ranges: MatchRangeWithText[] = [];

try {
const regex = pattern instanceof RegExp ? new RegExp(pattern) : Text.stringToRegExp(pattern, 'gim', 'g');
if (regex) {
for (const found of GS.sequenceFromRegExpMatch(regex, text)) {
ranges.push({
startPos: found.index,
endPos: found.index + found[0].length,
text: found[0],
});
if (!regex.global) {
break;
}
if (!regex.global) {
const m = text.match(regex);
if (!m) return [];
return [toMatchRangeWithText(m)];
}
return [...text.matchAll(regex)].map(toMatchRangeWithText);
}
} catch (e) {
// ignore any malformed regexp from the user.
// console.log(e.message);
}

return ranges;
return [];
}

function fnSortRanges(a: MatchRange, b: MatchRange) {
function compareRanges(a: MatchRange, b: MatchRange) {
return a.startPos - b.startPos || a.endPos - b.endPos;
}

export function unionRanges(ranges: MatchRange[]): MatchRange[] {
const sortedRanges = ranges.sort(fnSortRanges);
const result = sortedRanges.slice(1).reduce((acc: MatchRange[], next) => {
const last = acc[acc.length - 1];
if (next.startPos > last.endPos) {
acc.push(next);
} else if (next.endPos > last.endPos) {
acc[acc.length - 1] = {
startPos: last.startPos,
endPos: Math.max(last.endPos, next.endPos),
};
}
return acc;
}, sortedRanges.slice(0, 1));

return result;
}

export function findMatchingRangesForPatterns(patterns: (string | RegExp)[], text: string): MatchRange[] {
const matchedPatterns = GS.genSequence(patterns).concatMap((pattern) => findMatchingRanges(pattern, text));
return unionRanges(matchedPatterns.toArray());
return makeSortedMatchRangeArray([..._unionRanges(ranges)]);
}

/**
* Exclude range b from a
*/
function excludeRange(a: MatchRange, b: MatchRange) {
// non-intersection
if (b.endPos <= a.startPos || b.startPos >= a.endPos) {
return [a];
}
function* _unionRanges(ranges: MatchRange[]): Generator<MatchRange> {
const sortedRanges = sortMatchRangeArray(ranges);

// fully excluded
if (b.startPos <= a.startPos && b.endPos >= a.endPos) {
return [];
}
if (!sortedRanges.length) return;

const result: MatchRange[] = [];
let { startPos, endPos } = sortedRanges[0];

if (a.startPos < b.startPos) {
result.push({ startPos: a.startPos, endPos: b.startPos });
for (const r of ranges) {
if (r.startPos > endPos) {
yield { startPos, endPos };
startPos = r.startPos;
endPos = r.endPos;
continue;
}
endPos = Math.max(endPos, r.endPos);
}

if (a.endPos > b.endPos) {
result.push({ startPos: b.endPos, endPos: a.endPos });
if (startPos < endPos) {
yield { startPos, endPos };
}
return result;
}

export function findMatchingRangesForPatterns(patterns: (string | RegExp)[], text: string): MatchRange[] {
const matchedPatterns = GS.genSequence(patterns).concatMap((pattern) => findMatchingRanges(pattern, text));
return unionRanges(matchedPatterns.toArray());
}

/**
* Create a new set of positions that have the excluded position ranges removed.
*/
export function excludeRanges(includeRanges: MatchRange[], excludeRanges: MatchRange[]): MatchRange[] {
type TInclude = 'i';
type TExclude = 'e';
return [..._excludeRanges(sortMatchRangeArray(includeRanges), sortMatchRangeArray(excludeRanges))];
}

interface MatchRangeWithType extends MatchRange {
type: TInclude | TExclude;
}
interface Result {
ranges: MatchRange[];
lastExclude?: MatchRange;
function* _excludeRanges(
includeRanges: SortedMatchRangeArray,
excludeRanges: SortedMatchRangeArray
): Generator<MatchRange, undefined, undefined> {
if (!includeRanges.length) return;
if (!excludeRanges.length) {
yield* includeRanges;
return;
}
const tInclude: TInclude = 'i';
const tExclude: TExclude = 'e';

const sortedRanges: MatchRangeWithType[] = [
...includeRanges.map((r) => ({ ...r, type: tInclude })),
...excludeRanges.map((r) => ({ ...r, type: tExclude })),
].sort(fnSortRanges);

const result = sortedRanges.reduce(
(acc: Result, range: MatchRangeWithType) => {
const { ranges, lastExclude } = acc;
const lastInclude = ranges.length ? ranges[ranges.length - 1] : undefined;
if (range.type === tExclude) {
if (!lastInclude || lastInclude.endPos <= range.startPos) {
// if the exclude is beyond the current include, save it for later
return { ranges, lastExclude: range };
}
// we need to split the current include.
return {
ranges: [...ranges.slice(0, -1), ...excludeRange(ranges[ranges.length - 1], range)],
lastExclude: range,
};
}

// The range is an include, we need to check it against the last exclude
if (!lastExclude) {
return { ranges: ranges.concat([range]) };
let exIndex = 0;
const limit = excludeRanges.length;

for (const incRange of includeRanges) {
const endPos = incRange.endPos;
let startPos = incRange.startPos;

for (; exIndex < limit; ++exIndex) {
const ex = excludeRanges[exIndex];
if (ex.startPos >= endPos) break;
if (ex.startPos > startPos) {
yield { startPos, endPos: ex.startPos };
}
const nextExclude = lastExclude.endPos > range.endPos ? lastExclude : undefined;
return {
ranges: [...ranges, ...excludeRange(range, lastExclude)],
lastExclude: nextExclude,
};
},
{ ranges: [] }
);
startPos = ex.endPos;
if (startPos >= endPos) break;
}

return result.ranges;
if (startPos < endPos) {
yield { startPos, endPos };
}
}
}

export function extractRangeText(text: string, ranges: MatchRange[]): MatchRangeWithText[] {
Expand All @@ -156,3 +128,30 @@ export function extractRangeText(text: string, ranges: MatchRange[]): MatchRange
text: text.slice(startPos, endPos),
}));
}

const SymSortedMatchRangeArray = Symbol('SortedMatchRangeArray');

interface SortedMatchRangeArray extends Array<MatchRange> {
[SymSortedMatchRangeArray]: true;
}

function sortMatchRangeArray(values: MatchRange[]): SortedMatchRangeArray {
if (isSortedMatchRangeArray(values)) return values;

return makeSortedMatchRangeArray(values.sort(compareRanges));
}

function isSortedMatchRangeArray(a: MatchRange[] | SortedMatchRangeArray): a is SortedMatchRangeArray {
return (<SortedMatchRangeArray>a)[SymSortedMatchRangeArray] === true;
}

function makeSortedMatchRangeArray(sortedValues: MatchRange[]): SortedMatchRangeArray {
const sorted: SortedMatchRangeArray = sortedValues as SortedMatchRangeArray;
sorted[SymSortedMatchRangeArray] = true;
Object.freeze(sorted);
return sorted;
}

export const __testing__ = {
makeSortedMatchRangeArray,
};
21 changes: 21 additions & 0 deletions test-fixtures/issues/issue-1775/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Performance bug

[Performance - took to long to spell check file - `utf_info.hxx` · Issue #1775 · streetsidesoftware/cspell](https://github.com/streetsidesoftware/cspell/issues/1775)

## Info

It took nearly 30 seconds to spell check a single file:
[nuspell/utf_info.hxx at master · nuspell/nuspell](https://github.com/nuspell/nuspell/blob/master/external/hunspell/hunspell/utf_info.hxx)

**Kind of Issue**

- [X] runtime - command-line tools

**Which Tool or library**

- [X] cspell -- the command-line spelling tool
- [X] cspell-lib -- library that does the actual spell checking.

**Which Version**

Version: 5.10.1
5 changes: 5 additions & 0 deletions test-fixtures/issues/issue-1775/cspell.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"words": [
"nuspell"
]
}
Loading

0 comments on commit a252ffe

Please sign in to comment.