From 264991115dd58600c6e1d5c7a294d1c1cc4af9ac Mon Sep 17 00:00:00 2001 From: Bjorn Lu Date: Wed, 3 Apr 2024 21:02:34 +0800 Subject: [PATCH] Refactor to improve performance w/ weak map, hoisted regex Closes GH-13. Reviewed-by: Titus Wormer --- lib/core.js | 47 ++++++++++++++++++++++++++++++-------- lib/util/to-decimal.js | 4 +++- lib/util/to-hexadecimal.js | 4 +++- lib/util/to-named.js | 4 +++- 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/lib/core.js b/lib/core.js index d51fec5..69e9843 100644 --- a/lib/core.js +++ b/lib/core.js @@ -1,6 +1,6 @@ /** * @typedef CoreOptions - * @property {Array<string>} [subset=[]] + * @property {ReadonlyArray<string>} [subset=[]] * Whether to only escape the given subset of characters. * @property {boolean} [escapeOnly=false] * Whether to only escape possibly dangerous characters. @@ -13,6 +13,16 @@ * @typedef {CoreOptions & FormatOptions & import('./util/format-smart.js').FormatSmartOptions} CoreWithFormatOptions */ +const defaultSubsetRegex = /["&'<>`]/g +const surrogatePairsRegex = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g +const controlCharactersRegex = + // eslint-disable-next-line no-control-regex, unicorn/no-hex-escape + /[\x01-\t\v\f\x0E-\x1F\x7F\x81\x8D\x8F\x90\x9D\xA0-\uFFFF]/g +const regexEscapeRegex = /[|\\{}()[\]^$+*?.]/g + +/** @type {WeakMap<ReadonlyArray<string>, RegExp>} */ +const subsetToRegexCache = new WeakMap() + /** * Encode certain characters in `value`. * @@ -22,7 +32,9 @@ */ export function core(value, options) { value = value.replace( - options.subset ? charactersToExpression(options.subset) : /["&'<>`]/g, + options.subset + ? charactersToExpressionCached(options.subset) + : defaultSubsetRegex, basic ) @@ -33,14 +45,10 @@ export function core(value, options) { return ( value // Surrogate pairs. - .replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g, surrogate) + .replace(surrogatePairsRegex, surrogate) // BMP control characters (C0 except for LF, CR, SP; DEL; and some more // non-ASCII ones). 
- .replace( - // eslint-disable-next-line no-control-regex, unicorn/no-hex-escape - /[\x01-\t\v\f\x0E-\x1F\x7F\x81\x8D\x8F\x90\x9D\xA0-\uFFFF]/g, - basic - ) + .replace(controlCharactersRegex, basic) ) /** @@ -74,7 +82,26 @@ export function core(value, options) { } /** - * @param {Array<string>} subset + * A wrapper function that caches the result of `charactersToExpression` with a WeakMap. + * This can improve performance when tooling calls `charactersToExpression` repeatedly + * with the same subset. + * + * @param {ReadonlyArray<string>} subset + * @returns {RegExp} + */ +function charactersToExpressionCached(subset) { + let cached = subsetToRegexCache.get(subset) + + if (!cached) { + cached = charactersToExpression(subset) + subsetToRegexCache.set(subset, cached) + } + + return cached +} + +/** + * @param {ReadonlyArray<string>} subset * @returns {RegExp} */ function charactersToExpression(subset) { @@ -83,7 +110,7 @@ function charactersToExpression(subset) { let index = -1 while (++index < subset.length) { - groups.push(subset[index].replace(/[|\\{}()[\]^$+*?.]/g, '\\$&')) + groups.push(subset[index].replace(regexEscapeRegex, '\\$&')) } return new RegExp('(?:' + groups.join('|') + ')', 'g') diff --git a/lib/util/to-decimal.js b/lib/util/to-decimal.js index b08e119..f0f706c 100644 --- a/lib/util/to-decimal.js +++ b/lib/util/to-decimal.js @@ -1,3 +1,5 @@ +const decimalRegex = /\d/ + /** * Configurable ways to encode characters as decimal references. * @@ -8,7 +10,7 @@ */ export function toDecimal(code, next, omit) { const value = '&#' + String(code) - return omit && next && !/\d/.test(String.fromCharCode(next)) + return omit && next && !decimalRegex.test(String.fromCharCode(next)) ? 
value : value + ';' } diff --git a/lib/util/to-hexadecimal.js b/lib/util/to-hexadecimal.js index a354f30..0df6c68 100644 --- a/lib/util/to-hexadecimal.js +++ b/lib/util/to-hexadecimal.js @@ -1,3 +1,5 @@ +const hexadecimalRegex = /[\dA-Fa-f]/ + /** * Configurable ways to encode characters as hexadecimal references. * @@ -8,7 +10,7 @@ */ export function toHexadecimal(code, next, omit) { const value = '&#x' + code.toString(16).toUpperCase() - return omit && next && !/[\dA-Fa-f]/.test(String.fromCharCode(next)) + return omit && next && !hexadecimalRegex.test(String.fromCharCode(next)) ? value : value + ';' } diff --git a/lib/util/to-named.js b/lib/util/to-named.js index 51a4b44..01915e2 100644 --- a/lib/util/to-named.js +++ b/lib/util/to-named.js @@ -20,6 +20,8 @@ for (key in characterEntitiesHtml4) { } } +const notAlphanumericRegex = /[^\dA-Za-z]/ + /** * Configurable ways to encode characters as named references. * @@ -43,7 +45,7 @@ export function toNamed(code, next, omit, attribute) { (!attribute || (next && next !== 61 /* `=` */ && - /[^\da-z]/i.test(String.fromCharCode(next)))) + notAlphanumericRegex.test(String.fromCharCode(next)))) ) { return value }