From e8d03bcd0ae5167b43195052a31847fe21271153 Mon Sep 17 00:00:00 2001 From: Joe Hildebrand Date: Fri, 3 Jan 2025 02:24:18 -0700 Subject: [PATCH 1/5] Correctly handle unicode ranges This should correctly generate UTF-16 ranges for Peggy from any valid ABNF range. Fixes #25. --- .gitignore | 1 + bin/abnf_gen.js | 2 +- lib/ast.js | 119 ++++++++++++++++++++---- package.json | 18 ++-- pnpm-lock.yaml | 237 ++++++++++++++++++++++++------------------------ test/ast.ava.js | 26 ++++++ 6 files changed, 255 insertions(+), 148 deletions(-) diff --git a/.gitignore b/.gitignore index 73a8d49..0aacdd4 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ examples/*.peggy examples/*.pest node_modules t.abnf +.cert/ diff --git a/bin/abnf_gen.js b/bin/abnf_gen.js index 70750b4..269a3d6 100755 --- a/bin/abnf_gen.js +++ b/bin/abnf_gen.js @@ -112,6 +112,6 @@ program }) .parseAsync() .catch(er => { - console.error(er.message); + console.error(er); process.exit(1); }); diff --git a/lib/ast.js b/lib/ast.js index 8ba5565..b9efbf3 100644 --- a/lib/ast.js +++ b/lib/ast.js @@ -42,8 +42,8 @@ function slug(s) { return s.replace(/-/g, "_"); } -function str(s) { - return `"${s.replace(/[\\"\x00-\x19\x7f-\xff]/g, c => `\\${{ +function str(s, opts) { + s = `"${s.replace(/[\\"\x00-\x19\x7f-\xff]/g, c => `\\${{ "\r": "r", "\n": "n", '"': '"', @@ -51,6 +51,19 @@ function str(s) { "\v": "x0B", "\\": "\\", }[c] || `x${c.charCodeAt(0).toString(16).padStart(2, "0")}`}`)}"`; + + if (opts?.format === "peggy") { + s = s.replace( + /[\u0100-\uffff]/, + c => `\\u${c.codePointAt(0).toString(16).padStart(4, 0)}` + ); + s = s.replace( + /[\u{10000}-\u{10ffff}]/u, + c => `\\u{${c.codePointAt(0).toString(16)}}` + ); + } + + return s; } function fromArray(opts, a, joiner, needed, parent) { @@ -132,7 +145,7 @@ export class CaseInsensitiveString extends Base { } toFormat(opts) { - const s = str(this.str); + const s = str(this.str, opts); if (this.str.match(/[a-z]/i)) { switch (opts.format) { case "peggy": @@ -155,8 +168,8 @@ export class CaseSensitiveString extends Base { this.base = base; } - toFormat(_opts) { - return str(this.str); + toFormat(opts) { + return str(this.str, opts); } } @@ -299,21 +312,92 @@ export class Range extends Base { } static create(base, first, last, loc, utf16 = true) { - if (utf16 && (first <= 0xffff) && (last === 0x10ffff)) { - // Special case "all high Unicode" since it shows up a lot - // This should be generalized. + if (first > last) { + throw new Error(`Range out of order ${first.toString(16)}-${last.toString(16)}`); + } + if (first === last) { + return new CaseSensitiveString(String.fromCodePoint(first), base, loc); + } + if (utf16) { + if (last < 0xd800) { + return new Range(base, first, last, loc); + } + if ((first > 0xd7ff) && (first < 0xe000) && (last < 0xe000)) { + throw new Error(`Range consists of all surrogates ${first.toString(16)}-${last.toString(16)}`); + } + + // Remove the range 0xd800-0xdfff. These possible ranges remain: + // - first-0xd7ff + // - 0xe000-0xffff + // - 0x10000-last const alts = []; + if (first < 0xd800) { - alts.push(new Range(base, first, 0xd7ff, loc)); - alts.push(new Range(base, 0xe000, 0xffff, loc)); - } else { - alts.push(new Range(base, first, 0xffff, loc)); + alts.push(Range.create(base, first, 0xd7ff, loc, false)); } - alts.push(new Concatenation([ - new Range(base, 0xd800, 0xdbff, loc), - new Range(base, 0xdc00, 0xdfff, loc), - ], loc)); + if (last < 0x10000) { + alts.push( + Range.create(base, Math.max(0xe000, first), last, loc, false) + ); + } else { + if (first < 0x10000) { + alts.push( + Range.create(base, Math.max(0xe000, first), 0xffff, loc, false) + ); + first = 0x10000; + } + // This code follows the logic in regenerate: + // https://github.com/mathiasbynens/regenerate/blob/11567339f40fd262435934d544885bc047cb4220/regenerate.js#L996 + // I didn't use regenerate directly because: + // a) I only needed a small part of it + // b) Regenerate will only generate a string, which I then would have + // to parse to get the info I needed out. + // I believe this use is within the spirit of the MIT license. + const firstH = Math.floor((first - 0x10000) / 0x400) + 0xd800; + const firstL = ((first - 0x10000) % 0x400) + 0xdc00; + const lastH = Math.floor((last - 0x10000) / 0x400) + 0xd800; + const lastL = ((last - 0x10000) % 0x400) + 0xdc00; + let complete = false; + if ( + (firstH === lastH) + || ((firstL === 0xdc00) && (lastL === 0xdfff)) + ) { + alts.push(new Concatenation([ + Range.create(base, firstH, lastH, loc, false), + Range.create(base, firstL, lastL, loc, false), + ], loc)); + complete = true; + } else { + alts.push(new Concatenation([ + Range.create(base, firstH, firstH, loc, false), + Range.create(base, firstL, 0xdfff, loc, false), + ], loc)); + } + if (!complete && (firstH + 1 < lastH)) { + if (lastL === 0xdfff) { + alts.push(new Concatenation([ + Range.create(base, firstH + 1, lastH + 1, loc, false), + Range.create(base, 0xdc00, lastL, loc, false), + ], loc)); + complete = true; + } else { + alts.push(new Concatenation([ + Range.create(base, firstH + 1, lastH - 1, loc, false), + Range.create(base, 0xdc00, 0xdfff, loc, false), + ], loc)); + } + } + if (!complete) { + alts.push(new Concatenation([ + Range.create(base, lastH, lastH, loc, false), + Range.create(base, 0xdc00, lastL, loc, false), + ], loc)); + } + } + if (alts.length === 1) { + return alts[0]; + } return new Alternation(alts, loc); } return new Range(base, first, last, loc); @@ -339,6 +423,9 @@ export class Range extends Base { const { rangeBefore, rangeAfter, rangeSep } = delims(opts); const first = Range.escape(opts, this.first); const last = Range.escape(opts, this.last); + if (this.first + 1 === this.last) { + return `${rangeBefore}${first}${last}${rangeAfter}`; + } return `${rangeBefore}${first}${rangeSep}${last}${rangeAfter}`; } } diff --git a/package.json b/package.json index 4d30a31..30499bb 100644 --- a/package.json +++ b/package.json @@ -50,26 +50,20 @@ "ci": "npm run coverage && npm run lint" }, "dependencies": { - "commander": "^12.1.0", + "commander": "^13.0.0", "peggy": "^4.2.0" }, "devDependencies": { "@peggyjs/coverage": "1.3.2", - "@peggyjs/eslint-config": "^5.0.1", - "@typescript-eslint/eslint-plugin": "^8.18.0", - "@typescript-eslint/parser": "^8.18.0", + "@peggyjs/eslint-config": "^5.0.3", + "@typescript-eslint/eslint-plugin": "^8.19.0", + "@typescript-eslint/parser": "^8.19.0", "ava": "6.2.0", "c8": "10.1.3", - "eslint": "^9.16.0", + "eslint": "^9.17.0", "typescript": "^5.7.2" }, - "packageManager": "pnpm@9.15.0", - "pnpm": { - "overrides": { - "cross-spawn": "^7.0.6", - "@eslint/plugin-kit": "^0.2.4" - } - }, + "packageManager": "pnpm@9.15.2", "engines": { "node": ">=18" } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 99d9146..bc2edfd 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -4,17 +4,13 @@ settings: autoInstallPeers: true excludeLinksFromLockfile: false -overrides: - cross-spawn: ^7.0.6 - '@eslint/plugin-kit': ^0.2.4 - importers: .: dependencies: commander: - specifier: ^12.1.0 - version: 12.1.0 + specifier: ^13.0.0 + version: 13.0.0 peggy: specifier: ^4.2.0 version: 4.2.0 @@ -23,14 +19,14 @@ importers: specifier: 1.3.2 version: 1.3.2 '@peggyjs/eslint-config': - specifier: ^5.0.1 - version: 5.0.1(eslint@9.16.0)(typescript@5.7.2) + specifier: ^5.0.3 + version: 5.0.3(eslint@9.17.0)(typescript@5.7.2) '@typescript-eslint/eslint-plugin': - specifier: ^8.18.0 - version: 8.18.0(@typescript-eslint/parser@8.18.0(eslint@9.16.0)(typescript@5.7.2))(eslint@9.16.0)(typescript@5.7.2) + specifier: ^8.19.0 + version: 8.19.0(@typescript-eslint/parser@8.19.0(eslint@9.17.0)(typescript@5.7.2))(eslint@9.17.0)(typescript@5.7.2) '@typescript-eslint/parser': - specifier: ^8.18.0 - version: 8.18.0(eslint@9.16.0)(typescript@5.7.2) + specifier: ^8.19.0 + version: 8.19.0(eslint@9.17.0)(typescript@5.7.2) ava: specifier: 6.2.0 version: 6.2.0 @@ -38,8 +34,8 @@ importers: specifier: 10.1.3 version: 10.1.3 eslint: - specifier: ^9.16.0 - version: 9.16.0 + specifier: ^9.17.0 + version: 9.17.0 typescript: specifier: ^5.7.2 version: 5.7.2 @@ -76,12 +72,12 @@ packages: resolution: {integrity: sha512-grOjVNN8P3hjJn/eIETF1wwd12DdnwFDoyceUJLYYdkpbwq3nLi+4fqrTAONx7XDALqlL220wC/RHSC/QTI/0w==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} - '@eslint/js@9.16.0': - resolution: {integrity: sha512-tw2HxzQkrbeuvyj1tG2Yqq+0H9wGoI2IMk4EOsQeX+vmd75FtJAzf+gTA69WF+baUKRYQ3x2kbLE08js5OsTVg==} + '@eslint/js@9.17.0': + resolution: {integrity: sha512-Sxc4hqcs1kTu0iID3kcZDW3JHq2a77HO9P8CP6YEA/FpH3Ll8UXE2r/86Rz9YJLKme39S9vU5OWNjC6Xl0Cr3w==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} - '@eslint/json@0.6.0': - resolution: {integrity: sha512-xlYoULv2QIeJnjFP4RVbPMpaGplsYo0vSIBpXP/QRnoi7oDYhVZ4u3wE5UUwI8hnhTQUMozrDhyuVFXMQ1HkMQ==} + '@eslint/json@0.9.0': + resolution: {integrity: sha512-PTLD0Kp7+BKhTthodns+hFbuZZ+hjb3lc/iVAg7mtBAnW5hLJhkST9O4m21oMkxG94GR2+GGZQNNurG9KP8pNA==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} '@eslint/markdown@6.2.1': @@ -108,8 +104,8 @@ packages: resolution: {integrity: sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA==} engines: {node: '>=12.22'} - '@humanwhocodes/momoa@3.3.2': - resolution: {integrity: sha512-aGWs0rZtc0AFFfJIxWqgo2gtZmnJPF6wqCI0BTziVtx1kAnl3+/KRVxHoyFphQCYn4t+Dh/iECSSGfn5z0k+gQ==} + '@humanwhocodes/momoa@3.3.5': + resolution: {integrity: sha512-NI9codbQNjw9g4SS/cOizi8JDZ93B3oGVko8M3y0XF3gITaGDSQqea35V8fswWehnRQBLxPfZY5TJnuNhNCEzA==} engines: {node: '>=18'} '@humanwhocodes/retry@0.3.0': @@ -158,8 +154,8 @@ packages: resolution: {integrity: sha512-8yh9Bbq9EtDXo/jVBL7PGUzo6tC69cK+3pARDq7vwlksLMPOcUQkJfnHS58U2YeBqNvB4OgB1O4FWH8xPNvVTA==} engines: {node: '>=18'} - '@peggyjs/eslint-config@5.0.1': - resolution: {integrity: sha512-CpLB1zujhUKMXd13DlM8TXkj3B3CnJyC1fhfKKv4f0+2IGeQMSp0VraX2Wx9vccGagdY3+dJfNN8V1DWed5QAA==} + '@peggyjs/eslint-config@5.0.3': + resolution: {integrity: sha512-5gsUZ3SK26zTCHVBwPGtAEn7KTN8x93QhwiWzcXNda3o3vuWf5ckj0j76D1KjzHrLAS1Ipjo6+X7AI5wc86XZg==} engines: {node: '>=18'} '@peggyjs/from-mem@1.3.5': @@ -178,8 +174,8 @@ packages: resolution: {integrity: sha512-LtoMMhxAlorcGhmFYI+LhPgbPZCkgP6ra1YL604EeF6U98pLlQ3iWIGMdWSC+vWmPBWBNgmDBAhnAobLROJmwg==} engines: {node: '>=18'} - '@stylistic/eslint-plugin@2.10.1': - resolution: {integrity: sha512-U+4yzNXElTf9q0kEfnloI9XbOyD4cnEQCxjUI94q0+W++0GAEQvJ/slwEj9lwjDHfGADRSr+Tco/z0XJvmDfCQ==} + '@stylistic/eslint-plugin@2.12.1': + resolution: {integrity: sha512-fubZKIHSPuo07FgRTn6S4Nl0uXPRPYVNpyZzIDGfp7Fny6JjNus6kReLD7NI380JXi4HtUTSOZ34LBuNPO1XLQ==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} peerDependencies: eslint: '>=8.40.0' @@ -205,52 +201,43 @@ packages: '@types/unist@3.0.3': resolution: {integrity: sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==} - '@typescript-eslint/eslint-plugin@8.18.0': - resolution: {integrity: sha512-NR2yS7qUqCL7AIxdJUQf2MKKNDVNaig/dEB0GBLU7D+ZdHgK1NoH/3wsgO3OnPVipn51tG3MAwaODEGil70WEw==} + '@typescript-eslint/eslint-plugin@8.19.0': + resolution: {integrity: sha512-NggSaEZCdSrFddbctrVjkVZvFC6KGfKfNK0CU7mNK/iKHGKbzT4Wmgm08dKpcZECBu9f5FypndoMyRHkdqfT1Q==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} peerDependencies: '@typescript-eslint/parser': ^8.0.0 || ^8.0.0-alpha.0 eslint: ^8.57.0 || ^9.0.0 typescript: '>=4.8.4 <5.8.0' - '@typescript-eslint/parser@8.18.0': - resolution: {integrity: sha512-hgUZ3kTEpVzKaK3uNibExUYm6SKKOmTU2BOxBSvOYwtJEPdVQ70kZJpPjstlnhCHcuc2WGfSbpKlb/69ttyN5Q==} + '@typescript-eslint/parser@8.19.0': + resolution: {integrity: sha512-6M8taKyOETY1TKHp0x8ndycipTVgmp4xtg5QpEZzXxDhNvvHOJi5rLRkLr8SK3jTgD5l4fTlvBiRdfsuWydxBw==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} peerDependencies: eslint: ^8.57.0 || ^9.0.0 typescript: '>=4.8.4 <5.8.0' - '@typescript-eslint/scope-manager@8.13.0': - resolution: {integrity: sha512-XsGWww0odcUT0gJoBZ1DeulY1+jkaHUciUq4jKNv4cpInbvvrtDoyBH9rE/n2V29wQJPk8iCH1wipra9BhmiMA==} - engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} - '@typescript-eslint/scope-manager@8.18.0': resolution: {integrity: sha512-PNGcHop0jkK2WVYGotk/hxj+UFLhXtGPiGtiaWgVBVP1jhMoMCHlTyJA+hEj4rszoSdLTK3fN4oOatrL0Cp+Xw==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} - '@typescript-eslint/type-utils@8.18.0': - resolution: {integrity: sha512-er224jRepVAVLnMF2Q7MZJCq5CsdH2oqjP4dT7K6ij09Kyd+R21r7UVJrF0buMVdZS5QRhDzpvzAxHxabQadow==} + '@typescript-eslint/scope-manager@8.19.0': + resolution: {integrity: sha512-hkoJiKQS3GQ13TSMEiuNmSCvhz7ujyqD1x3ShbaETATHrck+9RaDdUbt+osXaUuns9OFwrDTTrjtwsU8gJyyRA==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + + '@typescript-eslint/type-utils@8.19.0': + resolution: {integrity: sha512-TZs0I0OSbd5Aza4qAMpp1cdCYVnER94IziudE3JU328YUHgWu9gwiwhag+fuLeJ2LkWLXI+F/182TbG+JaBdTg==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} peerDependencies: eslint: ^8.57.0 || ^9.0.0 typescript: '>=4.8.4 <5.8.0' - '@typescript-eslint/types@8.13.0': - resolution: {integrity: sha512-4cyFErJetFLckcThRUFdReWJjVsPCqyBlJTi6IDEpc1GWCIIZRFxVppjWLIMcQhNGhdWJJRYFHpHoDWvMlDzng==} - engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} - '@typescript-eslint/types@8.18.0': resolution: {integrity: sha512-FNYxgyTCAnFwTrzpBGq+zrnoTO4x0c1CKYY5MuUTzpScqmY5fmsh2o3+57lqdI3NZucBDCzDgdEbIaNfAjAHQA==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} - '@typescript-eslint/typescript-estree@8.13.0': - resolution: {integrity: sha512-v7SCIGmVsRK2Cy/LTLGN22uea6SaUIlpBcO/gnMGT/7zPtxp90bphcGf4fyrCQl3ZtiBKqVTG32hb668oIYy1g==} + '@typescript-eslint/types@8.19.0': + resolution: {integrity: sha512-8XQ4Ss7G9WX8oaYvD4OOLCjIQYgRQxO+qCiR2V2s2GxI9AUpo7riNwo6jDhKtTcaJjT8PY54j2Yb33kWtSJsmA==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} - peerDependencies: - typescript: '*' - peerDependenciesMeta: - typescript: - optional: true '@typescript-eslint/typescript-estree@8.18.0': resolution: {integrity: sha512-rqQgFRu6yPkauz+ms3nQpohwejS8bvgbPyIDq13cgEDbkXt4LH4OkDMT0/fN1RUtzG8e8AKJyDBoocuQh8qNeg==} @@ -258,11 +245,11 @@ packages: peerDependencies: typescript: '>=4.8.4 <5.8.0' - '@typescript-eslint/utils@8.13.0': - resolution: {integrity: sha512-A1EeYOND6Uv250nybnLZapeXpYMl8tkzYUxqmoKAWnI4sei3ihf2XdZVd+vVOmHGcp3t+P7yRrNsyyiXTvShFQ==} + '@typescript-eslint/typescript-estree@8.19.0': + resolution: {integrity: sha512-WW9PpDaLIFW9LCbucMSdYUuGeFUz1OkWYS/5fwZwTA+l2RwlWFdJvReQqMUMBw4yJWJOfqd7An9uwut2Oj8sLw==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} peerDependencies: - eslint: ^8.57.0 || ^9.0.0 + typescript: '>=4.8.4 <5.8.0' '@typescript-eslint/utils@8.18.0': resolution: {integrity: sha512-p6GLdY383i7h5b0Qrfbix3Vc3+J2k6QWw6UMUeY5JGfm3C5LbZ4QIZzJNoNOfgyRe0uuYKjvVOsO/jD4SJO+xg==} @@ -271,14 +258,21 @@ packages: eslint: ^8.57.0 || ^9.0.0 typescript: '>=4.8.4 <5.8.0' - '@typescript-eslint/visitor-keys@8.13.0': - resolution: {integrity: sha512-7N/+lztJqH4Mrf0lb10R/CbI1EaAMMGyF5y0oJvFoAhafwgiRA7TXyd8TFn8FC8k5y2dTsYogg238qavRGNnlw==} + '@typescript-eslint/utils@8.19.0': + resolution: {integrity: sha512-PTBG+0oEMPH9jCZlfg07LCB2nYI0I317yyvXGfxnvGvw4SHIOuRnQ3kadyyXY6tGdChusIHIbM5zfIbp4M6tCg==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + peerDependencies: + eslint: ^8.57.0 || ^9.0.0 + typescript: '>=4.8.4 <5.8.0' '@typescript-eslint/visitor-keys@8.18.0': resolution: {integrity: sha512-pCh/qEA8Lb1wVIqNvBke8UaRjJ6wrAWkJO5yyIbs8Yx6TNGYyfNjOo61tLv+WwLvoLPp4BQ8B7AHKijl8NGUfw==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + '@typescript-eslint/visitor-keys@8.19.0': + resolution: {integrity: sha512-mCFtBbFBJDCNCWUl5y6sZSCHXw1DEFEk3c/M3nRK2a4XUB8StGFtmcEMizdjKuBzB6e/smJAAWYug3VrdLMr1w==} + engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + '@vercel/nft@0.27.6': resolution: {integrity: sha512-mwuyUxskdcV8dd7N7JnxBgvFEz1D9UOePI/WyLLzktv6HSCwgPNQGit/UJ2IykAWGlypKw4pBQjOKWvIbXITSg==} engines: {node: '>=16'} @@ -464,6 +458,10 @@ packages: resolution: {integrity: sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA==} engines: {node: '>=18'} + commander@13.0.0: + resolution: {integrity: sha512-oPYleIY8wmTVzkvQq10AEok6YcTC4sRUBl8F9gVuwchGVUCTbl/vhLTaQqutuuySYOsu8YTgV+OxKc/8Yvx+mQ==} + engines: {node: '>=18'} + common-path-prefix@3.0.0: resolution: {integrity: sha512-QE33hToZseCH3jS0qN96O/bSh3kaw/h+Tq7ngyY9eWDUnTlTNUyqfqvCXioLe5Na5jFsL78ra/wuBU4iuEgd4w==} @@ -569,8 +567,8 @@ packages: resolution: {integrity: sha512-UyLnSehNt62FFhSwjZlHmeokpRK59rcz29j+F1/aDgbkbRTk7wIc9XzdoasMUbRNKDM0qQt/+BJ4BrpFeABemw==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} - eslint@9.16.0: - resolution: {integrity: sha512-whp8mSQI4C8VXd+fLgSM0lh3UlmcFtVwUQjyKCFfsp+2ItAIYhlq/hqGahGqHE6cv9unM41VlqKk2VtKYR2TaA==} + eslint@9.17.0: + resolution: {integrity: sha512-evtlNcpJg+cZLcnVKwsai8fExnqjGPicK7gnUtlNuzu+Fv9bI0aLpND5T44VLQtoMEnI57LoXO9XAkIXwohKrA==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} hasBin: true peerDependencies: @@ -701,8 +699,8 @@ packages: resolution: {integrity: sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ==} engines: {node: '>=18'} - globals@15.12.0: - resolution: {integrity: sha512-1+gLErljJFhbOVyaetcwJiJ4+eLe45S2E7P5UiZ9xGfeq3ATQf5DOv9G7MH3gGbKQLkzmNh2DxfZwLdw+j6oTQ==} + globals@15.14.0: + resolution: {integrity: sha512-OkToC372DtlQeje9/zHIo5CT8lRP/FUgEOKBEhU4e0abL7J7CD24fD9ohiLN5hagG/kWCYj4K5oaxxtj2Z0Dig==} engines: {node: '>=18'} globby@14.0.2: @@ -1416,9 +1414,9 @@ snapshots: '@bcoe/v8-coverage@1.0.1': {} - '@eslint-community/eslint-utils@4.4.0(eslint@9.16.0)': + '@eslint-community/eslint-utils@4.4.0(eslint@9.17.0)': dependencies: - eslint: 9.16.0 + eslint: 9.17.0 eslint-visitor-keys: 3.4.3 '@eslint-community/regexpp@4.12.1': {} @@ -1449,12 +1447,12 @@ snapshots: transitivePeerDependencies: - supports-color - '@eslint/js@9.16.0': {} + '@eslint/js@9.17.0': {} - '@eslint/json@0.6.0': + '@eslint/json@0.9.0': dependencies: '@eslint/plugin-kit': 0.2.4 - '@humanwhocodes/momoa': 3.3.2 + '@humanwhocodes/momoa': 3.3.5 '@eslint/markdown@6.2.1': dependencies: @@ -1480,7 +1478,7 @@ snapshots: '@humanwhocodes/module-importer@1.0.1': {} - '@humanwhocodes/momoa@3.3.2': {} + '@humanwhocodes/momoa@3.3.5': {} '@humanwhocodes/retry@0.3.0': {} @@ -1538,12 +1536,12 @@ snapshots: peggy: 4.2.0 source-map-generator: 0.8.0 - '@peggyjs/eslint-config@5.0.1(eslint@9.16.0)(typescript@5.7.2)': + '@peggyjs/eslint-config@5.0.3(eslint@9.17.0)(typescript@5.7.2)': dependencies: - '@eslint/json': 0.6.0 + '@eslint/json': 0.9.0 '@eslint/markdown': 6.2.1 - '@stylistic/eslint-plugin': 2.10.1(eslint@9.16.0)(typescript@5.7.2) - globals: 15.12.0 + '@stylistic/eslint-plugin': 2.12.1(eslint@9.17.0)(typescript@5.7.2) + globals: 15.14.0 transitivePeerDependencies: - eslint - supports-color @@ -1563,10 +1561,10 @@ snapshots: '@sindresorhus/merge-streams@2.3.0': {} - '@stylistic/eslint-plugin@2.10.1(eslint@9.16.0)(typescript@5.7.2)': + '@stylistic/eslint-plugin@2.12.1(eslint@9.17.0)(typescript@5.7.2)': dependencies: - '@typescript-eslint/utils': 8.13.0(eslint@9.16.0)(typescript@5.7.2) - eslint: 9.16.0 + '@typescript-eslint/utils': 8.18.0(eslint@9.17.0)(typescript@5.7.2) + eslint: 9.17.0 eslint-visitor-keys: 4.2.0 espree: 10.3.0 estraverse: 5.3.0 @@ -1593,15 +1591,15 @@ snapshots: '@types/unist@3.0.3': {} - '@typescript-eslint/eslint-plugin@8.18.0(@typescript-eslint/parser@8.18.0(eslint@9.16.0)(typescript@5.7.2))(eslint@9.16.0)(typescript@5.7.2)': + '@typescript-eslint/eslint-plugin@8.19.0(@typescript-eslint/parser@8.19.0(eslint@9.17.0)(typescript@5.7.2))(eslint@9.17.0)(typescript@5.7.2)': dependencies: '@eslint-community/regexpp': 4.12.1 - '@typescript-eslint/parser': 8.18.0(eslint@9.16.0)(typescript@5.7.2) - '@typescript-eslint/scope-manager': 8.18.0 - '@typescript-eslint/type-utils': 8.18.0(eslint@9.16.0)(typescript@5.7.2) - '@typescript-eslint/utils': 8.18.0(eslint@9.16.0)(typescript@5.7.2) - '@typescript-eslint/visitor-keys': 8.18.0 - eslint: 9.16.0 + '@typescript-eslint/parser': 8.19.0(eslint@9.17.0)(typescript@5.7.2) + '@typescript-eslint/scope-manager': 8.19.0 + '@typescript-eslint/type-utils': 8.19.0(eslint@9.17.0)(typescript@5.7.2) + '@typescript-eslint/utils': 8.19.0(eslint@9.17.0)(typescript@5.7.2) + '@typescript-eslint/visitor-keys': 8.19.0 + eslint: 9.17.0 graphemer: 1.4.0 ignore: 5.3.1 natural-compare: 1.4.0 @@ -1610,62 +1608,61 @@ snapshots: transitivePeerDependencies: - supports-color - '@typescript-eslint/parser@8.18.0(eslint@9.16.0)(typescript@5.7.2)': + '@typescript-eslint/parser@8.19.0(eslint@9.17.0)(typescript@5.7.2)': dependencies: - '@typescript-eslint/scope-manager': 8.18.0 - '@typescript-eslint/types': 8.18.0 - '@typescript-eslint/typescript-estree': 8.18.0(typescript@5.7.2) - '@typescript-eslint/visitor-keys': 8.18.0 + '@typescript-eslint/scope-manager': 8.19.0 + '@typescript-eslint/types': 8.19.0 + '@typescript-eslint/typescript-estree': 8.19.0(typescript@5.7.2) + '@typescript-eslint/visitor-keys': 8.19.0 debug: 4.3.7 - eslint: 9.16.0 + eslint: 9.17.0 typescript: 5.7.2 transitivePeerDependencies: - supports-color - '@typescript-eslint/scope-manager@8.13.0': - dependencies: - '@typescript-eslint/types': 8.13.0 - '@typescript-eslint/visitor-keys': 8.13.0 - '@typescript-eslint/scope-manager@8.18.0': dependencies: '@typescript-eslint/types': 8.18.0 '@typescript-eslint/visitor-keys': 8.18.0 - '@typescript-eslint/type-utils@8.18.0(eslint@9.16.0)(typescript@5.7.2)': + '@typescript-eslint/scope-manager@8.19.0': dependencies: - '@typescript-eslint/typescript-estree': 8.18.0(typescript@5.7.2) - '@typescript-eslint/utils': 8.18.0(eslint@9.16.0)(typescript@5.7.2) + '@typescript-eslint/types': 8.19.0 + '@typescript-eslint/visitor-keys': 8.19.0 + + '@typescript-eslint/type-utils@8.19.0(eslint@9.17.0)(typescript@5.7.2)': + dependencies: + '@typescript-eslint/typescript-estree': 8.19.0(typescript@5.7.2) + '@typescript-eslint/utils': 8.19.0(eslint@9.17.0)(typescript@5.7.2) debug: 4.3.7 - eslint: 9.16.0 + eslint: 9.17.0 ts-api-utils: 1.3.0(typescript@5.7.2) typescript: 5.7.2 transitivePeerDependencies: - supports-color - '@typescript-eslint/types@8.13.0': {} - '@typescript-eslint/types@8.18.0': {} - '@typescript-eslint/typescript-estree@8.13.0(typescript@5.7.2)': + '@typescript-eslint/types@8.19.0': {} + + '@typescript-eslint/typescript-estree@8.18.0(typescript@5.7.2)': dependencies: - '@typescript-eslint/types': 8.13.0 - '@typescript-eslint/visitor-keys': 8.13.0 + '@typescript-eslint/types': 8.18.0 + '@typescript-eslint/visitor-keys': 8.18.0 debug: 4.3.7 fast-glob: 3.3.2 is-glob: 4.0.3 minimatch: 9.0.4 semver: 7.6.3 ts-api-utils: 1.3.0(typescript@5.7.2) - optionalDependencies: typescript: 5.7.2 transitivePeerDependencies: - supports-color - '@typescript-eslint/typescript-estree@8.18.0(typescript@5.7.2)': + '@typescript-eslint/typescript-estree@8.19.0(typescript@5.7.2)': dependencies: - '@typescript-eslint/types': 8.18.0 - '@typescript-eslint/visitor-keys': 8.18.0 + '@typescript-eslint/types': 8.19.0 + '@typescript-eslint/visitor-keys': 8.19.0 debug: 4.3.7 fast-glob: 3.3.2 is-glob: 4.0.3 @@ -1676,38 +1673,38 @@ snapshots: transitivePeerDependencies: - supports-color - '@typescript-eslint/utils@8.13.0(eslint@9.16.0)(typescript@5.7.2)': + '@typescript-eslint/utils@8.18.0(eslint@9.17.0)(typescript@5.7.2)': dependencies: - '@eslint-community/eslint-utils': 4.4.0(eslint@9.16.0) - '@typescript-eslint/scope-manager': 8.13.0 - '@typescript-eslint/types': 8.13.0 - '@typescript-eslint/typescript-estree': 8.13.0(typescript@5.7.2) - eslint: 9.16.0 - transitivePeerDependencies: - - supports-color - - typescript - - '@typescript-eslint/utils@8.18.0(eslint@9.16.0)(typescript@5.7.2)': - dependencies: - '@eslint-community/eslint-utils': 4.4.0(eslint@9.16.0) + '@eslint-community/eslint-utils': 4.4.0(eslint@9.17.0) '@typescript-eslint/scope-manager': 8.18.0 '@typescript-eslint/types': 8.18.0 '@typescript-eslint/typescript-estree': 8.18.0(typescript@5.7.2) - eslint: 9.16.0 + eslint: 9.17.0 typescript: 5.7.2 transitivePeerDependencies: - supports-color - '@typescript-eslint/visitor-keys@8.13.0': + '@typescript-eslint/utils@8.19.0(eslint@9.17.0)(typescript@5.7.2)': dependencies: - '@typescript-eslint/types': 8.13.0 - eslint-visitor-keys: 3.4.3 + '@eslint-community/eslint-utils': 4.4.0(eslint@9.17.0) + '@typescript-eslint/scope-manager': 8.19.0 + '@typescript-eslint/types': 8.19.0 + '@typescript-eslint/typescript-estree': 8.19.0(typescript@5.7.2) + eslint: 9.17.0 + typescript: 5.7.2 + transitivePeerDependencies: + - supports-color '@typescript-eslint/visitor-keys@8.18.0': dependencies: '@typescript-eslint/types': 8.18.0 eslint-visitor-keys: 4.2.0 + '@typescript-eslint/visitor-keys@8.19.0': + dependencies: + '@typescript-eslint/types': 8.19.0 + eslint-visitor-keys: 4.2.0 + '@vercel/nft@0.27.6': dependencies: '@mapbox/node-pre-gyp': 1.0.11 @@ -1919,6 +1916,8 @@ snapshots: commander@12.1.0: {} + commander@13.0.0: {} + common-path-prefix@3.0.0: {} concat-map@0.0.1: {} @@ -2001,14 +2000,14 @@ snapshots: eslint-visitor-keys@4.2.0: {} - eslint@9.16.0: + eslint@9.17.0: dependencies: - '@eslint-community/eslint-utils': 4.4.0(eslint@9.16.0) + '@eslint-community/eslint-utils': 4.4.0(eslint@9.17.0) '@eslint-community/regexpp': 4.12.1 '@eslint/config-array': 0.19.1 '@eslint/core': 0.9.1 '@eslint/eslintrc': 3.2.0 - '@eslint/js': 9.16.0 + '@eslint/js': 9.17.0 '@eslint/plugin-kit': 0.2.4 '@humanfs/node': 0.16.6 '@humanwhocodes/module-importer': 1.0.1 @@ -2164,7 +2163,7 @@ snapshots: globals@14.0.0: {} - globals@15.12.0: {} + globals@15.14.0: {} globby@14.0.2: dependencies: diff --git a/test/ast.ava.js b/test/ast.ava.js index c2f7ed9..0e09577 100644 --- a/test/ast.ava.js +++ b/test/ast.ava.js @@ -15,6 +15,32 @@ test("range escape", t => { t.is(ast.Range.escape(opts, 0x100), "'\\u{100}'"); }); +test("range utf16", t => { + t.throws(() => ast.Range.create(16, 5, 4, {})); + let r = ast.Range.create(16, 20, 20, {}); + t.is(r.type, "caseSensitveString"); + t.throws(() => ast.Range.create(16, 0xd801, 0xd805, {})); + r = ast.Range.create(16, 0xe001, 0xffff, {}); + t.is(r.type, "range"); + t.is(r.first, 0xe001); + t.is(r.last, 0xffff); + r = ast.Range.create(16, 0x10004, 0x10401, {}); + t.is( + r.toFormat({ format: "peggy" }), + '"\\ud800" [\\udc04-\\udfff] / "\\ud801" [\\udc00\\udc01]' + ); + r = ast.Range.create(16, 0x10004, 0x10c01, {}); + t.is( + r.toFormat({ format: "peggy" }), + '"\\ud800" [\\udc04-\\udfff] / [\\ud801\\ud802] [\\udc00-\\udfff] / "\\ud803" [\\udc00\\udc01]' + ); + r = ast.Range.create(16, 0x10004, 0x10bff, {}); + t.is( + r.toFormat({ format: "peggy" }), + '"\\ud800" [\\udc04-\\udfff] / [\\ud801-\\ud803] [\\udc00-\\udfff]' + ); +}); + test("bad base class types", t => { t.throws(() => new ast.Base()); t.throws(() => new ast.Range("", 0, 1)); From bab38bbedc3b2c4e9e48ff357df20a6afcb14690 Mon Sep 17 00:00:00 2001 From: Joe Hildebrand Date: Fri, 3 Jan 2025 12:04:02 -0700 Subject: [PATCH 2/5] Only coalesce small ranges in peggy format --- lib/ast.js | 2 +- test/ast.ava.js | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/ast.js b/lib/ast.js index b9efbf3..1897209 100644 --- a/lib/ast.js +++ b/lib/ast.js @@ -423,7 +423,7 @@ export class Range extends Base { const { rangeBefore, rangeAfter, rangeSep } = delims(opts); const first = Range.escape(opts, this.first); const last = Range.escape(opts, this.last); - if (this.first + 1 === this.last) { + if ((opts.format === "peggy") && (this.first + 1 === this.last)) { return `${rangeBefore}${first}${last}${rangeAfter}`; } return `${rangeBefore}${first}${rangeSep}${last}${rangeAfter}`; diff --git a/test/ast.ava.js b/test/ast.ava.js index 0e09577..e834197 100644 --- a/test/ast.ava.js +++ b/test/ast.ava.js @@ -39,6 +39,16 @@ test("range utf16", t => { r.toFormat({ format: "peggy" }), '"\\ud800" [\\udc04-\\udfff] / [\\ud801-\\ud803] [\\udc00-\\udfff]' ); + r = ast.Range.create(16, 0, 0x10ffff, {}); + t.is( + r.toFormat({ format: "peggy" }), + "[\\x00-\\ud7ff] / [\\ue000-\\uffff] / [\\ud800-\\udbff] [\\udc00-\\udfff]" + ); + r = ast.Range.create(16, 0xee, 0xef, {}); + t.is( + r.toFormat({ format: "pest" }), + "'\\u{ee}'..'\\u{ef}'" + ); }); test("bad base class types", t => { From 051a33f858faf5f96819b7347e0997b388a47ac0 Mon Sep 17 00:00:00 2001 From: Joe Hildebrand Date: Fri, 3 Jan 2025 12:29:26 -0700 Subject: [PATCH 3/5] Also escape high characters in pest --- lib/ast.js | 5 +++++ test/ast.ava.js | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/lib/ast.js b/lib/ast.js index 1897209..3cdf4b1 100644 --- a/lib/ast.js +++ b/lib/ast.js @@ -61,6 +61,11 @@ function str(s, opts) { /[\u{10000}-\u{10ffff}]/u, c => `\\u{${c.codePointAt(0).toString(16)}}` ); + } else if (opts?.format === "pest") { + s = s.replace( + /[\u{ff}-\u{10ffff}]/u, + c => `\\u{${c.codePointAt(0).toString(16)}}` + ); } return s; diff --git a/test/ast.ava.js b/test/ast.ava.js index e834197..88a54f3 100644 --- a/test/ast.ava.js +++ b/test/ast.ava.js @@ -39,6 +39,11 @@ test("range utf16", t => { r.toFormat({ format: "peggy" }), '"\\ud800" [\\udc04-\\udfff] / [\\ud801-\\ud803] [\\udc00-\\udfff]' ); + r = ast.Range.create(16, 0x10004, 0x10bff, {}, false); + t.is( + r.toFormat({ format: "pest" }), + "'\\u{10004}'..'\\u{10bff}'" + ); r = ast.Range.create(16, 0, 0x10ffff, {}); t.is( r.toFormat({ format: "peggy" }), From a621fceb8f7f66a407e4903b3fad367dccb26e4e Mon Sep 17 00:00:00 2001 From: Joe Hildebrand Date: Fri, 3 Jan 2025 17:31:26 -0700 Subject: [PATCH 4/5] Clean ups --- lib/ast.js | 79 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/lib/ast.js b/lib/ast.js index 3cdf4b1..4080b0d 100644 --- a/lib/ast.js +++ b/lib/ast.js @@ -25,6 +25,15 @@ const PEST_DELIMS = { rangeSep: "..", }; +const BEFORE_SURROGATES = 0xd7ff; +const FIRST_HIGH = 0xd800; +const FIRST_LOW = 0xdc00; +const LAST_LOW = 0xdfff; +const AFTER_SURROGATES = 0xe000; +const LAST_BMP = 0xffff; +const FIRST_ASTRAL = 0x10000; +const SURROGATE_PAGE_SIZE = 0x400; + /** * @typedef {object} FormatOptions * @prop {string} [format='peggy'] @@ -99,6 +108,13 @@ function delims(opts) { return badFormat(opts); } +function surrogates(codePoint) { + return [ + Math.floor((codePoint - FIRST_ASTRAL) / SURROGATE_PAGE_SIZE) + FIRST_HIGH, + ((codePoint - FIRST_ASTRAL) % SURROGATE_PAGE_SIZE) + FIRST_LOW, + ]; +} + // Only exported for testing export class Base { constructor(type, loc, simple = true) { @@ -323,11 +339,12 @@ export class Range extends Base { if (first === last) { return new CaseSensitiveString(String.fromCodePoint(first), base, loc); } - if (utf16) { - if (last < 0xd800) { - return new Range(base, first, last, loc); - } - if ((first > 0xd7ff) && (first < 0xe000) && (last < 0xe000)) { + if (utf16 && (last > BEFORE_SURROGATES)) { + if ( + (first > BEFORE_SURROGATES) + && (first < AFTER_SURROGATES) + && (last < AFTER_SURROGATES) + ) { throw new Error(`Range consists of all surrogates ${first.toString(16)}-${last.toString(16)}`); } @@ -337,19 +354,19 @@ export class Range extends Base { // - 0x10000-last const alts = []; - if (first < 0xd800) { - alts.push(Range.create(base, first, 0xd7ff, loc, false)); + if (first < FIRST_HIGH) { + alts.push(Range.create(base, first, BEFORE_SURROGATES, loc, false)); } - if (last < 0x10000) { - alts.push( - Range.create(base, Math.max(0xe000, first), last, loc, false) - ); + if (last < FIRST_ASTRAL) { + alts.push(Range.create( + base, Math.max(AFTER_SURROGATES, first), last, loc, false + )); } else { - if (first < 0x10000) { - alts.push( - Range.create(base, Math.max(0xe000, first), 0xffff, loc, false) - ); - first = 0x10000; + if (first < FIRST_ASTRAL) { + alts.push(Range.create( + base, Math.max(AFTER_SURROGATES, first), LAST_BMP, loc, false + )); + first = FIRST_ASTRAL; } // This code follows the logic in regenerate: @@ -359,14 +376,12 @@ export class Range extends Base { // b) Regenerate will only generate a string, which I then would have // to parse to get the info I needed out. // I believe this use is within the spirit of the MIT license. - const firstH = Math.floor((first - 0x10000) / 0x400) + 0xd800; - const firstL = ((first - 0x10000) % 0x400) + 0xdc00; - const lastH = Math.floor((last - 0x10000) / 0x400) + 0xd800; - const lastL = ((last - 0x10000) % 0x400) + 0xdc00; + const [firstH, firstL] = surrogates(first); + const [lastH, lastL] = surrogates(last); let complete = false; if ( (firstH === lastH) - || ((firstL === 0xdc00) && (lastL === 0xdfff)) + || ((firstL === FIRST_LOW) && (lastL === LAST_LOW)) ) { alts.push(new Concatenation([ Range.create(base, firstH, lastH, loc, false), @@ -375,35 +390,33 @@ export class Range extends Base { complete = true; } else { alts.push(new Concatenation([ - Range.create(base, firstH, firstH, loc, false), - Range.create(base, firstL, 0xdfff, loc, false), + new CaseSensitiveString(String.fromCodePoint(firstH), base, loc), + Range.create(base, firstL, LAST_LOW, loc, false), ], loc)); } if (!complete && (firstH + 1 < lastH)) { - if (lastL === 0xdfff) { + if (lastL === LAST_LOW) { alts.push(new Concatenation([ Range.create(base, firstH + 1, lastH + 1, loc, false), - Range.create(base, 0xdc00, lastL, loc, false), + Range.create(base, FIRST_LOW, lastL, loc, false), ], loc)); complete = true; } else { alts.push(new Concatenation([ Range.create(base, firstH + 1, lastH - 1, loc, false), - Range.create(base, 0xdc00, 0xdfff, loc, false), + Range.create(base, FIRST_LOW, LAST_LOW, loc, false), ], loc)); } } if (!complete) { alts.push(new Concatenation([ - Range.create(base, lastH, lastH, loc, false), - Range.create(base, 0xdc00, lastL, loc, false), + new CaseSensitiveString(String.fromCodePoint(lastH), base, loc), + Range.create(base, FIRST_LOW, lastL, loc, false), ], loc)); } } - if (alts.length === 1) { - return alts[0]; - } - return new Alternation(alts, loc); + assert(alts.length > 0); + return (alts.length === 1) ? alts[0] : new Alternation(alts, loc); } return new Range(base, first, last, loc); } @@ -412,7 +425,7 @@ export class Range extends Base { if (opts.format === "peggy") { if (num <= 0xff) { return "\\x" + num.toString(16).padStart(2, 0); - } else if (num <= 0xffff) { + } else if (num <= LAST_BMP) { return "\\u" + num.toString(16).padStart(4, 0); } else { throw new Error(`0x${num.toString(16)} does not fit in UTF-16`); From 15f14025715a044ad1d664fd00c79efd16d19bc8 Mon Sep 17 00:00:00 2001 From: Joe Hildebrand Date: Sat, 4 Jan 2025 13:55:41 -0700 Subject: [PATCH 5/5] One more refactor to make range splitting more clear and concise --- lib/ast.js | 148 +++++++++++++++++++++++++++++------------------------ 1 file changed, 82 insertions(+), 66 deletions(-) diff --git a/lib/ast.js b/lib/ast.js index 4080b0d..84ec2fa 100644 --- a/lib/ast.js +++ b/lib/ast.js @@ -32,7 +32,14 @@ const LAST_LOW = 0xdfff; const AFTER_SURROGATES = 0xe000; const LAST_BMP = 0xffff; const FIRST_ASTRAL = 0x10000; +const LAST_ASTRAL = 0x10ffff; const SURROGATE_PAGE_SIZE = 0x400; +const UTF16_RANGES = [ + [0, BEFORE_SURROGATES], + // [FIRST_HIGH, LAST_LOW], // Ignore this range + [AFTER_SURROGATES, LAST_BMP], + [FIRST_ASTRAL, LAST_ASTRAL], +]; /** * @typedef {object} FormatOptions @@ -115,6 +122,27 @@ function surrogates(codePoint) { ]; } +/** + * Partition the inclusive range [first, last] into chunks that are in one of + * the ranges specified in parts. If you want to ignore a range, do not + * include it in parts. + * + * @param {number} first + * @param {number} last + * @param {[start: number, end: number][]} parts Ranges to check. The start + * and end points are both inclusive. + * @returns {[start: number, end: number][]} + */ +function partitionRange(first, last, parts) { + const res = []; + for (const [start, end] of parts) { + if ((first <= end) && (last >= start)) { // Overlap with range? + res.push([Math.max(first, start), Math.min(last, end)]); + } + } + return res; +} + // Only exported for testing export class Base { constructor(type, loc, simple = true) { @@ -340,83 +368,71 @@ export class Range extends Base { return new CaseSensitiveString(String.fromCodePoint(first), base, loc); } if (utf16 && (last > BEFORE_SURROGATES)) { - if ( - (first > BEFORE_SURROGATES) - && (first < AFTER_SURROGATES) - && (last < AFTER_SURROGATES) - ) { - throw new Error(`Range consists of all surrogates ${first.toString(16)}-${last.toString(16)}`); - } - - // Remove the range 0xd800-0xdfff. These possible ranges remain: - // - first-0xd7ff - // - 0xe000-0xffff - // - 0x10000-last const alts = []; - - if (first < FIRST_HIGH) { - alts.push(Range.create(base, first, BEFORE_SURROGATES, loc, false)); - } - if (last < FIRST_ASTRAL) { - alts.push(Range.create( - base, Math.max(AFTER_SURROGATES, first), last, loc, false - )); - } else { - if (first < FIRST_ASTRAL) { - alts.push(Range.create( - base, Math.max(AFTER_SURROGATES, first), LAST_BMP, loc, false - )); - first = FIRST_ASTRAL; - } - - // This code follows the logic in regenerate: - // https://github.com/mathiasbynens/regenerate/blob/11567339f40fd262435934d544885bc047cb4220/regenerate.js#L996 - // I didn't use regenerate directly because: - // a) I only needed a small part of it - // b) Regenerate will only generate a string, which I then would have - // to parse to get the info I needed out. - // I believe this use is within the spirit of the MIT license. - const [firstH, firstL] = surrogates(first); - const [lastH, lastL] = surrogates(last); - let complete = false; - if ( - (firstH === lastH) - || ((firstL === FIRST_LOW) && (lastL === LAST_LOW)) - ) { - alts.push(new Concatenation([ - Range.create(base, firstH, lastH, loc, false), - Range.create(base, firstL, lastL, loc, false), - ], loc)); - complete = true; + for (const [start, end] of partitionRange(first, last, UTF16_RANGES)) { + if (start < FIRST_ASTRAL) { + // No surrogates needed + alts.push(Range.create(base, start, end, loc, false)); } else { - alts.push(new Concatenation([ - new CaseSensitiveString(String.fromCodePoint(firstH), base, loc), - Range.create(base, firstL, LAST_LOW, loc, false), - ], loc)); - } - if (!complete && (firstH + 1 < lastH)) { - if (lastL === LAST_LOW) { + // Pure astral range. + + // This code follows the logic in regenerate: + // https://github.com/mathiasbynens/regenerate/blob/11567339f40fd262435934d544885bc047cb4220/regenerate.js#L996 + // I didn't use regenerate directly because: + // a) I only needed a small part of it + // b) Regenerate will only generate a string, which I then would have + // to parse to get the info I needed out. + // I believe this use is within the spirit of the MIT license. + const [startH, startL] = surrogates(start); + const [endH, endL] = surrogates(end); + let complete = false; + if ( + (startH === endH) || ((startL === FIRST_LOW) && (endL === LAST_LOW)) + ) { alts.push(new Concatenation([ - Range.create(base, firstH + 1, lastH + 1, loc, false), - Range.create(base, FIRST_LOW, lastL, loc, false), + Range.create(base, startH, endH, loc, false), + Range.create(base, startL, endL, loc, false), ], loc)); complete = true; } else { + // First part of range, where startL might be greater than FIRST_LOW + // May one day be combined with below if startL === FIRST_LOW alts.push(new Concatenation([ - Range.create(base, firstH + 1, lastH - 1, loc, false), - Range.create(base, FIRST_LOW, LAST_LOW, loc, false), + new CaseSensitiveString(String.fromCodePoint(startH), base, loc), + Range.create(base, startL, LAST_LOW, loc, false), + ], loc)); + } + if (!complete && (startH + 1 < endH)) { + if (endL === LAST_LOW) { + alts.push(new Concatenation([ + Range.create(base, startH + 1, endH + 1, loc, false), + Range.create(base, FIRST_LOW, endL, loc, false), + ], loc)); + complete = true; + } else { + alts.push(new Concatenation([ + Range.create(base, startH + 1, endH - 1, loc, false), + Range.create(base, FIRST_LOW, LAST_LOW, loc, false), + ], loc)); + } + } + if (!complete) { + alts.push(new Concatenation([ + new CaseSensitiveString(String.fromCodePoint(endH), base, loc), + Range.create(base, FIRST_LOW, endL, loc, false), ], loc)); } } - if (!complete) { - alts.push(new Concatenation([ - new CaseSensitiveString(String.fromCodePoint(lastH), base, loc), - Range.create(base, FIRST_LOW, lastL, loc, false), - ], loc)); - } } - assert(alts.length > 0); - return (alts.length === 1) ? alts[0] : new Alternation(alts, loc); + + switch (alts.length) { + case 0: + throw new Error(`Range consists of all surrogates ${first.toString(16)}-${last.toString(16)}`); + case 1: + return alts[0]; + default: + return new Alternation(alts, loc); + } } return new Range(base, first, last, loc); }