Skip to content

Commit

Permalink
Merge pull request #26 from hildjj/fix-25
Browse files Browse the repository at this point in the history
Correctly handle unicode ranges
  • Loading branch information
hildjj authored Jan 4, 2025
2 parents 32d9245 + 15f1402 commit 2b18852
Show file tree
Hide file tree
Showing 6 changed files with 307 additions and 151 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ examples/*.peggy
examples/*.pest
node_modules
t.abnf
.cert/
2 changes: 1 addition & 1 deletion bin/abnf_gen.js
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,6 @@ program
})
.parseAsync()
.catch(er => {
console.error(er.message);
console.error(er);
process.exit(1);
});
159 changes: 140 additions & 19 deletions lib/ast.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,22 @@ const PEST_DELIMS = {
rangeSep: "..",
};

// Landmarks in the Unicode code space used when splitting ranges for UTF-16.
// The surrogate block is anchored at FIRST_HIGH; the other surrogate
// boundaries are derived from it so they cannot drift apart.
const SURROGATE_PAGE_SIZE = 0x400;
const FIRST_HIGH = 0xd800;
const FIRST_LOW = FIRST_HIGH + SURROGATE_PAGE_SIZE; // 0xdc00
const LAST_LOW = FIRST_LOW + SURROGATE_PAGE_SIZE - 1; // 0xdfff
const BEFORE_SURROGATES = FIRST_HIGH - 1; // 0xd7ff
const AFTER_SURROGATES = LAST_LOW + 1; // 0xe000
const LAST_BMP = 0xffff;
const FIRST_ASTRAL = LAST_BMP + 1; // 0x10000
const LAST_ASTRAL = 0x10ffff;

// Inclusive [start, end] chunks that a code point range is partitioned into.
// The surrogate block [FIRST_HIGH, LAST_LOW] is deliberately left out, so
// anything falling inside it is ignored.
const UTF16_RANGES = [
  [0, BEFORE_SURROGATES],
  [AFTER_SURROGATES, LAST_BMP],
  [FIRST_ASTRAL, LAST_ASTRAL],
];

/**
* @typedef {object} FormatOptions
* @prop {string} [format='peggy']
Expand All @@ -42,15 +58,33 @@ function slug(s) {
return s.replace(/-/g, "_");
}

function str(s) {
return `"${s.replace(/[\\"\x00-\x19\x7f-\xff]/g, c => `\\${{
/**
 * Quote a string as a double-quoted grammar literal, escaping the characters
 * that should not appear verbatim.
 *
 * Backslash, double quote, code units 0x00-0x19 and 0x7f-0xff are always
 * escaped, either with a short name (`\n`, `\t`, ...) or as `\xNN`.
 * Everything above 0xff is escaped only when a known output format is
 * given:
 * - "peggy": `\uNNNN` for BMP code points, `\u{N...}` for astral ones.
 * - "pest": `\u{N...}` for all of them.
 *
 * @param {string} s Raw text to quote.
 * @param {FormatOptions} [opts] Only `opts.format` is consulted. When it is
 *   missing or unrecognized, characters above 0xff are passed through as-is.
 * @returns {string} The quoted, escaped literal.
 */
function str(s, opts) {
  const shortEscapes = {
    "\r": "r",
    "\n": "n",
    '"': '"',
    "\t": "t",
    "\v": "x0B",
    "\\": "\\",
  };

  // NOTE(review): the upper bound \x19 leaves 0x1a-0x1f unescaped; \x1f may
  // have been intended -- confirm before changing, since it alters output.
  let quoted = `"${s.replace(
    /[\\"\x00-\x19\x7f-\xff]/g,
    c => `\\${shortEscapes[c] || `x${c.charCodeAt(0).toString(16).padStart(2, "0")}`}`
  )}"`;

  // Both replacements below need the `g` flag so that every character is
  // escaped, not just the first one, and the `u` flag so that a surrogate
  // pair is seen as a single astral code point rather than two BMP units.
  if (opts?.format === "peggy") {
    quoted = quoted.replace(/[\u{100}-\u{10ffff}]/gu, c => {
      const codePoint = c.codePointAt(0);
      const hex = codePoint.toString(16);
      return (codePoint <= 0xffff) ?
        `\\u${hex.padStart(4, "0")}` :
        `\\u{${hex}}`;
    });
  } else if (opts?.format === "pest") {
    // 0xff itself was already turned into \xff by the first pass.
    quoted = quoted.replace(
      /[\u{100}-\u{10ffff}]/gu,
      c => `\\u{${c.codePointAt(0).toString(16)}}`
    );
  }

  return quoted;
}

function fromArray(opts, a, joiner, needed, parent) {
Expand Down Expand Up @@ -81,6 +115,34 @@ function delims(opts) {
return badFormat(opts);
}

/**
 * Split an astral code point into its UTF-16 surrogate pair.
 *
 * @param {number} codePoint Code point to split; the arithmetic is relative
 *   to FIRST_ASTRAL, so callers are expected to pass astral values.
 * @returns {[high: number, low: number]} High and low surrogate code units.
 */
function surrogates(codePoint) {
  const offset = codePoint - FIRST_ASTRAL;
  const high = Math.floor(offset / SURROGATE_PAGE_SIZE) + FIRST_HIGH;
  const low = (offset % SURROGATE_PAGE_SIZE) + FIRST_LOW;
  return [high, low];
}

/**
 * Clip the inclusive range [first, last] against each of the given ranges,
 * keeping only the non-empty intersections. Anything in [first, last] that
 * is not covered by an entry of parts is dropped, so leave a range out of
 * parts in order to ignore it.
 *
 * @param {number} first Start of the range, inclusive.
 * @param {number} last End of the range, inclusive.
 * @param {[start: number, end: number][]} parts Ranges to clip against;
 *   both end points of each are inclusive.
 * @returns {[start: number, end: number][]} The intersections, in the same
 *   order as the entries of parts that produced them.
 */
function partitionRange(first, last, parts) {
  return parts
    .filter(([lo, hi]) => (first <= hi) && (last >= lo))
    .map(([lo, hi]) => [Math.max(first, lo), Math.min(last, hi)]);
}

// Only exported for testing
export class Base {
constructor(type, loc, simple = true) {
Expand Down Expand Up @@ -132,7 +194,7 @@ export class CaseInsensitiveString extends Base {
}

toFormat(opts) {
const s = str(this.str);
const s = str(this.str, opts);
if (this.str.match(/[a-z]/i)) {
switch (opts.format) {
case "peggy":
Expand All @@ -155,8 +217,8 @@ export class CaseSensitiveString extends Base {
this.base = base;
}

toFormat(_opts) {
return str(this.str);
toFormat(opts) {
return str(this.str, opts);
}
}

Expand Down Expand Up @@ -299,22 +361,78 @@ export class Range extends Base {
}

static create(base, first, last, loc, utf16 = true) {
if (utf16 && (first <= 0xffff) && (last === 0x10ffff)) {
// Special case "all high Unicode" since it shows up a lot
// This should be generalized.
if (first > last) {
throw new Error(`Range out of order ${first.toString(16)}-${last.toString(16)}`);
}
if (first === last) {
return new CaseSensitiveString(String.fromCodePoint(first), base, loc);
}
if (utf16 && (last > BEFORE_SURROGATES)) {
const alts = [];
if (first < 0xd800) {
alts.push(new Range(base, first, 0xd7ff, loc));
alts.push(new Range(base, 0xe000, 0xffff, loc));
} else {
alts.push(new Range(base, first, 0xffff, loc));
for (const [start, end] of partitionRange(first, last, UTF16_RANGES)) {
if (start < FIRST_ASTRAL) {
// No surrogates needed
alts.push(Range.create(base, start, end, loc, false));
} else {
// Pure astral range.

// This code follows the logic in regenerate:
// https://github.com/mathiasbynens/regenerate/blob/11567339f40fd262435934d544885bc047cb4220/regenerate.js#L996
// I didn't use regenerate directly because:
// a) I only needed a small part of it
// b) Regenerate will only generate a string, which I then would have
// to parse to get the info I needed out.
// I believe this use is within the spirit of the MIT license.
const [startH, startL] = surrogates(start);
const [endH, endL] = surrogates(end);
let complete = false;
if (
(startH === endH) || ((startL === FIRST_LOW) && (endL === LAST_LOW))
) {
alts.push(new Concatenation([
Range.create(base, startH, endH, loc, false),
Range.create(base, startL, endL, loc, false),
], loc));
complete = true;
} else {
// First part of range, where startL might be greater than FIRST_LOW
// May one day be combined with below if startL === FIRST_LOW
alts.push(new Concatenation([
new CaseSensitiveString(String.fromCodePoint(startH), base, loc),
Range.create(base, startL, LAST_LOW, loc, false),
], loc));
}
if (!complete && (startH + 1 < endH)) {
if (endL === LAST_LOW) {
alts.push(new Concatenation([
Range.create(base, startH + 1, endH + 1, loc, false),
Range.create(base, FIRST_LOW, endL, loc, false),
], loc));
complete = true;
} else {
alts.push(new Concatenation([
Range.create(base, startH + 1, endH - 1, loc, false),
Range.create(base, FIRST_LOW, LAST_LOW, loc, false),
], loc));
}
}
if (!complete) {
alts.push(new Concatenation([
new CaseSensitiveString(String.fromCodePoint(endH), base, loc),
Range.create(base, FIRST_LOW, endL, loc, false),
], loc));
}
}
}
alts.push(new Concatenation([
new Range(base, 0xd800, 0xdbff, loc),
new Range(base, 0xdc00, 0xdfff, loc),
], loc));

return new Alternation(alts, loc);
switch (alts.length) {
case 0:
throw new Error(`Range consists of all surrogates ${first.toString(16)}-${last.toString(16)}`);
case 1:
return alts[0];
default:
return new Alternation(alts, loc);
}
}
return new Range(base, first, last, loc);
}
Expand All @@ -323,7 +441,7 @@ export class Range extends Base {
if (opts.format === "peggy") {
if (num <= 0xff) {
return "\\x" + num.toString(16).padStart(2, 0);
} else if (num <= 0xffff) {
} else if (num <= LAST_BMP) {
return "\\u" + num.toString(16).padStart(4, 0);
} else {
throw new Error(`0x${num.toString(16)} does not fit in UTF-16`);
Expand All @@ -339,6 +457,9 @@ export class Range extends Base {
const { rangeBefore, rangeAfter, rangeSep } = delims(opts);
const first = Range.escape(opts, this.first);
const last = Range.escape(opts, this.last);
if ((opts.format === "peggy") && (this.first + 1 === this.last)) {
return `${rangeBefore}${first}${last}${rangeAfter}`;
}
return `${rangeBefore}${first}${rangeSep}${last}${rangeAfter}`;
}
}
Expand Down
18 changes: 6 additions & 12 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,26 +50,20 @@
"ci": "npm run coverage && npm run lint"
},
"dependencies": {
"commander": "^12.1.0",
"commander": "^13.0.0",
"peggy": "^4.2.0"
},
"devDependencies": {
"@peggyjs/coverage": "1.3.2",
"@peggyjs/eslint-config": "^5.0.1",
"@typescript-eslint/eslint-plugin": "^8.18.0",
"@typescript-eslint/parser": "^8.18.0",
"@peggyjs/eslint-config": "^5.0.3",
"@typescript-eslint/eslint-plugin": "^8.19.0",
"@typescript-eslint/parser": "^8.19.0",
"ava": "6.2.0",
"c8": "10.1.3",
"eslint": "^9.16.0",
"eslint": "^9.17.0",
"typescript": "^5.7.2"
},
"packageManager": "pnpm@9.15.0",
"pnpm": {
"overrides": {
"cross-spawn": "^7.0.6",
"@eslint/plugin-kit": "^0.2.4"
}
},
"packageManager": "pnpm@9.15.2",
"engines": {
"node": ">=18"
}
Expand Down
Loading

0 comments on commit 2b18852

Please sign in to comment.