Skip to content

Commit

Permalink
Correctly handle unicode ranges
Browse files Browse the repository at this point in the history
This should correctly generate UTF-16 ranges for
Peggy from any valid ABNF range.
Fixes #25.
  • Loading branch information
hildjj committed Jan 3, 2025
1 parent 32d9245 commit e8d03bc
Show file tree
Hide file tree
Showing 6 changed files with 255 additions and 148 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ examples/*.peggy
examples/*.pest
node_modules
t.abnf
.cert/
2 changes: 1 addition & 1 deletion bin/abnf_gen.js
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,6 @@ program
})
.parseAsync()
.catch(er => {
console.error(er.message);
console.error(er);
process.exit(1);
});
119 changes: 103 additions & 16 deletions lib/ast.js
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,28 @@ function slug(s) {
return s.replace(/-/g, "_");
}

function str(s) {
return `"${s.replace(/[\\"\x00-\x19\x7f-\xff]/g, c => `\\${{
/**
 * Quote a string as a double-quoted grammar literal, escaping characters
 * that cannot appear verbatim.
 *
 * Backslash, double quote, ASCII control characters (0x00-0x1f) and
 * 0x7f-0xff are always escaped, using a short named escape where one exists
 * and `\xNN` otherwise.  When `opts.format` is "peggy", every code point
 * above 0xff is additionally escaped: `\uNNNN` for the BMP and `\u{N...}`
 * for code points above 0xffff.
 *
 * @param {string} s Raw string contents (without surrounding quotes).
 * @param {{format?: string}} [opts] Output options; only `format` is read.
 * @returns {string} The quoted, escaped literal.
 */
function str(s, opts) {
  const named = {
    "\r": "r",
    "\n": "n",
    '"': '"',
    "\t": "t",
    "\v": "x0B",
    "\\": "\\",
  };

  // Control characters run through 0x1f; stopping at 0x19 would leave
  // 0x1a-0x1f unescaped in the output.
  const escaped = s.replace(/[\\"\x00-\x1f\x7f-\xff]/g, c => {
    const hex = c.charCodeAt(0).toString(16).padStart(2, "0");
    return `\\${named[c] ?? `x${hex}`}`;
  });
  let quoted = `"${escaped}"`;

  if (opts?.format === "peggy") {
    // One global, code-point-aware pass.  The `u` flag keeps surrogate pairs
    // together so astral characters become a single `\u{...}` escape, and
    // the `g` flag ensures every occurrence is escaped, not just the first.
    quoted = quoted.replace(/[\u{100}-\u{10ffff}]/gu, c => {
      const cp = c.codePointAt(0);
      const hex = cp.toString(16);
      return (cp > 0xffff) ? `\\u{${hex}}` : `\\u${hex.padStart(4, "0")}`;
    });
  }

  return quoted;
}

function fromArray(opts, a, joiner, needed, parent) {
Expand Down Expand Up @@ -132,7 +145,7 @@ export class CaseInsensitiveString extends Base {
}

toFormat(opts) {
const s = str(this.str);
const s = str(this.str, opts);
if (this.str.match(/[a-z]/i)) {
switch (opts.format) {
case "peggy":
Expand All @@ -155,8 +168,8 @@ export class CaseSensitiveString extends Base {
this.base = base;
}

toFormat(_opts) {
return str(this.str);
toFormat(opts) {
return str(this.str, opts);
}
}

Expand Down Expand Up @@ -299,21 +312,92 @@ export class Range extends Base {
}

static create(base, first, last, loc, utf16 = true) {
if (utf16 && (first <= 0xffff) && (last === 0x10ffff)) {
// Special case "all high Unicode" since it shows up a lot
// This should be generalized.
if (first > last) {
throw new Error(`Range out of order ${first.toString(16)}-${last.toString(16)}`);
}
if (first === last) {
return new CaseSensitiveString(String.fromCodePoint(first), base, loc);
}
if (utf16) {
if (last < 0xd800) {
return new Range(base, first, last, loc);
}
if ((first > 0xd7ff) && (first < 0xe000) && (last < 0xe000)) {
throw new Error(`Range consists of all surrogates ${first.toString(16)}-${last.toString(16)}`);
}

// Remove the range 0xd800-0xdfff. These possible ranges remain:
// - first-0xd7ff
// - 0xe000-0xffff
// - 0x10000-last
const alts = [];

if (first < 0xd800) {
alts.push(new Range(base, first, 0xd7ff, loc));
alts.push(new Range(base, 0xe000, 0xffff, loc));
} else {
alts.push(new Range(base, first, 0xffff, loc));
alts.push(Range.create(base, first, 0xd7ff, loc, false));
}
alts.push(new Concatenation([
new Range(base, 0xd800, 0xdbff, loc),
new Range(base, 0xdc00, 0xdfff, loc),
], loc));
if (last < 0x10000) {
alts.push(
Range.create(base, Math.max(0xe000, first), last, loc, false)
);
} else {
if (first < 0x10000) {
alts.push(
Range.create(base, Math.max(0xe000, first), 0xffff, loc, false)
);
first = 0x10000;
}

// This code follows the logic in regenerate:
// https://github.com/mathiasbynens/regenerate/blob/11567339f40fd262435934d544885bc047cb4220/regenerate.js#L996
// I didn't use regenerate directly because:
// a) I only needed a small part of it
// b) Regenerate will only generate a string, which I then would have
// to parse to get the info I needed out.
// I believe this use is within the spirit of the MIT license.
const firstH = Math.floor((first - 0x10000) / 0x400) + 0xd800;
const firstL = ((first - 0x10000) % 0x400) + 0xdc00;
const lastH = Math.floor((last - 0x10000) / 0x400) + 0xd800;
const lastL = ((last - 0x10000) % 0x400) + 0xdc00;
let complete = false;
if (
(firstH === lastH)
|| ((firstL === 0xdc00) && (lastL === 0xdfff))
) {
alts.push(new Concatenation([
Range.create(base, firstH, lastH, loc, false),
Range.create(base, firstL, lastL, loc, false),
], loc));
complete = true;
} else {
alts.push(new Concatenation([
Range.create(base, firstH, firstH, loc, false),
Range.create(base, firstL, 0xdfff, loc, false),
], loc));
}
// Step 2: the high surrogates strictly between firstH and lastH, each
// paired with the full low-surrogate range.
if (!complete && (firstH + 1 < lastH)) {
  if (lastL === 0xdfff) {
    // The last high surrogate also spans the full low range, so fold it
    // into this alternative (steps 2 and 3 combined).  Range bounds here
    // are inclusive, so the upper bound is lastH itself; regenerate's
    // `endHigh + 1` is the exclusive end of a half-open interval.
    alts.push(new Concatenation([
      Range.create(base, firstH + 1, lastH, loc, false),
      Range.create(base, 0xdc00, lastL, loc, false),
    ], loc));
    complete = true;
  } else {
    alts.push(new Concatenation([
      Range.create(base, firstH + 1, lastH - 1, loc, false),
      Range.create(base, 0xdc00, 0xdfff, loc, false),
    ], loc));
  }
}
if (!complete) {
alts.push(new Concatenation([
Range.create(base, lastH, lastH, loc, false),
Range.create(base, 0xdc00, lastL, loc, false),
], loc));
}
}
if (alts.length === 1) {
return alts[0];
}
return new Alternation(alts, loc);
}
return new Range(base, first, last, loc);
Expand All @@ -339,6 +423,9 @@ export class Range extends Base {
const { rangeBefore, rangeAfter, rangeSep } = delims(opts);
const first = Range.escape(opts, this.first);
const last = Range.escape(opts, this.last);
if (this.first + 1 === this.last) {
return `${rangeBefore}${first}${last}${rangeAfter}`;
}
return `${rangeBefore}${first}${rangeSep}${last}${rangeAfter}`;
}
}
Expand Down
18 changes: 6 additions & 12 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,26 +50,20 @@
"ci": "npm run coverage && npm run lint"
},
"dependencies": {
"commander": "^12.1.0",
"commander": "^13.0.0",
"peggy": "^4.2.0"
},
"devDependencies": {
"@peggyjs/coverage": "1.3.2",
"@peggyjs/eslint-config": "^5.0.1",
"@typescript-eslint/eslint-plugin": "^8.18.0",
"@typescript-eslint/parser": "^8.18.0",
"@peggyjs/eslint-config": "^5.0.3",
"@typescript-eslint/eslint-plugin": "^8.19.0",
"@typescript-eslint/parser": "^8.19.0",
"ava": "6.2.0",
"c8": "10.1.3",
"eslint": "^9.16.0",
"eslint": "^9.17.0",
"typescript": "^5.7.2"
},
"packageManager": "pnpm@9.15.0",
"pnpm": {
"overrides": {
"cross-spawn": "^7.0.6",
"@eslint/plugin-kit": "^0.2.4"
}
},
"packageManager": "pnpm@9.15.2",
"engines": {
"node": ">=18"
}
Expand Down
Loading

0 comments on commit e8d03bc

Please sign in to comment.