Skip to content

Commit

Permalink
Use JSON11 to parse and serialize long numerals
Browse files Browse the repository at this point in the history
Signed-off-by: Miki <miki@amazon.com>
  • Loading branch information
AMoo-Miki committed Jun 3, 2024
1 parent f7e2f81 commit b7f1b51
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 210 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- Bumps `rimraf` from 5.0.5 to 5.0.7
- Bumps `aws4` from 1.12.0 to 1.13.0
### Changed
- Upgraded the parsing and serialization of long numerals to employ JSON11 ([784](https://github.com/opensearch-project/opensearch-js/pull/784)).
### Deprecated
### Removed
### Fixed
Expand Down
241 changes: 36 additions & 205 deletions lib/Serializer.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,60 +34,9 @@ const debug = require('debug')('opensearch');
const sjson = require('secure-json-parse');
const { SerializationError, DeserializationError } = require('./errors');
const kJsonOptions = Symbol('secure json parse options');
const JSON11 = require('json11');

/* In JavaScript, a `Number` is a 64-bit floating-point value which can store 16 digits. However, the
* serializer and deserializer will need to cater to numeric values generated by other languages which
* can have up to 19 digits. Native JSON parser and stringifier, incapable of handling the extra
* digits, corrupt the values, making them unusable.
*
* To work around this limitation, the deserializer converts long sequences of digits into strings and
* marks them before applying the parser. During the parsing, string values that begin with the mark
* are converted to `BigInt` values.
* Similarly, during stringification, the serializer converts `BigInt` values to marked strings and
* when done, it replaces them with plain numerals.
*
* `Number.MAX_SAFE_INTEGER`, 9,007,199,254,740,991, is the largest number that the native methods can
* parse and stringify, and any numeral greater than that would need to be translated using the
* workaround; all 17-digits or longer and only tail-end of the 16-digits need translation. It would
* be unfair to all the 16-digit numbers if the translation applied to `\d{16,}` only to cover the
* less than 10%. Hence, a RegExp is created to only match numerals too long to be a number.
*
* To make the explanation simpler, let's assume that MAX_SAFE_INTEGER is 8921 which has 4 digits.
* Starting from the right, we take each digit onwards, `[<start>-9]`:
* 1) 7922 - 7929: 792[2-9]\d{0}
* 2) 7930 - 7999: 79[3-9]\d{1}
* 9) 9 + 1 = 10 which results in a rollover; no need to do anything.
* 8) 9000 - 9999: [9-9]\d{3}
* Finally we add anything 5 digits or longer: `\d{5,}
*
* PS, a better solution would use AST but considering its performance penalty, RegExp is the next
* the best solution.
*/
const isBigIntSupported = typeof BigInt !== 'undefined';
const maxIntAsString = String(Number.MAX_SAFE_INTEGER);
const maxIntLength = maxIntAsString.length;
// Sub-patterns for each digit
const bigIntMatcherTokens = [`\\d{${maxIntAsString.length + 1},}`];
for (let i = 0; i < maxIntLength; i++) {
if (maxIntAsString[i] !== '9') {
bigIntMatcherTokens.push(
maxIntAsString.substring(0, i) +
`[${parseInt(maxIntAsString[i], 10) + 1}-9]` +
`\\d{${maxIntLength - i - 1}}`
);
}
}

/* The matcher that looks for `": <numerals>, ...}` and `[..., <numeral>, ...]`
*
* The pattern starts by looking for `":` not immediately preceded by a `\`. That should be
* followed by any of the numeric sub-patterns. A comma, end of an array, end of an object, or
* the end of the input are the only acceptable elements after it.
*/
const bigIntMatcher = new RegExp(
`((?:\\[|,|(?<!\\\\)"\\s*:)\\s*)(-?(?:${bigIntMatcherTokens.join('|')}))(\\s*)(?=,|}|]|$)`,
'g'
);

class Serializer {
constructor(opts = {}) {
Expand All @@ -99,154 +48,6 @@ class Serializer {
};
}

/* The characters with a highly unlikely chance of occurrence in strings, alone or in combination.
*
* ToDo: When support for ancient versions of Node.js are dropped, replace with
* _bigIntMarkChars = ['෴', '߷', '֍'];
*/
get _bigIntMarkChars() {
return ['෴', '߷', '֍'];
}

/* Generates an array of all combinations of `_bigIntMarkChars` with the requested length. */
_bigIntMarkerCombinations(length = 3) {
const results = [];
const arr = this._bigIntMarkChars;
const arrLength = arr.length;
const temp = Array(length);

(function fill(pos, start) {
if (pos === length) return results.push(temp.join(''));

for (let i = start; i < arrLength; i++) {
temp[pos] = arr[i];
fill(pos + 1, i);
}
})(0, 0);

return results;
}

/* Experiments with different combinations of various lengths, until one is found to not be in
* the input string.
*/
_getSuitableBigIntMarker(json) {
let bigIntMarker;
let length = 0;
do {
length++;
this._bigIntMarkerCombinations(length).some((marker) => {
if (json.indexOf(marker) === -1) {
bigIntMarker = marker;
return true;
}
});
} while (!bigIntMarker);

return {
bigIntMarker,
length,
};
}

_parseWithBigInt(json) {
const { bigIntMarker, length } = this._getSuitableBigIntMarker(json);

let hadException;
let markedJSON = json.replace(bigIntMatcher, `$1"${bigIntMarker}$2"$3`);

/* RegExp cannot replace AST and the process of marking adds quotes. So, any false-positive hit
* will make the JSON string unparseable.
*
* To find those instances, we try to parse and watch for the location of any errors. If an error
* is caused by the marking, we remove that single marking and try again.
*/
do {
try {
hadException = false;
JSON.parse(markedJSON);
} catch (e) {
hadException = true;
/* There are two types of exception objects that can be raised:
* 1) a proper object with lineNumber and columnNumber which we can use
* 2) a textual message with the position that we need to parse
*/
let { lineNumber, columnNumber } = e;
if (!lineNumber || !columnNumber) {
const match =
// ToDo: When support for ancient versions of Node.js are dropped, replace with
// e?.message?.match?.()
e &&
e.message &&
typeof e.message.match === 'function' &&
e.message.match(/^Unexpected token.*at position (\d+)$/);
if (match) {
lineNumber = 1;
// The position is zero-indexed; adding 1 to normalize it for the -2 that comes later
columnNumber = parseInt(match[1], 10) + 1;
}
}

if (lineNumber < 1 || columnNumber < 2) {
// The problem is not with this replacement; just return a failure.
return;
}

/* We need to skip e.lineNumber - 1 number of `\n` occurrences.
* Then, we need to go to e.columnNumber - 2 to look for `"<mark>\d+"`; we need to `-1` to
* account for the quote but an additional `-1` is needed because columnNumber starts from 1.
*/
const re = new RegExp(
`^((?:.*\\n){${lineNumber - 1}}[^\\n]{${columnNumber - 2}})"${bigIntMarker}(-?\\d+)"`
);
if (!re.test(markedJSON)) {
// The exception is not caused by adding the marker
return;
}

// We have found a bad replacement; let's remove it.
markedJSON = markedJSON.replace(re, '$1$2');
}
} while (hadException);

const bigIntMarkFinder = new RegExp(`^${bigIntMarker}-?\\d+$`);

// Exceptions will trickle up to the caller
return sjson.parse(
markedJSON,
(key, val) =>
/* Convert marked values to BigInt values.
* The `startsWith` is purely for performance, to avoid running `test` if not needed.
*/
typeof val === 'string' && val.startsWith(bigIntMarker) && bigIntMarkFinder.test(val)
? BigInt(val.substring(length)) // eslint-disable-line no-undef
: val,
this[kJsonOptions]
);
}

_stringifyWithBigInt(object, candidate) {
const { bigIntMarker } = this._getSuitableBigIntMarker(candidate);

/* The matcher that looks for "<marker><numerals>"
* Because we have made sure that `bigIntMarker` was never present in the original object, we can
* carelessly assume every "<marker><numerals>" is due to our marking.
*/
const markedBigIntMatcher = new RegExp(`"${bigIntMarker}(-?\\d+)"`, 'g');

return (
JSON.stringify(
object,
/* Convert BigInt values to a string and mark them.
* Can't be bothered with Number values beyond safe values because they are already corrupted.
*/
(key, val) => (typeof val === 'bigint' ? `${bigIntMarker}${val.toString()}` : val)
)
// Replace marked substrings with just the numerals
.replace(markedBigIntMatcher, '$1')
);
}

serialize(object) {
debug('Serializing', object);
let json;
Expand All @@ -262,11 +63,29 @@ class Serializer {
const shouldHandleLongNumerals =
isBigIntSupported && this[kJsonOptions].enableLongNumeralSupport;
try {
/* When handling long numerals is not requested or the platform doesn't support BigInt, the
* result of JSON.stringify are returned.
*
* When long numerals should be handled:
* Use JSON.stringify to check if any value is a BigInt:
* * If no BigInt values are found, the result of JSON.stringify is good enough to be returned.
* * Only If a BigInt value is found, JSON11.stringify is employed and its result is returned.
*/
json = JSON.stringify(object, shouldHandleLongNumerals ? checkForBigInts : null);

if (shouldHandleLongNumerals && !numeralsAreNumbers) {
const temp = this._stringifyWithBigInt(object, json);
if (temp) json = temp;
try {
// With `withBigInt: false`, valid JSON is produced while maintaining accuracy
const temp = JSON11.stringify(object, {
withBigInt: false,
quote: '"',
quoteNames: true,
});
if (temp) json = temp;
} catch (ex) {
// Do nothing: JSON.stringify succeeded but JSON11.stringify failed; return the
// JSON.stringify result.
}
}
} catch (err) {
throw new SerializationError(err.message, object);
Expand All @@ -292,16 +111,28 @@ class Serializer {
const shouldHandleLongNumerals =
isBigIntSupported && this[kJsonOptions].enableLongNumeralSupport;
try {
/* When handling long numerals is not requested or the platform doesn't support BigInt, the
* result of sjson.parse are returned.
*
* When long numerals should be handled:
* Use sjson.parse to check if any value is outside the range of safe integers:
* * If no long numerals are found, the result of sjson.parse is good enough to be returned.
* * Only If long numerals are found, JSON11.parse is employed and its result is returned.
*/
object = sjson.parse(
json,
shouldHandleLongNumerals ? checkForLargeNumerals : null,
this[kJsonOptions]
);

if (shouldHandleLongNumerals && !numeralsAreNumbers) {
const temp = this._parseWithBigInt(json);
if (temp) {
object = temp;
try {
const temp = JSON11.parse(json, null, { withLongNumerals: true });
if (temp) {
object = temp;
}
} catch (ex) {
// Do nothing: sjson.parse succeeded but JSON11.parse failed; return the sjson.parse result
}
}
} catch (err) {
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@
"xmlbuilder2": "^3.0.2"
},
"dependencies": {
"json11": "^1.0.2",
"aws4": "^1.11.0",
"debug": "^4.3.1",
"hpagent": "^1.2.0",
Expand Down
6 changes: 3 additions & 3 deletions test/fixtures/longnumerals-dataset.ndjson
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{"number":18014398509481982,"description":"-18014398509481982 , -1 , 1 , 18014398509481982"}
{"number":-18014398509481982,"description":"෴߷֍෴18014398509481982"}
{"description":"[\"෴߷֍෴18014398509481982\"]", "number":18014398509481982}
{"number":-18014398509481982,"description":"18014398509481982,1,-1,-18014398509481982"}
{"description":"[\"18014398509481982\"]", "number":18014398509481982}
{"description":"18014398509481982", "number":18014398509481982}
{"number":9007199254740891,"description":"Safer than [18014398509481982]"}
{"number":9007199254740891,"description":"Safer than [18014398509481982]"}
4 changes: 2 additions & 2 deletions test/integration/serializer/longnumerals.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ test('long numerals', async (t) => {
}
t.same(object, {
'-18014398509481982 , -1 , 1 , 18014398509481982': 18014398509481982n,
'෴߷֍෴18014398509481982': -18014398509481982n,
'18014398509481982,1,-1,-18014398509481982': -18014398509481982n,
'Safer than [18014398509481982]': 9007199254740891,
18014398509481982: 18014398509481982n,
'["෴߷֍෴18014398509481982"]': 18014398509481982n,
'["18014398509481982"]': 18014398509481982n,
});
});
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2224,6 +2224,11 @@ json-stable-stringify-without-jsonify@^1.0.1:
resolved "https://registry.yarnpkg.com/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz#9db7b59496ad3f3cfef30a75142d2d930ad72651"
integrity sha1-nbe1lJatPzz+8wp1FC0tkwrXJlE=

json11@^1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/json11/-/json11-1.0.2.tgz#98e6a3a01d3ed03d40c3ecc7e945767d23287b6f"
integrity sha512-XnGnLo/fE2gsyK+VSPZCOB0IQlJfQaDVIPd4A+KvK+1inMQbm1eEimu/1JzTZgZ0YO/n2WJcdJGDgPHtiQynjw==

json5@^2.1.2:
version "2.2.3"
resolved "https://registry.yarnpkg.com/json5/-/json5-2.2.3.tgz#78cd6f1a19bdc12b73db5ad0c61efd66c1e29283"
Expand Down

0 comments on commit b7f1b51

Please sign in to comment.