From 7a3f0ef888623813af37b1af223802f9057e6e33 Mon Sep 17 00:00:00 2001 From: James M Snell Date: Mon, 19 Jun 2017 13:17:29 -0700 Subject: [PATCH 1/2] util: refactor util module * refactor util exports * early capture of prototype methods * use template strings and args consistently PR-URL: https://github.com/nodejs/node/pull/13803 Reviewed-By: Alexey Orlenko Reviewed-By: Colin Ihrig --- lib/util.js | 297 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 174 insertions(+), 123 deletions(-) diff --git a/lib/util.js b/lib/util.js index 99c7e5e9e056c5..897970d39b1829 100644 --- a/lib/util.js +++ b/lib/util.js @@ -21,13 +21,35 @@ 'use strict'; -const uv = process.binding('uv'); -const Buffer = require('buffer').Buffer; -const internalUtil = require('internal/util'); -const binding = process.binding('util'); const errors = require('internal/errors'); -const isError = internalUtil.isError; +const { errname } = process.binding('uv'); + +const { + getPromiseDetails, + getProxyDetails, + isAnyArrayBuffer, + isDataView, + isExternal, + isMap, + isMapIterator, + isPromise, + isSet, + isSetIterator, + isTypedArray, + isRegExp: _isRegExp, + isDate: _isDate, + kPending, + kRejected, +} = process.binding('util'); + +const { + customInspectSymbol, + deprecate, + getConstructorOf, + isError, + promisify +} = require('internal/util'); const inspectDefaultOptions = Object.seal({ showHidden: false, @@ -41,6 +63,12 @@ const inspectDefaultOptions = Object.seal({ const numbersOnlyRE = /^\d+$/; +const objectHasOwnProperty = Object.prototype.hasOwnProperty; +const propertyIsEnumerable = Object.prototype.propertyIsEnumerable; +const regExpToString = RegExp.prototype.toString; +const dateToISOString = Date.prototype.toISOString; +const errorToString = Error.prototype.toString; + var CIRCULAR_ERROR_MESSAGE; var Debug; @@ -62,7 +90,7 @@ function tryStringify(arg) { } } -exports.format = function(f) { +function format(f) { if (typeof f !== 'string') { const objects = new 
Array(arguments.length); for (var index = 0; index < arguments.length; index++) { @@ -132,26 +160,26 @@ exports.format = function(f) { while (a < arguments.length) { const x = arguments[a++]; if (x === null || (typeof x !== 'object' && typeof x !== 'symbol')) { - str += ' ' + x; + str += ` ${x}`; } else { - str += ' ' + inspect(x); + str += ` ${inspect(x)}`; } } return str; -}; - - -exports.deprecate = internalUtil.deprecate; +} var debugs = {}; var debugEnviron; -exports.debuglog = function(set) { - if (debugEnviron === undefined) - debugEnviron = process.env.NODE_DEBUG || ''; + +function debuglog(set) { + if (debugEnviron === undefined) { + debugEnviron = new Set( + (process.env.NODE_DEBUG || '').split(',').map((s) => s.toUpperCase())); + } set = set.toUpperCase(); if (!debugs[set]) { - if (new RegExp(`\\b${set}\\b`, 'i').test(debugEnviron)) { + if (debugEnviron.has(set)) { var pid = process.pid; debugs[set] = function() { var msg = exports.format.apply(exports, arguments); @@ -162,7 +190,7 @@ exports.debuglog = function(set) { } } return debugs[set]; -}; +} /** @@ -196,6 +224,7 @@ function inspect(obj, opts) { if (ctx.maxArrayLength === null) ctx.maxArrayLength = Infinity; return formatValue(ctx, obj, ctx.depth); } +inspect.custom = customInspectSymbol; Object.defineProperty(inspect, 'defaultOptions', { get: function() { @@ -241,11 +270,6 @@ inspect.styles = Object.assign(Object.create(null), { 'regexp': 'red' }); -const customInspectSymbol = internalUtil.customInspectSymbol; - -exports.inspect = inspect; -exports.inspect.custom = customInspectSymbol; - function stylizeWithColor(str, styleType) { var style = inspect.styles[styleType]; @@ -302,7 +326,7 @@ function formatValue(ctx, value, recurseTimes) { // Otherwise, it'll return an array. The first item // is the target, the second item is the handler. // We ignore (and do not return) the Proxy isRevoked property. 
- proxy = binding.getProxyDetails(value); + proxy = getProxyDetails(value); if (proxy) { // We know for a fact that this isn't a Proxy. // Mark it as having already been evaluated. @@ -320,7 +344,7 @@ function formatValue(ctx, value, recurseTimes) { proxyCache.set(value, proxy); } if (proxy) { - return 'Proxy ' + formatValue(ctx, proxy, recurseTimes); + return `Proxy ${formatValue(ctx, proxy, recurseTimes)}`; } } @@ -358,7 +382,7 @@ function formatValue(ctx, value, recurseTimes) { var visibleKeys = arrayToHash(keys); const symbolKeys = Object.getOwnPropertySymbols(value); const enumSymbolKeys = symbolKeys - .filter((key) => Object.prototype.propertyIsEnumerable.call(value, key)); + .filter((key) => propertyIsEnumerable.call(value, key)); keys = keys.concat(enumSymbolKeys); if (ctx.showHidden) { @@ -391,7 +415,7 @@ function formatValue(ctx, value, recurseTimes) { }); } - var constructor = internalUtil.getConstructorOf(value); + var constructor = getConstructorOf(value); // Some type of object without properties can be shortcutted. if (keys.length === 0) { @@ -401,13 +425,13 @@ function formatValue(ctx, value, recurseTimes) { `[${ctorName}${value.name ? `: ${value.name}` : ''}]`, 'special'); } if (isRegExp(value)) { - return ctx.stylize(RegExp.prototype.toString.call(value), 'regexp'); + return ctx.stylize(regExpToString.call(value), 'regexp'); } if (isDate(value)) { if (Number.isNaN(value.getTime())) { return ctx.stylize(value.toString(), 'date'); } else { - return ctx.stylize(Date.prototype.toISOString.call(value), 'date'); + return ctx.stylize(dateToISOString.call(value), 'date'); } } if (isError(value)) { @@ -433,11 +457,11 @@ function formatValue(ctx, value, recurseTimes) { // Fast path for ArrayBuffer and SharedArrayBuffer. // Can't do the same for DataView because it has a non-primitive // .buffer property that we need to recurse for. 
- if (binding.isAnyArrayBuffer(value)) { + if (isAnyArrayBuffer(value)) { return `${constructor.name}` + ` { byteLength: ${formatNumber(ctx, value.byteLength)} }`; } - if (binding.isExternal(value)) { + if (isExternal(value)) { return ctx.stylize('[External]', 'special'); } } @@ -459,7 +483,7 @@ function formatValue(ctx, value, recurseTimes) { braces = ['[', ']']; empty = value.length === 0; formatter = formatArray; - } else if (binding.isSet(value)) { + } else if (isSet(value)) { braces = ['{', '}']; // With `showHidden`, `length` will display as a hidden property for // arrays. For consistency's sake, do the same for `size`, even though this @@ -468,25 +492,25 @@ function formatValue(ctx, value, recurseTimes) { keys.unshift('size'); empty = value.size === 0; formatter = formatSet; - } else if (binding.isMap(value)) { + } else if (isMap(value)) { braces = ['{', '}']; // Ditto. if (ctx.showHidden) keys.unshift('size'); empty = value.size === 0; formatter = formatMap; - } else if (binding.isAnyArrayBuffer(value)) { + } else if (isAnyArrayBuffer(value)) { braces = ['{', '}']; keys.unshift('byteLength'); visibleKeys.byteLength = true; - } else if (binding.isDataView(value)) { + } else if (isDataView(value)) { braces = ['{', '}']; // .buffer goes last, it's not a primitive like the others. 
keys.unshift('byteLength', 'byteOffset', 'buffer'); visibleKeys.byteLength = true; visibleKeys.byteOffset = true; visibleKeys.buffer = true; - } else if (binding.isTypedArray(value)) { + } else if (isTypedArray(value)) { braces = ['[', ']']; formatter = formatTypedArray; if (ctx.showHidden) { @@ -497,15 +521,15 @@ function formatValue(ctx, value, recurseTimes) { 'byteOffset', 'buffer'); } - } else if (binding.isPromise(value)) { + } else if (isPromise(value)) { braces = ['{', '}']; formatter = formatPromise; - } else if (binding.isMapIterator(value)) { + } else if (isMapIterator(value)) { constructor = { name: 'MapIterator' }; braces = ['{', '}']; empty = false; formatter = formatCollectionIterator; - } else if (binding.isSetIterator(value)) { + } else if (isSetIterator(value)) { constructor = { name: 'SetIterator' }; braces = ['{', '}']; empty = false; @@ -528,17 +552,17 @@ function formatValue(ctx, value, recurseTimes) { // Make RegExps say that they are RegExps if (isRegExp(value)) { - base = ' ' + RegExp.prototype.toString.call(value); + base = ` ${regExpToString.call(value)}`; } // Make dates with properties first say the date if (isDate(value)) { - base = ' ' + Date.prototype.toISOString.call(value); + base = ` ${dateToISOString.call(value)}`; } // Make error with message first say the error if (isError(value)) { - base = ' ' + formatError(value); + base = ` ${formatError(value)}`; } // Make boxed primitive Strings look like such @@ -564,12 +588,12 @@ function formatValue(ctx, value, recurseTimes) { braces[0] = `${constructor.name} ${braces[0]}`; if (empty === true) { - return braces[0] + base + braces[1]; + return `${braces[0]}${base}${braces[1]}`; } if (recurseTimes < 0) { if (isRegExp(value)) { - return ctx.stylize(RegExp.prototype.toString.call(value), 'regexp'); + return ctx.stylize(regExpToString.call(value), 'regexp'); } else if (Array.isArray(value)) { return ctx.stylize('[Array]', 'special'); } else { @@ -591,7 +615,7 @@ function formatNumber(ctx, 
value) { // Format -0 as '-0'. Strict equality won't distinguish 0 from -0. if (Object.is(value, -0)) return ctx.stylize('-0', 'number'); - return ctx.stylize('' + value, 'number'); + return ctx.stylize(`${value}`, 'number'); } @@ -606,18 +630,16 @@ function formatPrimitive(ctx, value) { var type = typeof value; if (type === 'string') { - var simple = '\'' + - JSON.stringify(value) + var simple = JSON.stringify(value) .replace(/^"|"$/g, '') .replace(/'/g, "\\'") - .replace(/\\"/g, '"') + - '\''; - return ctx.stylize(simple, 'string'); + .replace(/\\"/g, '"'); + return ctx.stylize(`'${simple}'`, 'string'); } if (type === 'number') return formatNumber(ctx, value); if (type === 'boolean') - return ctx.stylize('' + value, 'boolean'); + return ctx.stylize(`${value}`, 'boolean'); // es6 symbol primitive if (type === 'symbol') return ctx.stylize(value.toString(), 'symbol'); @@ -634,7 +656,7 @@ function formatPrimitiveNoColor(ctx, value) { function formatError(value) { - return value.stack || `[${Error.prototype.toString.call(value)}]`; + return value.stack || `[${errorToString.call(value)}]`; } @@ -745,15 +767,15 @@ function formatCollectionIterator(ctx, value, recurseTimes, visibleKeys, keys) { function formatPromise(ctx, value, recurseTimes, visibleKeys, keys) { const output = []; - const [state, result] = binding.getPromiseDetails(value); + const [state, result] = getPromiseDetails(value); - if (state === binding.kPending) { + if (state === kPending) { output.push(''); } else { var nextRecurseTimes = recurseTimes === null ? 
null : recurseTimes - 1; var str = formatValue(ctx, result, nextRecurseTimes); - if (state === binding.kRejected) { - output.push(' ' + str); + if (state === kRejected) { + output.push(` ${str}`); } else { output.push(str); } @@ -809,7 +831,7 @@ function formatProperty(ctx, value, recurseTimes, visibleKeys, key, array) { if (array && numbersOnlyRE.test(key)) { return str; } - name = JSON.stringify('' + key); + name = JSON.stringify(`${key}`); if (/^"[a-zA-Z_][a-zA-Z_0-9]*"$/.test(name)) { name = name.substr(1, name.length - 2); name = ctx.stylize(name, 'name'); @@ -836,85 +858,64 @@ function reduceToSingleString(output, base, braces, breakLength) { // If the opening "brace" is too large, like in the case of "Set {", // we need to force the first item to be on the next line or the // items will not line up correctly. - (base === '' && braces[0].length === 1 ? '' : base + '\n ') + + (base === '' && braces[0].length === 1 ? '' : `${base}\n `) + ` ${output.join(',\n ')} ${braces[1]}`; } return `${braces[0]}${base} ${output.join(', ')} ${braces[1]}`; } - -// NOTE: These type checking functions intentionally don't use `instanceof` -// because it is fragile and can be easily faked with `Object.create()`. 
-exports.isArray = Array.isArray; - function isBoolean(arg) { return typeof arg === 'boolean'; } -exports.isBoolean = isBoolean; function isNull(arg) { return arg === null; } -exports.isNull = isNull; function isNullOrUndefined(arg) { return arg === null || arg === undefined; } -exports.isNullOrUndefined = isNullOrUndefined; function isNumber(arg) { return typeof arg === 'number'; } -exports.isNumber = isNumber; function isString(arg) { return typeof arg === 'string'; } -exports.isString = isString; function isSymbol(arg) { return typeof arg === 'symbol'; } -exports.isSymbol = isSymbol; function isUndefined(arg) { return arg === undefined; } -exports.isUndefined = isUndefined; function isRegExp(re) { - return binding.isRegExp(re); + return _isRegExp(re); } -exports.isRegExp = isRegExp; function isObject(arg) { return arg !== null && typeof arg === 'object'; } -exports.isObject = isObject; function isDate(d) { - return binding.isDate(d); + return _isDate(d); } -exports.isDate = isDate; - -exports.isError = isError; function isFunction(arg) { return typeof arg === 'function'; } -exports.isFunction = isFunction; function isPrimitive(arg) { return arg === null || typeof arg !== 'object' && typeof arg !== 'function'; } -exports.isPrimitive = isPrimitive; - -exports.isBuffer = Buffer.isBuffer; function pad(n) { - return n < 10 ? '0' + n.toString(10) : n.toString(10); + return n < 10 ? `0${n.toString(10)}` : n.toString(10); } @@ -932,9 +933,9 @@ function timestamp() { // log is just a thin wrapper to console.log that prepends a timestamp -exports.log = function() { +function log() { console.log('%s - %s', timestamp(), exports.format.apply(exports, arguments)); -}; +} /** @@ -952,7 +953,7 @@ exports.log = function() { * @throws {TypeError} Will error if either constructor is null, or if * the super constructor lacks a prototype. 
*/ -exports.inherits = function(ctor, superCtor) { +function inherits(ctor, superCtor) { if (ctor === undefined || ctor === null) throw new TypeError('The constructor to "inherits" must not be ' + @@ -969,9 +970,9 @@ exports.inherits = function(ctor, superCtor) { ctor.super_ = superCtor; Object.setPrototypeOf(ctor.prototype, superCtor.prototype); -}; +} -exports._extend = function(target, source) { +function _extend(target, source) { // Don't do anything if source isn't an object if (source === null || typeof source !== 'object') return target; @@ -981,59 +982,55 @@ exports._extend = function(target, source) { target[keys[i]] = source[keys[i]]; } return target; -}; +} function hasOwnProperty(obj, prop) { - return Object.prototype.hasOwnProperty.call(obj, prop); + return objectHasOwnProperty.call(obj, prop); } // Deprecated old stuff. -exports.print = internalUtil.deprecate(function() { - for (var i = 0, len = arguments.length; i < len; ++i) { - process.stdout.write(String(arguments[i])); +function print(...args) { + for (var i = 0, len = args.length; i < len; ++i) { + process.stdout.write(String(args[i])); } -}, 'util.print is deprecated. Use console.log instead.', 'DEP0026'); - +} -exports.puts = internalUtil.deprecate(function() { - for (var i = 0, len = arguments.length; i < len; ++i) { - process.stdout.write(arguments[i] + '\n'); +function puts(...args) { + for (var i = 0, len = args.length; i < len; ++i) { + process.stdout.write(`${args[i]}\n`); } -}, 'util.puts is deprecated. Use console.log instead.', 'DEP0027'); - +} -exports.debug = internalUtil.deprecate(function(x) { +function debug(x) { process.stderr.write(`DEBUG: ${x}\n`); -}, 'util.debug is deprecated. 
Use console.error instead.', 'DEP0028'); - +} -exports.error = internalUtil.deprecate(function(x) { - for (var i = 0, len = arguments.length; i < len; ++i) { - process.stderr.write(arguments[i] + '\n'); +function error(...args) { + for (var i = 0, len = args.length; i < len; ++i) { + process.stderr.write(`${args[i]}\n`); } -}, 'util.error is deprecated. Use console.error instead.', 'DEP0029'); - +} -exports._errnoException = function(err, syscall, original) { - var errname = uv.errname(err); - var message = `${syscall} ${errname}`; +function _errnoException(err, syscall, original) { + var name = errname(err); + var message = `${syscall} ${name}`; if (original) - message += ' ' + original; + message += ` ${original}`; var e = new Error(message); - e.code = errname; - e.errno = errname; + e.code = name; + e.errno = name; e.syscall = syscall; return e; -}; +} -exports._exceptionWithHostPort = function(err, - syscall, - address, - port, - additional) { +function _exceptionWithHostPort(err, + syscall, + address, + port, + additional) { var details; if (port && port > 0) { details = `${address}:${port}`; @@ -1050,14 +1047,12 @@ exports._exceptionWithHostPort = function(err, ex.port = port; } return ex; -}; +} // process.versions needs a custom function as some values are lazy-evaluated. -process.versions[exports.inspect.custom] = +process.versions[inspect.custom] = () => exports.format(JSON.parse(JSON.stringify(process.versions))); -exports.promisify = internalUtil.promisify; - function callbackifyOnRejected(reason, cb) { // `!reason` guard inspired by bluebird (Ref: https://goo.gl/t5IS6M). 
// Because `null` is a special error value in callbacks which means "no error @@ -1106,4 +1101,60 @@ function callbackify(original) { return callbackified; } -exports.callbackify = callbackify; +// Keep the `exports =` so that various functions can still be monkeypatched +module.exports = exports = { + _errnoException, + _exceptionWithHostPort, + _extend, + callbackify, + debuglog, + deprecate, + format, + inherits, + inspect, + isArray: Array.isArray, + isBoolean, + isNull, + isNullOrUndefined, + isNumber, + isString, + isSymbol, + isUndefined, + isRegExp, + isObject, + isDate, + isError, + isFunction, + isPrimitive, + log, + promisify, + + // Deprecated Old Stuff + debug: deprecate(debug, + 'util.debug is deprecated. Use console.error instead.', + 'DEP0028'), + error: deprecate(error, + 'util.error is deprecated. Use console.error instead.', + 'DEP0029'), + print: deprecate(print, + 'util.print is deprecated. Use console.log instead.', + 'DEP0026'), + puts: deprecate(puts, + 'util.puts is deprecated. Use console.log instead.', + 'DEP0027') +}; + +// Avoid a circular dependency +var isBuffer; +Object.defineProperty(exports, 'isBuffer', { + configurable: true, + enumerable: true, + get() { + if (!isBuffer) + isBuffer = require('buffer').Buffer.isBuffer; + return isBuffer; + }, + set(val) { + isBuffer = val; + } +}); From e1f4db6ec6e6b095bf40809531f407d041f13b01 Mon Sep 17 00:00:00 2001 From: James M Snell Date: Mon, 12 Jun 2017 08:25:53 -0700 Subject: [PATCH 2/2] util: implement WHATWG Encoding Standard API Provide an (initially experimental) implementation of the WHATWG Encoding Standard API (`TextDecoder` and `TextEncoder`). This is the same API implemented on the browser side. By default, with small-icu, only the UTF-8, UTF-16le and UTF-16be decoders are supported. With full-icu enabled, every encoding other than iso-8859-16 is supported. This provides a basic test, but does not include the full web platform tests. 
Note: many of the web platform tests for this would fail by default because we ship with small-icu by default. A process warning will be emitted on first use to indicate that the API is still experimental. No runtime flag is required to use the feature. Refs: https://encoding.spec.whatwg.org/ PR-URL: https://github.com/nodejs/node/pull/13644 Reviewed-By: Timothy Gu Reviewed-By: Matteo Collina --- doc/api/buffer.md | 14 +- doc/api/util.md | 151 +++++++++ lib/internal/encoding.js | 458 ++++++++++++++++++++++++++ lib/internal/errors.js | 10 + lib/util.js | 3 + node.gyp | 1 + src/node_buffer.cc | 23 ++ src/node_i18n.cc | 155 +++++++++ src/node_i18n.h | 1 + src/node_util.cc | 1 + test/parallel/test-whatwg-encoding.js | 385 ++++++++++++++++++++++ tools/icu/icu-generic.gyp | 9 - 12 files changed, 1195 insertions(+), 16 deletions(-) create mode 100644 lib/internal/encoding.js create mode 100644 test/parallel/test-whatwg-encoding.js diff --git a/doc/api/buffer.md b/doc/api/buffer.md index d73af5fd162dca..f8681c6be8e28e 100644 --- a/doc/api/buffer.md +++ b/doc/api/buffer.md @@ -193,11 +193,12 @@ The character encodings currently supported by Node.js include: * `'hex'` - Encode each byte as two hexadecimal characters. -*Note*: Today's browsers follow the [WHATWG spec] which aliases both 'latin1' -and ISO-8859-1 to win-1252. This means that while doing something like -`http.get()`, if the returned charset is one of those listed in the WHATWG spec -it's possible that the server actually returned win-1252-encoded data, and -using `'latin1'` encoding may incorrectly decode the characters. +*Note*: Today's browsers follow the [WHATWG Encoding Standard][] which aliases +both 'latin1' and ISO-8859-1 to win-1252. 
This means that while doing something +like `http.get()`, if the returned charset is one of those listed in the WHATWG +specification it is possible that the server actually returned +win-1252-encoded data, and using `'latin1'` encoding may incorrectly decode the +characters. ## Buffers and TypedArray + +> Stability: 1 - Experimental + +An implementation of the [WHATWG Encoding Standard][] `TextDecoder` API. + +```js +const decoder = new TextDecoder('shift_jis'); +let string = ''; +let buffer; +while (buffer = getNextChunkSomehow()) { + string += decoder.decode(buffer, { stream: true }); +} +string += decoder.decode(); // end-of-stream +``` + +#### WHATWG Supported Encodings + +Per the [WHATWG Encoding Standard][], the encodings supported by the +`TextDecoder` API are outlined in the tables below. For each encoding, +one or more aliases may be used. Support for some encodings is enabled +only when Node.js is using the full ICU data. + +##### Encodings Supported By Default + +| Encoding | Aliases | +| ----------- | --------------------------------- | +| `'utf8'` | `'unicode-1-1-utf-8'`, `'utf-8'` | +| `'utf-16be'`| | +| `'utf-16le'`| `'utf-16'` | + +##### Encodings Requiring Full-ICU + +| Encoding | Aliases | +| ----------------- | -------------------------------- | +| `'ibm866'` | `'866'`, `'cp866'`, `'csibm866'` | +| `'iso-8859-2'` | `'csisolatin2'`, `'iso-ir-101'`, `'iso8859-2'`, `'iso88592'`, `'iso_8859-2'`, `'iso_8859-2:1987'`, `'l2'`, `'latin2'` | +| `'iso-8859-3'` | `'csisolatin3'`, `'iso-ir-109'`, `'iso8859-3'`, `'iso88593'`, `'iso_8859-3'`, `'iso_8859-3:1988'`, `'l3'`, `'latin3'` | +| `'iso-8859-4'` | `'csisolatin4'`, `'iso-ir-110'`, `'iso8859-4'`, `'iso88594'`, `'iso_8859-4'`, `'iso_8859-4:1988'`, `'l4'`, `'latin4'` | +| `'iso-8859-5'` | `'csisolatincyrillic'`, `'cyrillic'`, `'iso-ir-144'`, `'iso8859-5'`, `'iso88595'`, `'iso_8859-5'`, `'iso_8859-5:1988'`| +| `'iso-8859-6'` | `'arabic'`, `'asmo-708'`, `'csiso88596e'`, `'csiso88596i'`, `'csisolatinarabic'`, 
`'ecma-114'`, `'iso-8859-6-e'`, `'iso-8859-6-i'`, `'iso-ir-127'`, `'iso8859-6'`, `'iso88596'`, `'iso_8859-6'`, `'iso_8859-6:1987'` | +| `'iso-8859-7'` | `'csisolatingreek'`, `'ecma-118'`, `'elot_928'`, `'greek'`, `'greek8'`, `'iso-ir-126'`, `'iso8859-7'`, `'iso88597'`, `'iso_8859-7'`, `'iso_8859-7:1987'`, `'sun_eu_greek'` | +| `'iso-8859-8'` | `'csiso88598e'`, `'csisolatinhebrew'`, `'hebrew'`, `'iso-8859-8-e'`, `'iso-ir-138'`, `'iso8859-8'`, `'iso88598'`, `'iso_8859-8'`, `'iso_8859-8:1988'`, `'visual'` | +| `'iso-8859-8-i'` | `'csiso88598i'`, `'logical'` | +| `'iso-8859-10'` | `'csisolatin6'`, `'iso-ir-157'`, `'iso8859-10'`, `'iso885910'`, `'l6'`, `'latin6'` | +| `'iso-8859-13'` | `'iso8859-13'`, `'iso885913'` | +| `'iso-8859-14'` | `'iso8859-14'`, `'iso885914'` | +| `'iso-8859-15'` | `'csisolatin9'`, `'iso8859-15'`, `'iso885915'`, `'iso_8859-15'`, `'l9'` | +| `'koi8-r'` | `'cskoi8r'`, `'koi'`, `'koi8'`, `'koi8_r'` | +| `'koi8-u'` | `'koi8-ru'` | +| `'macintosh'` | `'csmacintosh'`, `'mac'`, `'x-mac-roman'` | +| `'windows-874'` | `'dos-874'`, `'iso-8859-11'`, `'iso8859-11'`, `'iso885911'`, `'tis-620'` | +| `'windows-1250'` | `'cp1250'`, `'x-cp1250'` | +| `'windows-1251'` | `'cp1251'`, `'x-cp1251'` | +| `'windows-1252'` | `'ansi_x3.4-1968'`, `'ascii'`, `'cp1252'`, `'cp819'`, `'csisolatin1'`, `'ibm819'`, `'iso-8859-1'`, `'iso-ir-100'`, `'iso8859-1'`, `'iso88591'`, `'iso_8859-1'`, `'iso_8859-1:1987'`, `'l1'`, `'latin1'`, `'us-ascii'`, `'x-cp1252'` | +| `'windows-1253'` | `'cp1253'`, `'x-cp1253'` | +| `'windows-1254'` | `'cp1254'`, `'csisolatin5'`, `'iso-8859-9'`, `'iso-ir-148'`, `'iso8859-9'`, `'iso88599'`, `'iso_8859-9'`, `'iso_8859-9:1989'`, `'l5'`, `'latin5'`, `'x-cp1254'` | +| `'windows-1255'` | `'cp1255'`, `'x-cp1255'` | +| `'windows-1256'` | `'cp1256'`, `'x-cp1256'` | +| `'windows-1257'` | `'cp1257'`, `'x-cp1257'` | +| `'windows-1258'` | `'cp1258'`, `'x-cp1258'` | +| `'x-mac-cyrillic'`| `'x-mac-ukrainian'` | +| `'gbk'` | `'chinese'`, `'csgb2312'`, 
`'csiso58gb231280'`, `'gb2312'`, `'gb_2312'`, `'gb_2312-80'`, `'iso-ir-58'`, `'x-gbk'` | +| `'gb18030'` | | +| `'big5'` | `'big5-hkscs'`, `'cn-big5'`, `'csbig5'`, `'x-x-big5'` | +| `'euc-jp'` | `'cseucpkdfmtjapanese'`, `'x-euc-jp'` | +| `'iso-2022-jp'` | `'csiso2022jp'` | +| `'shift_jis'` | `'csshiftjis'`, `'ms932'`, `'ms_kanji'`, `'shift-jis'`, `'sjis'`, `'windows-31j'`, `'x-sjis'` | +| `'euc-kr'` | `'cseuckr'`, `'csksc56011987'`, `'iso-ir-149'`, `'korean'`, `'ks_c_5601-1987'`, `'ks_c_5601-1989'`, `'ksc5601'`, `'ksc_5601'`, `'windows-949'` | + +*Note*: The `'iso-8859-16'` encoding listed in the [WHATWG Encoding Standard][] +is not supported. + +#### new TextDecoder([encoding[, options]]) + +* `encoding` {string} Identifies the `encoding` that this `TextDecoder` instance + supports. Defaults to `'utf-8'`. +* `options` {Object} + * `fatal` {boolean} `true` if decoding failures are fatal. Defaults to + `false`. + * `ignoreBOM` {boolean} When `true`, the `TextDecoder` will include the byte + order mark in the decoded result. When `false`, the byte order mark will + be removed from the output. This option is only used when `encoding` is + `'utf-8'`, `'utf-16be'` or `'utf-16le'`. Defaults to `false`. + +Creates a new `TextDecoder` instance. The `encoding` may specify one of the +supported encodings or an alias. + +#### textDecoder.decode([input[, options]]) + +* `input` {ArrayBuffer|DataView|TypedArray} An `ArrayBuffer`, `DataView` or + Typed Array instance containing the encoded data. +* `options` {Object} + * `stream` {boolean} `true` if additional chunks of data are expected. + Defaults to `false`. +* Returns: {string} + +Decodes the `input` and returns a string. If `options.stream` is `true`, any +incomplete byte sequences occurring at the end of the `input` are buffered +internally and emitted after the next call to `textDecoder.decode()`. + +If `textDecoder.fatal` is `true`, decoding errors that occur will result in a +`TypeError` being thrown. 
+ +#### textDecoder.encoding + +* Value: {string} + +The encoding supported by the `TextDecoder` instance. + +#### textDecoder.fatal + +* Value: {boolean} + +The value will be `true` if decoding errors result in a `TypeError` being +thrown. + +#### textDecoder.ignoreBOM + +* Value: {boolean} + +The value will be `true` if the decoding result will include the byte order +mark. + +### Class: util.TextEncoder + + +> Stability: 1 - Experimental + +An implementation of the [WHATWG Encoding Standard][] `TextEncoder` API. All +instances of `TextEncoder` only support `UTF-8` encoding. + +```js +const encoder = new TextEncoder(); +const uint8array = encoder.encode('this is some data'); +``` + +#### textEncoder.encode([input]) + +* `input` {string} The text to encode. Defaults to an empty string. +* Returns: {Uint8Array} + +UTF-8 Encodes the `input` string and returns a `Uint8Array` containing the +encoded bytes. + ## Deprecated APIs The following APIs have been deprecated and should no longer be used. Existing @@ -1022,3 +1172,4 @@ Deprecated predecessor of `console.log`. 
[Custom promisified functions]: #util_custom_promisified_functions [constructor]: https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/Object/constructor [semantically incompatible]: https://github.com/nodejs/node/issues/4179 +[WHATWG Encoding Standard]: https://encoding.spec.whatwg.org/ diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js new file mode 100644 index 00000000000000..22ae5c6c0db1ab --- /dev/null +++ b/lib/internal/encoding.js @@ -0,0 +1,458 @@ +'use strict'; + +// An implementation of the WHATWG Encoding Standard +// https://encoding.spec.whatwg.org + +const errors = require('internal/errors'); +const kHandle = Symbol('handle'); +const kFlags = Symbol('flags'); +const kEncoding = Symbol('encoding'); +const kDecoder = Symbol('decoder'); +const kEncoder = Symbol('encoder'); + +let warned = false; +const experimental = + 'The WHATWG Encoding Standard implementation is an experimental API. It ' + + 'should not yet be used in production applications.'; + +const { + getConstructorOf, + customInspectSymbol: inspect +} = require('internal/util'); + +const { + isArrayBuffer +} = process.binding('util'); + +const { + encodeUtf8String +} = process.binding('buffer'); + +const { + decode: _decode, + getConverter, + hasConverter +} = process.binding('icu'); + +const CONVERTER_FLAGS_FLUSH = 0x1; +const CONVERTER_FLAGS_FATAL = 0x2; +const CONVERTER_FLAGS_IGNORE_BOM = 0x4; + +const empty = new Uint8Array(0); + +const encodings = new Map([ + ['unicode-1-1-utf-8', 'utf-8'], + ['utf8', 'utf-8'], + ['utf-8', 'utf-8'], + ['866', 'ibm866'], + ['cp866', 'ibm866'], + ['csibm866', 'ibm866'], + ['ibm866', 'ibm866'], + ['csisolatin2', 'iso-8859-2'], + ['iso-8859-2', 'iso-8859-2'], + ['iso-ir-101', 'iso-8859-2'], + ['iso8859-2', 'iso-8859-2'], + ['iso88592', 'iso-8859-2'], + ['iso_8859-2', 'iso-8859-2'], + ['iso_8859-2:1987', 'iso-8859-2'], + ['l2', 'iso-8859-2'], + ['latin2', 'iso-8859-2'], + ['csisolatin3', 'iso-8859-3'], + ['iso-8859-3', 
'iso-8859-3'], + ['iso-ir-109', 'iso-8859-3'], + ['iso8859-3', 'iso-8859-3'], + ['iso88593', 'iso-8859-3'], + ['iso_8859-3', 'iso-8859-3'], + ['iso_8859-3:1988', 'iso-8859-3'], + ['l3', 'iso-8859-3'], + ['latin3', 'iso-8859-3'], + ['csisolatin4', 'iso-8859-4'], + ['iso-8859-4', 'iso-8859-4'], + ['iso-ir-110', 'iso-8859-4'], + ['iso8859-4', 'iso-8859-4'], + ['iso88594', 'iso-8859-4'], + ['iso_8859-4', 'iso-8859-4'], + ['iso_8859-4:1988', 'iso-8859-4'], + ['l4', 'iso-8859-4'], + ['latin4', 'iso-8859-4'], + ['csisolatincyrillic', 'iso-8859-5'], + ['cyrillic', 'iso-8859-5'], + ['iso-8859-5', 'iso-8859-5'], + ['iso-ir-144', 'iso-8859-5'], + ['iso8859-5', 'iso-8859-5'], + ['iso88595', 'iso-8859-5'], + ['iso_8859-5', 'iso-8859-5'], + ['iso_8859-5:1988', 'iso-8859-5'], + ['arabic', 'iso-8859-6'], + ['asmo-708', 'iso-8859-6'], + ['csiso88596e', 'iso-8859-6'], + ['csiso88596i', 'iso-8859-6'], + ['csisolatinarabic', 'iso-8859-6'], + ['ecma-114', 'iso-8859-6'], + ['iso-8859-6', 'iso-8859-6'], + ['iso-8859-6-e', 'iso-8859-6'], + ['iso-8859-6-i', 'iso-8859-6'], + ['iso-ir-127', 'iso-8859-6'], + ['iso8859-6', 'iso-8859-6'], + ['iso88596', 'iso-8859-6'], + ['iso_8859-6', 'iso-8859-6'], + ['iso_8859-6:1987', 'iso-8859-6'], + ['csisolatingreek', 'iso-8859-7'], + ['ecma-118', 'iso-8859-7'], + ['elot_928', 'iso-8859-7'], + ['greek', 'iso-8859-7'], + ['greek8', 'iso-8859-7'], + ['iso-8859-7', 'iso-8859-7'], + ['iso-ir-126', 'iso-8859-7'], + ['iso8859-7', 'iso-8859-7'], + ['iso88597', 'iso-8859-7'], + ['iso_8859-7', 'iso-8859-7'], + ['iso_8859-7:1987', 'iso-8859-7'], + ['sun_eu_greek', 'iso-8859-7'], + ['csiso88598e', 'iso-8859-8'], + ['csisolatinhebrew', 'iso-8859-8'], + ['hebrew', 'iso-8859-8'], + ['iso-8859-8', 'iso-8859-8'], + ['iso-8859-8-e', 'iso-8859-8'], + ['iso-ir-138', 'iso-8859-8'], + ['iso8859-8', 'iso-8859-8'], + ['iso88598', 'iso-8859-8'], + ['iso_8859-8', 'iso-8859-8'], + ['iso_8859-8:1988', 'iso-8859-8'], + ['visual', 'iso-8859-8'], + ['csiso88598i', 'iso-8859-8-i'], + 
['iso-8859-8-i', 'iso-8859-8-i'], + ['logical', 'iso-8859-8-i'], + ['csisolatin6', 'iso-8859-10'], + ['iso-8859-10', 'iso-8859-10'], + ['iso-ir-157', 'iso-8859-10'], + ['iso8859-10', 'iso-8859-10'], + ['iso885910', 'iso-8859-10'], + ['l6', 'iso-8859-10'], + ['latin6', 'iso-8859-10'], + ['iso-8859-13', 'iso-8859-13'], + ['iso8859-13', 'iso-8859-13'], + ['iso885913', 'iso-8859-13'], + ['iso-8859-14', 'iso-8859-14'], + ['iso8859-14', 'iso-8859-14'], + ['iso885914', 'iso-8859-14'], + ['csisolatin9', 'iso-8859-15'], + ['iso-8859-15', 'iso-8859-15'], + ['iso8859-15', 'iso-8859-15'], + ['iso885915', 'iso-8859-15'], + ['iso_8859-15', 'iso-8859-15'], + ['l9', 'iso-8859-15'], + ['cskoi8r', 'koi8-r'], + ['koi', 'koi8-r'], + ['koi8', 'koi8-r'], + ['koi8-r', 'koi8-r'], + ['koi8_r', 'koi8-r'], + ['koi8-ru', 'koi8-u'], + ['koi8-u', 'koi8-u'], + ['csmacintosh', 'macintosh'], + ['mac', 'macintosh'], + ['macintosh', 'macintosh'], + ['x-mac-roman', 'macintosh'], + ['dos-874', 'windows-874'], + ['iso-8859-11', 'windows-874'], + ['iso8859-11', 'windows-874'], + ['iso885911', 'windows-874'], + ['tis-620', 'windows-874'], + ['windows-874', 'windows-874'], + ['cp1250', 'windows-1250'], + ['windows-1250', 'windows-1250'], + ['x-cp1250', 'windows-1250'], + ['cp1251', 'windows-1251'], + ['windows-1251', 'windows-1251'], + ['x-cp1251', 'windows-1251'], + ['ansi_x3.4-1968', 'windows-1252'], + ['ascii', 'windows-1252'], + ['cp1252', 'windows-1252'], + ['cp819', 'windows-1252'], + ['csisolatin1', 'windows-1252'], + ['ibm819', 'windows-1252'], + ['iso-8859-1', 'windows-1252'], + ['iso-ir-100', 'windows-1252'], + ['iso8859-1', 'windows-1252'], + ['iso88591', 'windows-1252'], + ['iso_8859-1', 'windows-1252'], + ['iso_8859-1:1987', 'windows-1252'], + ['l1', 'windows-1252'], + ['latin1', 'windows-1252'], + ['us-ascii', 'windows-1252'], + ['windows-1252', 'windows-1252'], + ['x-cp1252', 'windows-1252'], + ['cp1253', 'windows-1253'], + ['windows-1253', 'windows-1253'], + ['x-cp1253', 'windows-1253'], + 
['cp1254', 'windows-1254'], + ['csisolatin5', 'windows-1254'], + ['iso-8859-9', 'windows-1254'], + ['iso-ir-148', 'windows-1254'], + ['iso8859-9', 'windows-1254'], + ['iso88599', 'windows-1254'], + ['iso_8859-9', 'windows-1254'], + ['iso_8859-9:1989', 'windows-1254'], + ['l5', 'windows-1254'], + ['latin5', 'windows-1254'], + ['windows-1254', 'windows-1254'], + ['x-cp1254', 'windows-1254'], + ['cp1255', 'windows-1255'], + ['windows-1255', 'windows-1255'], + ['x-cp1255', 'windows-1255'], + ['cp1256', 'windows-1256'], + ['windows-1256', 'windows-1256'], + ['x-cp1256', 'windows-1256'], + ['cp1257', 'windows-1257'], + ['windows-1257', 'windows-1257'], + ['x-cp1257', 'windows-1257'], + ['cp1258', 'windows-1258'], + ['windows-1258', 'windows-1258'], + ['x-cp1258', 'windows-1258'], + ['x-mac-cyrillic', 'x-mac-cyrillic'], + ['x-mac-ukrainian', 'x-mac-cyrillic'], + ['chinese', 'gbk'], + ['csgb2312', 'gbk'], + ['csiso58gb231280', 'gbk'], + ['gb2312', 'gbk'], + ['gb_2312', 'gbk'], + ['gb_2312-80', 'gbk'], + ['gbk', 'gbk'], + ['iso-ir-58', 'gbk'], + ['x-gbk', 'gbk'], + ['gb18030', 'gb18030'], + ['big5', 'big5'], + ['big5-hkscs', 'big5'], + ['cn-big5', 'big5'], + ['csbig5', 'big5'], + ['x-x-big5', 'big5'], + ['cseucpkdfmtjapanese', 'euc-jp'], + ['euc-jp', 'euc-jp'], + ['x-euc-jp', 'euc-jp'], + ['csiso2022jp', 'iso-2022-jp'], + ['iso-2022-jp', 'iso-2022-jp'], + ['csshiftjis', 'shift_jis'], + ['ms932', 'shift_jis'], + ['ms_kanji', 'shift_jis'], + ['shift-jis', 'shift_jis'], + ['shift_jis', 'shift_jis'], + ['sjis', 'shift_jis'], + ['windows-31j', 'shift_jis'], + ['x-sjis', 'shift_jis'], + ['cseuckr', 'euc-kr'], + ['csksc56011987', 'euc-kr'], + ['euc-kr', 'euc-kr'], + ['iso-ir-149', 'euc-kr'], + ['korean', 'euc-kr'], + ['ks_c_5601-1987', 'euc-kr'], + ['ks_c_5601-1989', 'euc-kr'], + ['ksc5601', 'euc-kr'], + ['ksc_5601', 'euc-kr'], + ['windows-949', 'euc-kr'], + ['utf-16be', 'utf-16be'], + ['utf-16le', 'utf-16le'], + ['utf-16', 'utf-16le'] +]); + +// Unfortunately, 
String.prototype.trim also removes non-ascii whitespace,
+// so we have to do this manually.
+//
+// Returns `label` without leading and trailing ASCII whitespace
+// (TAB, LF, FF, CR and SPACE); every other character is kept.
+function trimAsciiWhitespace(label) {
+  var s = 0;
+  var e = label.length;
+  while (s < e && (
+    label[s] === '\u0009' ||
+    label[s] === '\u000a' ||
+    label[s] === '\u000c' ||
+    label[s] === '\u000d' ||
+    label[s] === '\u0020')) {
+    s++;
+  }
+  while (e > s && (
+    label[e - 1] === '\u0009' ||
+    label[e - 1] === '\u000a' ||
+    label[e - 1] === '\u000c' ||
+    label[e - 1] === '\u000d' ||
+    label[e - 1] === '\u0020')) {
+    e--;
+  }
+  return label.slice(s, e);
+}
+
+// Maps an encoding label to the encoding name stored in `encodings`.
+// An exact match is tried first; otherwise the label is lower-cased and
+// stripped of ASCII whitespace before a second lookup. Returns undefined
+// when neither lookup finds the label.
+function getEncodingFromLabel(label) {
+  const enc = encodings.get(label);
+  if (enc !== undefined) return enc;
+  return encodings.get(trimAsciiWhitespace(label.toLowerCase()));
+}
+
+// Returns true when `encoding` is a known label and hasConverter() reports a
+// converter for the encoding it maps to, false otherwise.
+// Throws a TypeError (ERR_INVALID_ARG_TYPE) when `encoding` is not a string.
+function hasTextDecoder(encoding = 'utf-8') {
+  if (typeof encoding !== 'string')
+    throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'encoding', 'string');
+  const enc = getEncodingFromLabel(encoding);
+  // Unknown labels resolve to undefined; answer directly instead of handing
+  // undefined to the native binding.
+  return enc !== undefined && hasConverter(enc);
+}
+
+// Loads the Buffer constructor on first use and caches it.
+var Buffer;
+function lazyBuffer() {
+  if (Buffer === undefined)
+    Buffer = require('buffer').Buffer;
+  return Buffer;
+}
+
+// Decoder backed by a native converter handle obtained from getConverter().
+// Every accessor and method checks the kDecoder brand on `this` and throws
+// ERR_INVALID_THIS when it is missing.
+class TextDecoder {
+  // encoding: label of the encoding to decode; coerced to a string.
+  // options:  object (or null) whose `fatal` and `ignoreBOM` members are read
+  //           as booleans.
+  // Throws a TypeError (ERR_INVALID_ARG_TYPE) when `options` is not an object
+  // and a RangeError (ERR_ENCODING_NOT_SUPPORTED) when the label is unknown
+  // or no converter could be created for it.
+  constructor(encoding = 'utf-8', options = {}) {
+    if (!warned) {
+      warned = true;
+      process.emitWarning(experimental, 'ExperimentalWarning');
+    }
+
+    encoding = `${encoding}`;
+    if (typeof options !== 'object')
+      throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'options', 'object');
+
+    const enc = getEncodingFromLabel(encoding);
+    if (enc === undefined)
+      throw new errors.RangeError('ERR_ENCODING_NOT_SUPPORTED', encoding);
+
+    var flags = 0;
+    if (options !== null) {
+      flags |= options.fatal ? CONVERTER_FLAGS_FATAL : 0;
+      flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0;
+    }
+
+    // getConverter() returns undefined when the native side could not open
+    // a converter; report that the same way as an unknown label.
+    const handle = getConverter(enc, flags);
+    if (handle === undefined)
+      throw new errors.RangeError('ERR_ENCODING_NOT_SUPPORTED', encoding);
+
+    this[kHandle] = handle;
+    this[kFlags] = flags;
+    this[kEncoding] = enc;
+  }
+
+  // The encoding name the constructor label resolved to.
+  get encoding() {
+    if (this == null || this[kDecoder] !== true)
+      throw new errors.TypeError('ERR_INVALID_THIS', 'TextDecoder');
+    return this[kEncoding];
+  }
+
+  // True when the decoder was created with a truthy `fatal` option.
+  get fatal() {
+    if (this == null || this[kDecoder] !== true)
+      throw new errors.TypeError('ERR_INVALID_THIS', 'TextDecoder');
+    return (this[kFlags] & CONVERTER_FLAGS_FATAL) === CONVERTER_FLAGS_FATAL;
+  }
+
+  // True when the decoder was created with a truthy `ignoreBOM` option.
+  get ignoreBOM() {
+    if (this == null || this[kDecoder] !== true)
+      throw new errors.TypeError('ERR_INVALID_THIS', 'TextDecoder');
+    return (this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM) ===
+           CONVERTER_FLAGS_IGNORE_BOM;
+  }
+
+  // Decodes `input` (an ArrayBuffer or ArrayBufferView) and returns a string.
+  // options.stream: when truthy the converter is not flushed, so the next
+  // call continues where this input ended.
+  // Throws a TypeError for an invalid `this`, `input` or `options`, and a
+  // TypeError (ERR_ENCODING_INVALID_ENCODED_DATA, with `errno` set to the
+  // native status) when the native decode reports an error.
+  decode(input = empty, options = {}) {
+    if (this == null || this[kDecoder] !== true)
+      throw new errors.TypeError('ERR_INVALID_THIS', 'TextDecoder');
+    if (isArrayBuffer(input)) {
+      input = lazyBuffer().from(input);
+    } else if (!ArrayBuffer.isView(input)) {
+      throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'input',
+                                 ['ArrayBuffer', 'ArrayBufferView']);
+    }
+    if (typeof options !== 'object') {
+      throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'options', 'object');
+    }
+
+    // A null options bag is treated like an empty one: `stream` defaults to
+    // false, so the converter is flushed.
+    const stream = options !== null && options.stream;
+    const flags = stream ? 0 : CONVERTER_FLAGS_FLUSH;
+
+    // The native decode returns a number (status code) on failure.
+    const ret = _decode(this[kHandle], input, flags);
+    if (typeof ret === 'number') {
+      const err = new errors.TypeError('ERR_ENCODING_INVALID_ENCODED_DATA',
+                                       this.encoding);
+      err.errno = ret;
+      throw err;
+    }
+    return ret.toString('ucs2');
+  }
+
+  // Custom inspection hook: shows encoding, fatal and ignoreBOM, plus the
+  // internal flags and handle when `opts.showHidden` is set.
+  [inspect](depth, opts) {
+    if (this == null || this[kDecoder] !== true)
+      throw new errors.TypeError('ERR_INVALID_THIS', 'TextDecoder');
+    if (typeof depth === 'number' && depth < 0)
+      return opts.stylize('[Object]', 'special');
+    var ctor = getConstructorOf(this);
+    var obj = Object.create({
+      constructor: ctor === null ? TextDecoder : ctor
+    });
+    obj.encoding = this.encoding;
+    obj.fatal = this.fatal;
+    obj.ignoreBOM = this.ignoreBOM;
+    if (opts.showHidden) {
+      obj[kFlags] = this[kFlags];
+      obj[kHandle] = this[kHandle];
+    }
+    // Lazy to avoid circular dependency
+    return require('util').inspect(obj, opts);
+  }
+}
+
+// UTF-8 only encoder. Every accessor and method checks the kEncoder brand on
+// `this` and throws ERR_INVALID_THIS when it is missing.
+class TextEncoder {
+  constructor() {
+    if (!warned) {
+      warned = true;
+      process.emitWarning(experimental, 'ExperimentalWarning');
+    }
+  }
+
+  // Always 'utf-8'.
+  get encoding() {
+    if (this == null || this[kEncoder] !== true)
+      throw new errors.TypeError('ERR_INVALID_THIS', 'TextEncoder');
+    return 'utf-8';
+  }
+
+  // Coerces `input` to a string and returns the result of encodeUtf8String().
+  encode(input = '') {
+    if (this == null || this[kEncoder] !== true)
+      throw new errors.TypeError('ERR_INVALID_THIS', 'TextEncoder');
+    return encodeUtf8String(`${input}`);
+  }
+
+  // Custom inspection hook: shows only the encoding.
+  [inspect](depth, opts) {
+    if (this == null || this[kEncoder] !== true)
+      throw new errors.TypeError('ERR_INVALID_THIS', 'TextEncoder');
+    if (typeof depth === 'number' && depth < 0)
+      return opts.stylize('[Object]', 'special');
+    var ctor = getConstructorOf(this);
+    var obj = Object.create({
+      constructor: ctor === null ? TextEncoder : ctor
+    });
+    obj.encoding = this.encoding;
+    // Lazy to avoid circular dependency
+    return require('util').inspect(obj, opts);
+  }
+}
+
+// Install the non-enumerable brand the `this` checks rely on, and make the
+// public members enumerable.
+Object.defineProperties(
+  TextDecoder.prototype, {
+    [kDecoder]: { enumerable: false, value: true, configurable: false },
+    'decode': { enumerable: true },
+    'encoding': { enumerable: true },
+    'fatal': { enumerable: true },
+    'ignoreBOM': { enumerable: true },
+    [Symbol.toStringTag]: {
+      configurable: true,
+      value: 'TextDecoder'
+    } });
+Object.defineProperties(
+  TextEncoder.prototype, {
+    [kEncoder]: { enumerable: false, value: true, configurable: false },
+    'encode': { enumerable: true },
+    'encoding': { enumerable: true },
+    [Symbol.toStringTag]: {
+      configurable: true,
+      value: 'TextEncoder'
+    } });
+
+module.exports = {
+  getEncodingFromLabel,
+  hasTextDecoder,
+  TextDecoder,
+  TextEncoder
+};
diff --git a/lib/internal/errors.js b/lib/internal/errors.js index f519397be252b1..3ee34cf428939a 100644 --- a/lib/internal/errors.js +++ b/lib/internal/errors.js @@ -109,7 +109,17 @@ module.exports = exports = { // Note: Please try to keep these in alphabetical order E('ERR_ARG_NOT_ITERABLE', '%s must be iterable'); E('ERR_ASSERTION', (msg) => msg); +E('ERR_ENCODING_INVALID_ENCODED_DATA', + (enc) => `The encoded data was not valid for encoding ${enc}`); +E('ERR_ENCODING_NOT_SUPPORTED', + (enc) => `The "${enc}" encoding is not supported`); E('ERR_FALSY_VALUE_REJECTION', 'Promise was rejected with falsy value'); +E('ERR_HTTP_HEADERS_SENT', + 'Cannot render headers after they are sent to the client'); +E('ERR_HTTP_INVALID_STATUS_CODE', 'Invalid status code: %s'); +E('ERR_HTTP_TRAILER_INVALID', + 'Trailers are invalid with this transfer encoding'); +E('ERR_INDEX_OUT_OF_RANGE', 'Index out of range'); E('ERR_INVALID_ARG_TYPE', invalidArgType); E('ERR_INVALID_CALLBACK', 'callback must be a function'); E('ERR_INVALID_FD', (fd) => `"fd" must be a positive integer: ${fd}`); diff --git a/lib/util.js b/lib/util.js index 
897970d39b1829..9db58e5458d8db 100644 --- a/lib/util.js +++ b/lib/util.js @@ -22,6 +22,7 @@ 'use strict'; const errors = require('internal/errors'); +const { TextDecoder, TextEncoder } = require('internal/encoding'); const { errname } = process.binding('uv'); @@ -1128,6 +1129,8 @@ module.exports = exports = { isPrimitive, log, promisify, + TextDecoder, + TextEncoder, // Deprecated Old Stuff debug: deprecate(debug, diff --git a/node.gyp b/node.gyp index 7879ef2524d01f..1650f1598bf02a 100644 --- a/node.gyp +++ b/node.gyp @@ -82,6 +82,7 @@ 'lib/internal/cluster/shared_handle.js', 'lib/internal/cluster/utils.js', 'lib/internal/cluster/worker.js', + 'lib/internal/encoding.js', 'lib/internal/errors.js', 'lib/internal/freelist.js', 'lib/internal/fs.js', diff --git a/src/node_buffer.cc b/src/node_buffer.cc index a88d0e86732252..d6e4aa9da072db 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -1200,6 +1200,36 @@ void Swap64(const FunctionCallbackInfo& args) { }
+// Encode a single string to a UTF-8 Uint8Array (not Buffer).
+// Used in TextEncoder.prototype.encode.
+//
+// args[0] must be a string (enforced with CHECK). The return value is a
+// Uint8Array over a new ArrayBuffer that holds exactly Utf8Length() bytes,
+// with no trailing NUL. Throws a JavaScript Error if allocation fails.
+static void EncodeUtf8String(const FunctionCallbackInfo<Value>& args) {
+  Environment* env = Environment::GetCurrent(args);
+  CHECK_GE(args.Length(), 1);
+  CHECK(args[0]->IsString());
+
+  Local<String> str = args[0].As<String>();
+  size_t length = str->Utf8Length();
+  char* data = node::UncheckedMalloc(length);
+  // UncheckedMalloc() reports failure by returning nullptr; writing
+  // through it would crash, so raise a JavaScript error instead.
+  if (data == nullptr && length > 0)
+    return env->ThrowError("Array buffer allocation failed");
+  str->WriteUtf8(data,
+                 -1,   // We are certain that `data` is sufficiently large
+                 NULL,
+                 String::NO_NULL_TERMINATION | String::REPLACE_INVALID_UTF8);
+  // kInternalized hands ownership of `data` to the ArrayBuffer.
+  auto array_buf = ArrayBuffer::New(env->isolate(), data, length,
+                                    ArrayBufferCreationMode::kInternalized);
+  auto array = Uint8Array::New(array_buf, 0, length);
+  args.GetReturnValue().Set(array);
+}
+
+
 // pass Buffer object to load prototype methods void SetupBufferJS(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); @@ -1266,6 +1296,8 @@ void Initialize(Local target, env->SetMethod(target, "swap32", Swap32); env->SetMethod(target, "swap64", Swap64); + env->SetMethod(target, "encodeUtf8String", EncodeUtf8String); + target->Set(env->context(), FIXED_ONE_BYTE_STRING(env->isolate(), "kMaxLength"), Integer::NewFromUnsigned(env->isolate(), kMaxLength)).FromJust(); diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 3b337449495f4c..2e1aeaa4cb07c5 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -50,6 +50,8 @@ #include "env-inl.h" #include "util.h" #include "util-inl.h" +#include "base-object.h" +#include "base-object-inl.h" #include "v8.h" #include @@ -86,10 +88,12 @@ namespace node { using v8::Context; using v8::FunctionCallbackInfo; +using v8::HandleScope; using v8::Isolate; using v8::Local; using v8::MaybeLocal; using v8::Object; +using v8::ObjectTemplate; using v8::String; using v8::Value; @@ -123,6 +127,15 @@ struct Converter { } } + explicit Converter(UConverter* converter, + const char* sub = NULL) : conv(converter) { + CHECK_NE(conv, nullptr); + UErrorCode status = U_ZERO_ERROR; + if (sub != NULL) { + ucnv_setSubstChars(conv, sub, strlen(sub), &status); + } + } + ~Converter() { 
ucnv_close(conv); } @@ -130,6 +143,159 @@ struct Converter { UConverter* conv; };
+// JavaScript-visible wrapper around an ICU UConverter. Has(), Create()
+// and Decode() are exposed on the binding by Init() as `hasConverter`,
+// `getConverter` and `decode`.
+class ConverterObject : public BaseObject, Converter {
+ public:
+  // Bit flags passed from JavaScript in the `flags` argument of
+  // Create() and Decode().
+  enum ConverterFlags {
+    CONVERTER_FLAGS_FLUSH = 0x1,
+    CONVERTER_FLAGS_FATAL = 0x2,
+    CONVERTER_FLAGS_IGNORE_BOM = 0x4
+  };
+
+  ~ConverterObject() override {}
+
+  // args[0]: converter label. Returns true when ucnv_open() succeeds for
+  // that label, false otherwise.
+  static void Has(const FunctionCallbackInfo<Value>& args) {
+    Environment* env = Environment::GetCurrent(args);
+    HandleScope scope(env->isolate());
+
+    CHECK_GE(args.Length(), 1);
+    Utf8Value label(env->isolate(), args[0]);
+
+    UErrorCode status = U_ZERO_ERROR;
+    UConverter* conv = ucnv_open(*label, &status);
+    args.GetReturnValue().Set(!!U_SUCCESS(status));
+    ucnv_close(conv);
+  }
+
+  // args[0]: converter label, args[1]: ConverterFlags bit set.
+  // Returns a new object wrapping the opened converter, or leaves the
+  // return value unset (undefined in JavaScript) when ucnv_open() fails.
+  static void Create(const FunctionCallbackInfo<Value>& args) {
+    Environment* env = Environment::GetCurrent(args);
+    HandleScope scope(env->isolate());
+
+    CHECK_GE(args.Length(), 2);
+    Utf8Value label(env->isolate(), args[0]);
+    int flags = args[1]->Uint32Value(env->context()).ToChecked();
+    bool fatal =
+        (flags & CONVERTER_FLAGS_FATAL) == CONVERTER_FLAGS_FATAL;
+    bool ignoreBOM =
+        (flags & CONVERTER_FLAGS_IGNORE_BOM) == CONVERTER_FLAGS_IGNORE_BOM;
+
+    UErrorCode status = U_ZERO_ERROR;
+    UConverter* conv = ucnv_open(*label, &status);
+    if (U_FAILURE(status))
+      return;
+
+    if (fatal) {
+      // Stop on malformed input instead of emitting substitution characters.
+      status = U_ZERO_ERROR;
+      ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP,
+                          nullptr, nullptr, nullptr, &status);
+    }
+
+    Local<ObjectTemplate> t = ObjectTemplate::New(env->isolate());
+    t->SetInternalFieldCount(1);
+    Local<Object> obj = t->NewInstance(env->context()).ToLocalChecked();
+    new ConverterObject(env, obj, conv, ignoreBOM);
+    args.GetReturnValue().Set(obj);
+  }
+
+  // args[0]: object returned by Create(), args[1]: buffer with the encoded
+  // bytes, args[2]: ConverterFlags bit set.
+  // On success the return value is whatever ToBufferEndian() produces for
+  // the decoded UChars; on failure it is the numeric ICU status code.
+  static void Decode(const FunctionCallbackInfo<Value>& args) {
+    Environment* env = Environment::GetCurrent(args);
+
+    CHECK_GE(args.Length(), 3);  // Converter, Buffer, Flags
+
+    ConverterObject* converter;
+    ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());
+    SPREAD_BUFFER_ARG(args[1], input_obj);
+    int flags = args[2]->Uint32Value(env->context()).ToChecked();
+
+    UErrorCode status = U_ZERO_ERROR;
+    MaybeStackBuffer<UChar> result;
+    MaybeLocal<Object> ret;
+    // Output capacity in UChars. The same value sizes the allocation and
+    // bounds ucnv_toUnicode() below, so ICU can never write past `result`.
+    size_t limit = 2 * ucnv_getMinCharSize(converter->conv) *
+                   input_obj_length;
+    if (limit > 0)
+      result.AllocateSufficientStorage(limit);
+
+    UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;
+
+    const char* source = input_obj_data;
+    size_t source_length = input_obj_length;
+
+    // Skip a leading byte order mark once per stream, unless the caller
+    // asked for it to be kept.
+    if (converter->unicode_ && !converter->ignoreBOM_ && !converter->bomSeen_) {
+      int32_t bomOffset = 0;
+      ucnv_detectUnicodeSignature(source, source_length, &bomOffset, &status);
+      source += bomOffset;
+      source_length -= bomOffset;
+      converter->bomSeen_ = true;
+    }
+
+    UChar* target = *result;
+    ucnv_toUnicode(converter->conv,
+                   &target, target + limit,
+                   &source, source + source_length,
+                   NULL, flush, &status);
+
+    if (U_SUCCESS(status)) {
+      if (limit > 0)
+        result.SetLength(target - &result[0]);
+      ret = ToBufferEndian(env, &result);
+      args.GetReturnValue().Set(ret.ToLocalChecked());
+    } else {
+      args.GetReturnValue().Set(status);
+    }
+
+    if (flush) {
+      // Reset the converter state
+      converter->bomSeen_ = false;
+      ucnv_reset(converter->conv);
+    }
+  }
+
+ protected:
+  ConverterObject(Environment* env,
+                  v8::Local<v8::Object> wrap,
+                  UConverter* converter,
+                  bool ignoreBOM,
+                  const char* sub = NULL) :
+                  BaseObject(env, wrap),
+                  Converter(converter, sub),
+                  ignoreBOM_(ignoreBOM) {
+    MakeWeak(this);
+
+    switch (ucnv_getType(converter)) {
+      case UCNV_UTF8:
+      case UCNV_UTF16_BigEndian:
+      case UCNV_UTF16_LittleEndian:
+        unicode_ = true;
+        break;
+      default:
+        unicode_ = false;
+    }
+  }
+
+ private:
+  bool unicode_ = false;     // True if this is a Unicode converter
+  bool ignoreBOM_ = false;   // True if the BOM should be ignored on Unicode
+  bool bomSeen_ = false;     // True if the BOM has been seen
+};
+
 // One-Shot Converters void CopySourceBuffer(MaybeStackBuffer* dest, @@ -717,6 +883,11 @@ void Init(Local target, // One-shot converters env->SetMethod(target, "icuErrName", ICUErrorName); env->SetMethod(target, "transcode", Transcode); + + // ConverterObject + env->SetMethod(target, "getConverter", ConverterObject::Create); + env->SetMethod(target, "decode", ConverterObject::Decode); + env->SetMethod(target, "hasConverter", ConverterObject::Has); } } // namespace i18n diff --git a/src/node_i18n.h b/src/node_i18n.h index adf9feb414df5c..f7801ce6668468 100644 --- a/src/node_i18n.h +++ b/src/node_i18n.h @@ -25,6 +25,7 @@ #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS #include "node.h" +#include #include #if defined(NODE_HAVE_I18N_SUPPORT) diff --git a/src/node_util.cc b/src/node_util.cc index 50de94bfb2bf3a..c1dff77386d927 100644 --- a/src/node_util.cc +++ b/src/node_util.cc @@ -21,6 +21,7 @@ using v8::Value; #define VALUE_METHOD_MAP(V) \ + V(isArrayBuffer, IsArrayBuffer) \ V(isAsyncFunction, IsAsyncFunction) \ V(isDataView, IsDataView) \ V(isDate, IsDate) \ diff --git a/test/parallel/test-whatwg-encoding.js b/test/parallel/test-whatwg-encoding.js new file mode 100644 index 00000000000000..c181df860ca149 --- /dev/null +++ b/test/parallel/test-whatwg-encoding.js @@ -0,0 +1,385 @@ +// Flags: --expose-internals +'use strict'; + +const common = require('../common'); +const assert = require('assert'); +const { TextEncoder, TextDecoder } = require('util'); +const { customInspectSymbol: inspect } = require('internal/util'); +const { getEncodingFromLabel } = require('internal/encoding'); + +const encoded = Buffer.from([0xef, 0xbb, 0xbf, 0x74, 0x65, + 0x73, 0x74, 0xe2, 0x82, 0xac]); + +if (!common.hasIntl) { + common.skip('WHATWG Encoding tests because ICU is not present.'); +} + +// Make Sure TextDecoder and TextEncoder exist +assert(TextDecoder); +assert(TextEncoder); + +// Test TextEncoder +const enc = new TextEncoder(); +assert(enc); +const buf = enc.encode('\ufefftest€'); + +assert.strictEqual(Buffer.compare(buf, encoded), 0); + + +// Test TextDecoder, UTF-8, fatal: false, ignoreBOM: 
false
+{
+  // One-shot decode: the leading BOM is dropped from the result.
+  ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => {
+    const dec = new TextDecoder(i);
+    const res = dec.decode(buf);
+    assert.strictEqual(res, 'test€');
+  });
+
+  // Streaming decode: the split at byte 8 falls inside the three-byte
+  // sequence for '€', so the first call must keep the partial sequence.
+  ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => {
+    const dec = new TextDecoder(i);
+    let res = '';
+    res += dec.decode(buf.slice(0, 8), { stream: true });
+    res += dec.decode(buf.slice(8));
+    assert.strictEqual(res, 'test€');
+  });
+}
+
+// Test TextDecoder, UTF-8, fatal: false, ignoreBOM: true
+{
+  // With ignoreBOM the BOM is kept in the decoded string.
+  ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => {
+    const dec = new TextDecoder(i, { ignoreBOM: true });
+    const res = dec.decode(buf);
+    assert.strictEqual(res, '\ufefftest€');
+  });
+
+  ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => {
+    const dec = new TextDecoder(i, { ignoreBOM: true });
+    let res = '';
+    res += dec.decode(buf.slice(0, 8), { stream: true });
+    res += dec.decode(buf.slice(8));
+    assert.strictEqual(res, '\ufefftest€');
+  });
+}
+
+// Test TextDecoder, UTF-8, fatal: true, ignoreBOM: false
+{
+  // A truncated sequence decoded without `stream` must throw in fatal mode.
+  ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => {
+    const dec = new TextDecoder(i, { fatal: true });
+    assert.throws(() => dec.decode(buf.slice(0, 8)),
+                  common.expectsError({
+                    code: 'ERR_ENCODING_INVALID_ENCODED_DATA',
+                    type: TypeError,
+                    message:
+                      /^The encoded data was not valid for encoding utf-8$/
+                  }));
+  });
+
+  // The same split is fine when the first chunk is decoded with `stream`.
+  ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => {
+    const dec = new TextDecoder(i, { fatal: true });
+    assert.doesNotThrow(() => dec.decode(buf.slice(0, 8), { stream: true }));
+    assert.doesNotThrow(() => dec.decode(buf.slice(8)));
+  });
+}
+
+// Test TextDecoder, UTF-16le
+{
+  const dec = new TextDecoder('utf-16le');
+  const res = dec.decode(Buffer.from('test€', 'utf-16le'));
+  assert.strictEqual(res, 'test€');
+}
+
+// Test TextDecoder, UTF-16be
+{
+  const dec = new TextDecoder('utf-16be');
+  const res = dec.decode(Buffer.from([0x00, 0x74, 0x00, 0x65, 0x00,
+                                      0x73, 0x00, 0x74, 0x20, 0xac]));
+  assert.strictEqual(res, 'test€');
+}
+
+// Test that the TextDecoder inspect hook accepts a real TextDecoder and
+// rejects every other receiver with ERR_INVALID_THIS.
+{
+  const fn = TextDecoder.prototype[inspect];
+  fn.call(new TextDecoder(), Infinity, {});
+
+  [{}, [], true, 1, '', new TextEncoder()].forEach((i) => {
+    assert.throws(() => fn.call(i, Infinity, {}),
+                  common.expectsError({
+                    code: 'ERR_INVALID_THIS',
+                    message: 'Value of "this" must be of type TextDecoder'
+                  }));
+  });
+}
+
+// Test that the TextEncoder inspect hook accepts a real TextEncoder and
+// rejects every other receiver with ERR_INVALID_THIS.
+{
+  const fn = TextEncoder.prototype[inspect];
+  fn.call(new TextEncoder(), Infinity, {});
+
+  [{}, [], true, 1, '', new TextDecoder()].forEach((i) => {
+    assert.throws(() => fn.call(i, Infinity, {}),
+                  common.expectsError({
+                    code: 'ERR_INVALID_THIS',
+                    message: 'Value of "this" must be of type TextEncoder'
+                  }));
+  });
+}
+
+// Test Encoding Mappings
+{
+
+  // Keys are encoding names; each value lists additional labels that
+  // getEncodingFromLabel() must resolve to that name.
+  const mappings = {
+    'utf-8': [
+      'unicode-1-1-utf-8',
+      'utf8'
+    ],
+    'utf-16be': [],
+    'utf-16le': [
+      'utf-16'
+    ],
+    'ibm866': [
+      '866',
+      'cp866',
+      'csibm866'
+    ],
+    'iso-8859-2': [
+      'csisolatin2',
+      'iso-ir-101',
+      'iso8859-2',
+      'iso88592',
+      'iso_8859-2',
+      'iso_8859-2:1987',
+      'l2',
+      'latin2'
+    ],
+    'iso-8859-3': [
+      'csisolatin3',
+      'iso-ir-109',
+      'iso8859-3',
+      'iso88593',
+      'iso_8859-3',
+      'iso_8859-3:1988',
+      'l3',
+      'latin3'
+    ],
+    'iso-8859-4': [
+      'csisolatin4',
+      'iso-ir-110',
+      'iso8859-4',
+      'iso88594',
+      'iso_8859-4',
+      'iso_8859-4:1988',
+      'l4',
+      'latin4'
+    ],
+    'iso-8859-5': [
+      'csisolatincyrillic',
+      'cyrillic',
+      'iso-ir-144',
+      'iso8859-5',
+      'iso88595',
+      'iso_8859-5',
+      'iso_8859-5:1988'
+    ],
+    'iso-8859-6': [
+      'arabic',
+      'asmo-708',
+      'csiso88596e',
+      'csiso88596i',
+      'csisolatinarabic',
+      'ecma-114',
+      'iso-8859-6-e',
+      'iso-8859-6-i',
+      'iso-ir-127',
+      'iso8859-6',
+      'iso88596',
+      'iso_8859-6',
+      'iso_8859-6:1987'
+    ],
+    'iso-8859-7': [
+      'csisolatingreek',
+      'ecma-118',
+      'elot_928',
+      'greek',
+      'greek8',
+      'iso-ir-126',
+      'iso8859-7',
+      'iso88597',
+      'iso_8859-7',
+      'iso_8859-7:1987',
+      'sun_eu_greek'
+    ],
+    'iso-8859-8': [
+      'csiso88598e',
+      'csisolatinhebrew',
+      'hebrew',
+      'iso-8859-8-e',
+      'iso-ir-138',
+      'iso8859-8',
+      'iso88598',
+      'iso_8859-8',
+      'iso_8859-8:1988',
+      'visual'
+    ],
+    'iso-8859-8-i': [
+      'csiso88598i',
+      'logical'
+    ],
+    'iso-8859-10': [
+      'csisolatin6',
+      'iso-ir-157',
+      'iso8859-10',
+      'iso885910',
+      'l6',
+      'latin6'
+    ],
+    'iso-8859-13': [
+      'iso8859-13',
+      'iso885913'
+    ],
+    'iso-8859-14': [
+      'iso8859-14',
+      'iso885914'
+    ],
+    'iso-8859-15': [
+      'csisolatin9',
+      'iso8859-15',
+      'iso885915',
+      'iso_8859-15',
+      'l9'
+    ],
+    'koi8-r': [
+      'cskoi8r',
+      'koi',
+      'koi8',
+      'koi8_r'
+    ],
+    'koi8-u': [
+      'koi8-ru'
+    ],
+    'macintosh': [
+      'csmacintosh',
+      'mac',
+      'x-mac-roman'
+    ],
+    'windows-874': [
+      'dos-874',
+      'iso-8859-11',
+      'iso8859-11',
+      'iso885911',
+      'tis-620'
+    ],
+    'windows-1250': [
+      'cp1250',
+      'x-cp1250'
+    ],
+    'windows-1251': [
+      'cp1251',
+      'x-cp1251'
+    ],
+    'windows-1252': [
+      'ansi_x3.4-1968',
+      'ascii',
+      'cp1252',
+      'cp819',
+      'csisolatin1',
+      'ibm819',
+      'iso-8859-1',
+      'iso-ir-100',
+      'iso8859-1',
+      'iso88591',
+      'iso_8859-1',
+      'iso_8859-1:1987',
+      'l1',
+      'latin1',
+      'us-ascii',
+      'x-cp1252'
+    ],
+    'windows-1253': [
+      'cp1253',
+      'x-cp1253'
+    ],
+    'windows-1254': [
+      'cp1254',
+      'csisolatin5',
+      'iso-8859-9',
+      'iso-ir-148',
+      'iso8859-9',
+      'iso88599',
+      'iso_8859-9',
+      'iso_8859-9:1989',
+      'l5',
+      'latin5',
+      'x-cp1254'
+    ],
+    'windows-1255': [
+      'cp1255',
+      'x-cp1255'
+    ],
+    'windows-1256': [
+      'cp1256',
+      'x-cp1256'
+    ],
+    'windows-1257': [
+      'cp1257',
+      'x-cp1257'
+    ],
+    'windows-1258': [
+      'cp1258',
+      'x-cp1258'
+    ],
+    'x-mac-cyrillic': [
+      'x-mac-ukrainian'
+    ],
+    'gbk': [
+      'chinese',
+      'csgb2312',
+      'csiso58gb231280',
+      'gb2312',
+      'gb_2312',
+      'gb_2312-80',
+      'iso-ir-58',
+      'x-gbk'
+    ],
+    'gb18030': [ ],
+    'big5': [
+      'big5-hkscs',
+      'cn-big5',
+      'csbig5',
+      'x-x-big5'
+    ],
+    'euc-jp': [
+      'cseucpkdfmtjapanese',
+      'x-euc-jp'
+    ],
+    'iso-2022-jp': [
+      'csiso2022jp'
+    ],
+    'shift_jis': [
+      'csshiftjis',
+      'ms932',
+      'ms_kanji',
+      'shift-jis',
+      'sjis',
+      'windows-31j',
+      'x-sjis'
+    ],
+    // The first two labels carry surrounding ASCII whitespace and mixed
+    // case, which the lookup is expected to normalize.
+    'euc-kr': [
+      ' euc-kr \t',
+      'EUC-kr \n',
+      'cseuckr',
+      'csksc56011987',
+      'iso-ir-149',
+      'korean',
+      'ks_c_5601-1987',
+      'ks_c_5601-1989',
+      'ksc5601',
+      'ksc_5601',
+      'windows-949'
+    ]
+  };
+  // Every name must map to itself and every listed label to its name.
+  Object.entries(mappings).forEach((i) => {
+    const enc = i[0];
+    const labels = i[1];
+    assert.strictEqual(getEncodingFromLabel(enc), enc);
+    labels.forEach((l) => assert.strictEqual(getEncodingFromLabel(l), enc));
+  });
+
+  // Unknown labels resolve to undefined.
+  assert.strictEqual(getEncodingFromLabel('made-up'), undefined);
+}
diff --git a/tools/icu/icu-generic.gyp b/tools/icu/icu-generic.gyp index 4c2125a0435b02..93d7cd5f6d9f39 100644 --- a/tools/icu/icu-generic.gyp +++ b/tools/icu/icu-generic.gyp @@ -30,15 +30,6 @@ 'type': 'none', 'toolsets': [ 'host', 'target' ], 'direct_dependent_settings': { - 'conditions': [ - [ 'icu_endianness == "l"', { - 'defines': [ - # ICU cannot swap the initial data without this. - # http://bugs.icu-project.org/trac/ticket/11046 - 'UCONFIG_NO_LEGACY_CONVERSION=1' - ], - }], - ], 'defines': [ 'UCONFIG_NO_SERVICE=1', 'UCONFIG_NO_REGULAR_EXPRESSIONS=1',