lib/internal/data_url.js

'use strict';

const {
  RegExpPrototypeExec,
  RegExpPrototypeSymbolReplace,
  StringFromCharCodeApply,
  StringPrototypeCharCodeAt,
  StringPrototypeIndexOf,
  StringPrototypeSlice,
  TypedArrayPrototypeSubarray,
  Uint8Array,
} = primordials;

const assert = require('internal/assert');
const { Buffer } = require('buffer');
const { MIMEType } = require('internal/mime');

let encoder;
function lazyEncoder() {
  if (encoder === undefined) {
    const { TextEncoder } = require('internal/encoding');
    encoder = new TextEncoder();
  }

  return encoder;
}

const ASCII_WHITESPACE_REPLACE_REGEX = /[\u0009\u000A\u000C\u000D\u0020]/g // eslint-disable-line

// https://fetch.spec.whatwg.org/#data-url-processor
/** @param {URL} dataURL */
function dataURLProcessor(dataURL) {
  // 1. Assert: dataURL's scheme is "data".
  assert(dataURL.protocol === 'data:');

  // 2. Let input be the result of running the URL
  // serializer on dataURL with exclude fragment
  // set to true.
  let input = URLSerializer(dataURL, true);

  // 3. Remove the leading "data:" string from input.
  input = StringPrototypeSlice(input, 5);

  // 4. Let position point at the start of input.
  const position = { position: 0 };

  // 5. Let mimeType be the result of collecting a
  // sequence of code points that are not equal
  // to U+002C (,), given position.
  let mimeType = collectASequenceOfCodePointsFast(
    ',',
    input,
    position,
  );

  // 6. Strip leading and trailing ASCII whitespace
  // from mimeType.
  // Undici implementation note: we need to store the
  // length because if the mimetype has spaces removed,
  // the wrong amount will be sliced from the input in
  // step #9
  const mimeTypeLength = mimeType.length;
  mimeType = removeASCIIWhitespace(mimeType, true, true);

  // 7. If position is past the end of input, then
  // return failure
  if (position.position >= input.length) {
    return 'failure';
  }

  // 8. Advance position by 1.
  position.position++;

  // 9. Let encodedBody be the remainder of input.
  const encodedBody = StringPrototypeSlice(input, mimeTypeLength + 1);

  // 10. Let body be the percent-decoding of encodedBody.
  let body = stringPercentDecode(encodedBody);

  // 11. If mimeType ends with U+003B (;), followed by
  // zero or more U+0020 SPACE, followed by an ASCII
  // case-insensitive match for "base64", then:
  if (RegExpPrototypeExec(/;(\u0020){0,}base64$/i, mimeType) !== null) {
    // 1. Let stringBody be the isomorphic decode of body.
    const stringBody = isomorphicDecode(body);

    // 2. Set body to the forgiving-base64 decode of
    // stringBody.
    body = forgivingBase64(stringBody);

    // 3. If body is failure, then return failure.
    if (body === 'failure') {
      return 'failure';
    }

    // 4. Remove the last 6 code points from mimeType.
    mimeType = StringPrototypeSlice(mimeType, 0, -6);

    // 5. Remove trailing U+0020 SPACE code points from mimeType,
    // if any.
    mimeType = RegExpPrototypeSymbolReplace(/(\u0020)+$/, mimeType, '');

    // 6. Remove the last U+003B (;) code point from mimeType.
    mimeType = StringPrototypeSlice(mimeType, 0, -1);
  }

  // 12. If mimeType starts with U+003B (;), then prepend
  // "text/plain" to mimeType.
  if (mimeType[0] === ';') {
    mimeType = 'text/plain' + mimeType;
  }

  // 13. Let mimeTypeRecord be the result of parsing
  // mimeType.
  // 14. If mimeTypeRecord is failure, then set
  // mimeTypeRecord to text/plain;charset=US-ASCII.
  let mimeTypeRecord;

  try {
    mimeTypeRecord = new MIMEType(mimeType);
  } catch {
    mimeTypeRecord = new MIMEType('text/plain;charset=US-ASCII');
  }

  // 15. Return a new data: URL struct whose MIME
  // type is mimeTypeRecord and body is body.
  // https://fetch.spec.whatwg.org/#data-url-struct
  return { mimeType: mimeTypeRecord, body };
}

// https://url.spec.whatwg.org/#concept-url-serializer
/**
 * @param {URL} url
 * @param {boolean} excludeFragment
 */
function URLSerializer(url, excludeFragment = false) {
  const { href } = url;

  if (!excludeFragment) {
    return href;
  }

  const hashLength = url.hash.length;
  const serialized = hashLength === 0 ? href : StringPrototypeSlice(href, 0, href.length - hashLength);

  if (!hashLength && href[href.length - 1] === '#') {
    return StringPrototypeSlice(serialized, 0, -1);
  }

  return serialized;
}

/**
 * A faster collectASequenceOfCodePoints that only works when comparing a single character.
 * @param {string} char
 * @param {string} input
 * @param {{ position: number }} position
 */
function collectASequenceOfCodePointsFast(char, input, position) {
  const idx = StringPrototypeIndexOf(input, char, position.position);
  const start = position.position;

  if (idx === -1) {
    position.position = input.length;
    return StringPrototypeSlice(input, start);
  }

  position.position = idx;
  return StringPrototypeSlice(input, start, position.position);
}

// https://url.spec.whatwg.org/#string-percent-decode
/** @param {string} input */
function stringPercentDecode(input) {
  // 1. Let bytes be the UTF-8 encoding of input.
  const bytes = lazyEncoder().encode(input);

  // 2. Return the percent-decoding of bytes.
  return percentDecode(bytes);
}

/**
 * @param {number} byte
 */
function isHexCharByte(byte) {
  // 0-9 A-F a-f
  return (byte >= 0x30 && byte <= 0x39) || (byte >= 0x41 && byte <= 0x46) || (byte >= 0x61 && byte <= 0x66);
}

/**
 * @param {number} byte
 */
function hexByteToNumber(byte) {
  return (
    // 0-9
    byte >= 0x30 && byte <= 0x39 ?
      (byte - 48) :
    // Convert to uppercase
    // ((byte & 0xDF) - 65) + 10
      ((byte & 0xDF) - 55)
  );
}

// https://url.spec.whatwg.org/#percent-decode
/** @param {Uint8Array} input */
function percentDecode(input) {
  const length = input.length;
  // 1. Let output be an empty byte sequence.
  /** @type {Uint8Array} */
  const output = new Uint8Array(length);
  let j = 0;
  // 2. For each byte byte in input:
  for (let i = 0; i < length; ++i) {
    const byte = input[i];

    // 1. If byte is not 0x25 (%), then append byte to output.
    if (byte !== 0x25) {
      output[j++] = byte;

    // 2. Otherwise, if byte is 0x25 (%) and the next two bytes
    // after byte in input are not in the ranges
    // 0x30 (0) to 0x39 (9), 0x41 (A) to 0x46 (F),
    // and 0x61 (a) to 0x66 (f), all inclusive, append byte
    // to output.
    } else if (
      byte === 0x25 &&
      !(isHexCharByte(input[i + 1]) && isHexCharByte(input[i + 2]))
    ) {
      output[j++] = 0x25;

    // 3. Otherwise:
    } else {
      // 1. Let bytePoint be the two bytes after byte in input,
      // decoded, and then interpreted as hexadecimal number.
      // 2. Append a byte whose value is bytePoint to output.
      output[j++] = (hexByteToNumber(input[i + 1]) << 4) | hexByteToNumber(input[i + 2]);

      // 3. Skip the next two bytes in input.
      i += 2;
    }
  }

  // 3. Return output.
  return length === j ? output : TypedArrayPrototypeSubarray(output, 0, j);
}

// https://infra.spec.whatwg.org/#forgiving-base64-decode
/** @param {string} data */
function forgivingBase64(data) {
  // 1. Remove all ASCII whitespace from data.
  data = RegExpPrototypeSymbolReplace(ASCII_WHITESPACE_REPLACE_REGEX, data, '');

  let dataLength = data.length;
  // 2. If data's code point length divides by 4 leaving
  // no remainder, then:
  if (dataLength % 4 === 0) {
    // 1. If data ends with one or two U+003D (=) code points,
    // then remove them from data.
    if (data[dataLength - 1] === '=') {
      --dataLength;
      if (data[dataLength - 1] === '=') {
        --dataLength;
      }
    }
  }

  // 3. If data's code point length divides by 4 leaving
  // a remainder of 1, then return failure.
  if (dataLength % 4 === 1) {
    return 'failure';
  }

  // 4. If data contains a code point that is not one of
  //  U+002B (+)
  //  U+002F (/)
  //  ASCII alphanumeric
  // then return failure.
  if (RegExpPrototypeExec(/[^+/0-9A-Za-z]/, data.length === dataLength ? data : StringPrototypeSlice(data, 0, dataLength)) !== null) {
    return 'failure';
  }

  const buffer = Buffer.from(data, 'base64');
  return new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength);
}

/**
 * @see https://infra.spec.whatwg.org/#ascii-whitespace
 * @param {number} char
 */
function isASCIIWhitespace(char) {
  // "\r\n\t\f "
  return char === 0x00d || char === 0x00a || char === 0x009 || char === 0x00c || char === 0x020;
}

/**
 * @see https://infra.spec.whatwg.org/#strip-leading-and-trailing-ascii-whitespace
 * @param {string} str
 * @param {boolean} [leading=true]
 * @param {boolean} [trailing=true]
 */
function removeASCIIWhitespace(str, leading = true, trailing = true) {
  return removeChars(str, leading, trailing, isASCIIWhitespace);
}

/**
 * @param {string} str
 * @param {boolean} leading
 * @param {boolean} trailing
 * @param {(charCode: number) => boolean} predicate
 */
function removeChars(str, leading, trailing, predicate) {
  let lead = 0;
  let trail = str.length - 1;

  if (leading) {
    while (lead < str.length && predicate(StringPrototypeCharCodeAt(str, lead))) lead++;
  }

  if (trailing) {
    while (trail > 0 && predicate(StringPrototypeCharCodeAt(str, trail))) trail--;
  }

  return lead === 0 && trail === str.length - 1 ? str : StringPrototypeSlice(str, lead, trail + 1);
}

/**
 * @see https://infra.spec.whatwg.org/#isomorphic-decode
 * @param {Uint8Array} input
 * @returns {string}
 */
function isomorphicDecode(input) {
  // 1. To isomorphic decode a byte sequence input, return a string whose code point
  //    length is equal to input's length and whose code points have the same values
  //    as the values of input's bytes, in the same order.
  const length = input.length;
  if ((2 << 15) - 1 > length) {
    return StringFromCharCodeApply(input);
  }
  let result = ''; let i = 0;
  let addition = (2 << 15) - 1;
  while (i < length) {
    if (i + addition > length) {
      addition = length - i;
    }
    result += StringFromCharCodeApply(TypedArrayPrototypeSubarray(input, i, i += addition));
  }
  return result;
}

module.exports = {
  dataURLProcessor,
};