Skip to content

Commit

Permalink
feat(encoding): support BOM detection with test passed (#6074)
Browse files Browse the repository at this point in the history
  • Loading branch information
WingLim authored Oct 3, 2023
1 parent 4f1710d commit 476fa4d
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 9 deletions.
18 changes: 17 additions & 1 deletion src/bun.js/bindings/ZigGeneratedClasses.cpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions src/bun.js/bindings/generated_classes.zig
Original file line number Diff line number Diff line change
Expand Up @@ -6874,13 +6874,17 @@ pub const JSTextDecoder = struct {
if (@TypeOf(TextDecoder.getFatal) != GetterType)
@compileLog("Expected TextDecoder.getFatal to be a getter");

if (@TypeOf(TextDecoder.getIgnoreBOM) != GetterType)
@compileLog("Expected TextDecoder.getIgnoreBOM to be a getter");

if (!JSC.is_bindgen) {
@export(TextDecoder.constructor, .{ .name = "TextDecoderClass__construct" });
@export(TextDecoder.decode, .{ .name = "TextDecoderPrototype__decode" });
@export(TextDecoder.decodeWithoutTypeChecks, .{ .name = "TextDecoderPrototype__decodeWithoutTypeChecks" });
@export(TextDecoder.finalize, .{ .name = "TextDecoderClass__finalize" });
@export(TextDecoder.getEncoding, .{ .name = "TextDecoderPrototype__getEncoding" });
@export(TextDecoder.getFatal, .{ .name = "TextDecoderPrototype__getFatal" });
@export(TextDecoder.getIgnoreBOM, .{ .name = "TextDecoderPrototype__getIgnoreBOM" });
}
}
};
Expand Down
3 changes: 3 additions & 0 deletions src/bun.js/webcore/encoding.classes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ export default [
fatal: {
getter: "getFatal",
},
ignoreBOM: {
getter: "getIgnoreBOM",
},

decode: {
fn: "decode",
Expand Down
29 changes: 23 additions & 6 deletions src/bun.js/webcore/encoding.zig
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,13 @@ pub const TextDecoder = struct {
remainder = remainder[1..];
continue;
},
// BOM handling
0xFEFF => {
buffer.ensureTotalCapacity(allocator, 1) catch unreachable;
buffer.items.ptr[buffer.items.len] = remainder[0];
buffer.items.len += 1;
remainder = remainder[1..];
},

// Is this an unpaired low surrogate or four-digit hex escape?
else => {
Expand Down Expand Up @@ -629,8 +636,13 @@ pub const TextDecoder = struct {
},
EncodingLabel.@"UTF-8" => {
const toUTF16 = if (stream) strings.toUTF16Alloc else strings.toUTF16AllocNoTrim;
// Skip a leading UTF-8 byte order mark (EF BB BF) unless the decoder was
// created with `ignoreBOM: true`. The length check is `>= 3` (not `> 3`) so
// that an input consisting of nothing but the BOM is stripped as well;
// three bytes are all that `buffer_slice[0..3]` needs to be in bounds.
const moved_buffer_slice_8 = if (!this.ignore_bom and buffer_slice.len >= 3 and std.mem.eql(u8, &[_]u8{ '\xEF', '\xBB', '\xBF' }, buffer_slice[0..3]))
    buffer_slice[3..]
else
    buffer_slice;

if (this.fatal) {
if (toUTF16(default_allocator, buffer_slice, true)) |result_| {
if (toUTF16(default_allocator, moved_buffer_slice_8, true)) |result_| {
if (result_) |result| {
return ZigString.toExternalU16(result.ptr, result.len, globalThis);
}
Expand All @@ -649,7 +661,7 @@ pub const TextDecoder = struct {
}
}
} else {
if (toUTF16(default_allocator, buffer_slice, false)) |result_| {
if (toUTF16(default_allocator, moved_buffer_slice_8, false)) |result_| {
if (result_) |result| {
return ZigString.toExternalU16(result.ptr, result.len, globalThis);
}
Expand All @@ -664,15 +676,20 @@ pub const TextDecoder = struct {
}

// Experiment: using mimalloc directly is slightly slower
return ZigString.init(buffer_slice).toValueGC(globalThis);
return ZigString.init(moved_buffer_slice_8).toValueGC(globalThis);
},

EncodingLabel.@"UTF-16LE" => {
if (std.mem.isAligned(@intFromPtr(buffer_slice.ptr), @alignOf([*]const u16))) {
return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, buffer_slice))), globalThis);
// Skip a leading UTF-16LE byte order mark (FF FE) unless the decoder was
// created with `ignoreBOM: true`. The length check is `>= 2` (not `> 2`) so
// that an input consisting of nothing but the BOM is stripped as well;
// two bytes are all that `buffer_slice[0..2]` needs to be in bounds.
const moved_buffer_slice_16 = if (!this.ignore_bom and buffer_slice.len >= 2 and std.mem.eql(u8, &[_]u8{ '\xFF', '\xFE' }, buffer_slice[0..2]))
    buffer_slice[2..]
else
    buffer_slice;

if (std.mem.isAligned(@intFromPtr(moved_buffer_slice_16.ptr), @alignOf([*]const u16))) {
return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, moved_buffer_slice_16))), globalThis);
}

return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, buffer_slice), globalThis);
return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, moved_buffer_slice_16), globalThis);
},
else => {
globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{});
Expand Down
24 changes: 23 additions & 1 deletion test/js/web/encoding/text-decoder.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ describe("TextDecoder", () => {
it("constructor should set values", () => {
const decoder = new TextDecoder("utf-8", { fatal: true, ignoreBOM: false });
expect(decoder.fatal).toBe(true);
// expect(decoder.ignoreBOM).toBe(false); // currently the getter for ignoreBOM doesn't work and always returns undefined
expect(decoder.ignoreBOM).toBe(false);
});

it("should throw on invalid input", () => {
Expand All @@ -265,6 +265,28 @@ describe("TextDecoder", () => {
});
});

describe("TextDecoder ignoreBOM", () => {
  // Each fixture is "abc" in the given encoding, prefixed with that encoding's byte order mark.
  const fixtures = [
    { encoding: "utf-8", bytes: [0xef, 0xbb, 0xbf, 0x61, 0x62, 0x63] },
    { encoding: "utf-16le", bytes: [0xff, 0xfe, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00] },
  ];

  it.each(fixtures)("should ignoreBOM for: %o", ({ encoding, bytes }) => {
    const input = Uint8Array.from(bytes);
    const decodeWith = ignoreBOM => new TextDecoder(encoding, { ignoreBOM }).decode(input);

    // ignoreBOM: true leaves the BOM in the decoded string as U+FEFF.
    expect(decodeWith(true)).toStrictEqual("\uFEFFabc");

    // ignoreBOM: false strips the BOM from the output.
    expect(decodeWith(false)).toStrictEqual("abc");
  });
});

it("truncated sequences", () => {
const assert_equals = (a, b) => expect(a).toBe(b);

Expand Down
2 changes: 1 addition & 1 deletion test/js/web/encoding/text-encoder.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ describe("TextEncoder", () => {
const fixture = new Uint8Array(await Bun.file(import.meta.dir + "/utf8-encoding-fixture.bin").arrayBuffer());
const length = 0x110000;
let textEncoder = new TextEncoder();
let textDecoder = new TextDecoder();
let textDecoder = new TextDecoder("utf-8", { ignoreBOM: true });
let encodeOut = new Uint8Array(length * 4);
let encodeIntoOut = new Uint8Array(length * 4);
let encodeIntoBuffer = new Uint8Array(4);
Expand Down

0 comments on commit 476fa4d

Please sign in to comment.