Add String#to_utf16 and String.from_utf16 (#5541)

crystal-lang · Jan 5, 2018 · 50b563c · RX14 · Jan 6, 2018 · asterite
1 parent 5bdd279
commit 50b563c
Show file tree

Hide file tree

Showing 3 changed files with 141 additions and 2 deletions.
diff --git a/spec/std/string/utf16_spec.cr b/spec/std/string/utf16_spec.cr
@@ -0,0 +1,47 @@
+require "spec"
+
+# Specs for String#to_utf16 and String.from_utf16. Each example is named after
+# the code point range (or edge case) it exercises.
+describe "String UTF16" do
+  describe "to_utf16" do
+    it "encodes the empty string" do
+      "".to_utf16.size.should eq(0)
+    end
+
+    it "in the range U+0000..U+D7FF" do
+      encoded = "\u{0}hello\u{d7ff}".to_utf16
+      encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16])
+    end
+
+    it "in the range U+E000 to U+FFFF" do
+      encoded = "\u{e000}\u{ffff}".to_utf16
+      encoded.should eq(Slice[0xe000_u16, 0xffff_u16])
+    end
+
+    it "in the range U+10000..U+10FFFF" do
+      # Both ends of the supplementary range, each as a surrogate pair.
+      encoded = "\u{10000}\u{10FFFF}".to_utf16
+      encoded.should eq(Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16])
+    end
+
+    it "in the range U+D800..U+DFFF" do
+      encoded = "\u{D800}\u{DFFF}".to_utf16
+      encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16])
+    end
+  end
+
+  describe "from_utf16" do
+    it "decodes an empty slice" do
+      String.from_utf16(Slice(UInt16).new(0)).should eq("")
+    end
+
+    it "in the range U+0000..U+D7FF" do
+      input = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]
+      String.from_utf16(input).should eq("\u{0}hello\u{d7ff}")
+    end
+
+    it "in the range U+E000 to U+FFFF" do
+      input = Slice[0xe000_u16, 0xffff_u16]
+      String.from_utf16(input).should eq("\u{e000}\u{ffff}")
+    end
+
+    it "in the range U+10000..U+10FFFF" do
+      input = Slice[0xd800_u16, 0xdc00_u16]
+      String.from_utf16(input).should eq("\u{10000}")
+    end
+
+    it "decodes the last code point U+10FFFF" do
+      input = Slice[0xdbff_u16, 0xdfff_u16]
+      String.from_utf16(input).should eq("\u{10FFFF}")
+    end
+
+    it "in the range U+D800..U+DFFF" do
+      # Low surrogate first, then a high surrogate with nothing after it:
+      # neither forms a pair, so both become the replacement char.
+      input = Slice[0xdc00_u16, 0xd800_u16]
+      String.from_utf16(input).should eq("\u{fffd}\u{fffd}")
+    end
+
+    it "replaces a high surrogate at the end of the input" do
+      input = Slice[0x41_u16, 0xd800_u16]
+      String.from_utf16(input).should eq("A\u{fffd}")
+    end
+
+    it "replaces a high surrogate not followed by a low surrogate" do
+      # The following unit must still be decoded on its own.
+      input = Slice[0xd800_u16, 0x41_u16]
+      String.from_utf16(input).should eq("\u{fffd}A")
+    end
+  end
+end
diff --git a/src/string.cr b/src/string.cr
@@ -4272,5 +4272,4 @@ class String
   end
 end
 
-require "./string/formatter"
-require "./string/builder"
+require "./string/*"
diff --git a/src/string/utf16.cr b/src/string/utf16.cr
@@ -0,0 +1,93 @@
+class String
+  # Returns the UTF-16 encoding of this string as a slice of 16-bit code units.
+  #
+  # Chars below U+10000 take one code unit; chars from U+10000 up are written
+  # as a surrogate pair, high unit first.
+  #
+  # Invalid chars (in the range U+D800..U+DFFF) are encoded with the
+  # unicode replacement char value `0xfffd`.
+  #
+  # ```
+  # "hi 𐂥".to_utf16 # => Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
+  # ```
+  def to_utf16 : Slice(UInt16)
+    # First pass: count the code units so the slice can be sized exactly.
+    size = 0
+    each_char do |char|
+      size += char.ord < 0x10000 ? 1 : 2
+    end
+
+    slice = Slice(UInt16).new(size)
+
+    # Second pass: write the code units.
+    i = 0
+    each_char do |char|
+      ord = char.ord
+      # 0xd800 itself is a surrogate, so the lower range is exclusive.
+      if ord < 0xd800 || (0xe000 <= ord < 0x10000)
+        # One UInt16 is enough
+        slice[i] = ord.to_u16
+      elsif ord >= 0x10000
+        # Needs surrogate pair
+        ord -= 0x10000
+        slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits
+        i += 1
+        slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits
+      else
+        # Invalid char (surrogate range): use replacement
+        slice[i] = 0xfffd_u16
+      end
+      i += 1
+    end
+
+    slice
+  end
+
+  # Decodes the given *slice* UTF-16 sequence into a String.
+  #
+  # Invalid values (a surrogate code unit that is not part of a well-formed
+  # high/low pair) are decoded as the unicode replacement char with
+  # codepoint `0xfffd`.
+  #
+  # ```
+  # slice = Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
+  # String.from_utf16(slice) # => "hi 𐂥"
+  # ```
+  def self.from_utf16(slice : Slice(UInt16)) : String
+    bytesize = 0
+    size = 0
+
+    # First pass: measure the result (UTF-8 bytes and number of chars).
+    each_utf16_char(slice) do |char|
+      bytesize += char.bytesize
+      size += 1
+    end
+
+    # Second pass: write each char's bytes into the string buffer.
+    String.new(bytesize) do |buffer|
+      each_utf16_char(slice) do |char|
+        char.each_byte do |byte|
+          buffer.value = byte
+          buffer += 1
+        end
+      end
+      {bytesize, size}
+    end
+  end
+
+  # Yields each decoded char in the given slice.
+  #
+  # A high surrogate followed by a low surrogate is combined into one char;
+  # any other surrogate code unit yields the replacement char `0xfffd`.
+  private def self.each_utf16_char(slice : Slice(UInt16))
+    i = 0
+    while i < slice.size
+      unit = slice[i].to_i
+      if unit < 0xd800 || unit >= 0xe000
+        # One code unit
+        codepoint = unit
+      elsif unit < 0xdc00 &&
+            (i + 1) < slice.size &&
+            0xdc00 <= slice[i + 1] <= 0xdfff
+        # Surrogate pair: high unit here, low unit right after it
+        codepoint = ((unit - 0xd800) << 10) + (slice[i + 1] - 0xdc00) + 0x10000
+        i += 1 # Skip the low surrogate that was just consumed
+      else
+        # Invalid code unit (unpaired surrogate)
+        codepoint = 0xfffd
+      end
+
+      yield codepoint.chr
+
+      i += 1
+    end
+  end
+end