-
-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Commit
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
require "spec" | ||
|
||
describe "String UTF16" do | ||
describe "to_utf16" do | ||
it "in the range U+0000..U+D7FF" do | ||
encoded = "\u{0}hello\u{d7ff}".to_utf16 | ||
encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]) | ||
end | ||
|
||
it "in the range U+E000 to U+FFFF" do | ||
encoded = "\u{e000}\u{ffff}".to_utf16 | ||
encoded.should eq(Slice[0xe000_u16, 0xffff_u16]) | ||
end | ||
|
||
it "in the range U+10000..U+10FFFF" do | ||
encoded = "\u{10000}\u{10FFFF}".to_utf16 | ||
encoded.should eq(Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16]) | ||
end | ||
|
||
it "in the range U+D800..U+DFFF" do | ||
encoded = "\u{D800}\u{DFFF}".to_utf16 | ||
encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16]) | ||
end | ||
end | ||
|
||
describe "from_utf16" do | ||
it "in the range U+0000..U+D7FF" do | ||
input = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16] | ||
String.from_utf16(input).should eq("\u{0}hello\u{d7ff}") | ||
end | ||
|
||
it "in the range U+E000 to U+FFFF" do | ||
input = Slice[0xe000_u16, 0xffff_u16] | ||
String.from_utf16(input).should eq("\u{e000}\u{ffff}") | ||
end | ||
|
||
it "in the range U+10000..U+10FFFF" do | ||
input = Slice[0xd800_u16, 0xdc00_u16] | ||
String.from_utf16(input).should eq("\u{10000}") | ||
end | ||
|
||
it "in the range U+D800..U+DFFF" do | ||
input = Slice[0xdc00_u16, 0xd800_u16] | ||
String.from_utf16(input).should eq("\u{fffd}\u{fffd}") | ||
end | ||
end | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4272,5 +4272,4 @@ class String | |
end | ||
end | ||
|
||
require "./string/formatter" | ||
require "./string/builder" | ||
require "./string/*" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
class String | ||
# Returns the UTF-16 encoding of the given *string*. | ||
# | ||
# Invalid chars (in the range U+D800..U+DFFF) are encoded with the | ||
# unicode replacement char value `0xfffd`. | ||
# | ||
# ``` | ||
# "hi 𐂥".to_utf16 # => Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16] | ||
# ``` | ||
def to_utf16 : Slice(UInt16) | ||
size = 0 | ||
each_char do |char| | ||
size += char.ord < 0x10000 ? 1 : 2 | ||
end | ||
|
||
slice = Slice(UInt16).new(size) | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
asterite
Author
Member
|
||
|
||
i = 0 | ||
each_char do |char| | ||
ord = char.ord | ||
if ord <= 0xd800 || (0xe000 <= ord < 0x10000) | ||
# One UInt16 is enough | ||
slice[i] = ord.to_u16 | ||
elsif ord >= 0x10000 | ||
# Needs surrogate pair | ||
ord -= 0x10000 | ||
slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits | ||
i += 1 | ||
slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits | ||
else | ||
# Invalid char: use replacement | ||
slice[i] = 0xfffd_u16 | ||
end | ||
i += 1 | ||
end | ||
|
||
slice | ||
end | ||
|
||
# Decodes the given *slice* UTF-16 sequence into a String. | ||
# | ||
# Invalid values are encoded using the unicode replacement char with | ||
# codepoint `0xfffd`. | ||
# | ||
# ``` | ||
# slice = Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16] | ||
# String.from_utf16(slice) # => "hi 𐂥" | ||
# ``` | ||
def self.from_utf16(slice : Slice(UInt16)) : String | ||
bytesize = 0 | ||
size = 0 | ||
|
||
each_utf16_char(slice) do |char| | ||
bytesize += char.bytesize | ||
size += 1 | ||
end | ||
|
||
String.new(bytesize) do |buffer| | ||
each_utf16_char(slice) do |char| | ||
char.each_byte do |byte| | ||
buffer.value = byte | ||
buffer += 1 | ||
end | ||
end | ||
{bytesize, size} | ||
end | ||
end | ||
|
||
# Yields each decoded char in the given slice. | ||
private def self.each_utf16_char(slice : Slice(UInt16)) | ||
i = 0 | ||
while i < slice.size | ||
byte = slice[i].to_i | ||
if byte < 0xd800 || byte >= 0xe000 | ||
# One byte | ||
codepoint = byte | ||
elsif 0xd800 <= byte < 0xdc00 && | ||
(i + 1) < slice.size && | ||
0xdc00 <= slice[i + 1] <= 0xdfff | ||
# Surrougate pair | ||
codepoint = ((byte - 0xd800) << 10) + (slice[i + 1] - 0xdc00) + 0x10000 | ||
i += 1 | ||
else | ||
# Invalid byte | ||
codepoint = 0xfffd | ||
end | ||
|
||
yield codepoint.chr | ||
|
||
i += 1 | ||
end | ||
end | ||
end |
I just realised: we don't make sure that the
Slice
ends with a null byte. Should we make this guarantee for all slices, or just do it manually in this method?