Skip to content

Commit

Permalink
add support for special unicode chars
Browse files Browse the repository at this point in the history
  • Loading branch information
mruoss committed Nov 17, 2023
1 parent 42c7f64 commit abb41dd
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 21 deletions.
33 changes: 28 additions & 5 deletions lib/ymlr/encode.ex
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ defmodule Ymlr.Encode do
":"
]

# Characters that have a short backslash escape inside a double-quoted scalar,
# zipped with the character that follows the backslash in the output.
@escape_chars ~c"\b\f\r\v\0\"\\"
@escape_char_mapping Enum.zip(@escape_chars, ~c"bfrv0\"\\")

# Code points in 0x00..0x1F and 0x7F..0xFF that are written as `\uXXXX`.
# Newline, tab and everything in @escape_chars are excluded.
@unicode_char_mapping Enum.reject(
                        Enum.to_list(0x00..0x1F) ++ Enum.to_list(0x7F..0xFF),
                        &(&1 in (~c"\n\t" ++ @escape_chars))
                      )

# Substrings whose presence forces a double-quoted scalar.
# Each code point is encoded as UTF-8 (`::utf8`) rather than as a raw byte:
# raw bytes 0x80..0xFF are not valid UTF-8 on their own and would also match
# the continuation bytes of unrelated multi-byte characters (e.g. "€").
@require_double_quotes Enum.map(~c"\b\f\r\v\0" ++ @unicode_char_mapping, &<<&1::utf8>>)

@doc ~S"""
Encodes the given data as YAML string. Raises if it cannot be encoded.
Expand Down Expand Up @@ -146,6 +151,7 @@ defmodule Ymlr.Encode do
defp encode_binary(data, indent_level) do
cond do
data == "" -> ~S('')
data == "~" -> ~S('~')
data == "\n" -> ~S("\n")
data == "null" -> ~S('null')
data == "yes" -> ~S('yes')
Expand All @@ -155,6 +161,7 @@ defmodule Ymlr.Encode do
data == "True" -> ~S('True')
data == "False" -> ~S('False')
String.contains?(data, "\n") -> multiline(data, indent_level)
String.contains?(data, @require_double_quotes) -> with_double_quotes(data)
String.at(data, 0) in @quote_when_first -> with_quotes(data)
String.at(data, -1) in @quote_when_last -> with_quotes(data)
String.starts_with?(data, "- ") -> with_quotes(data)
Expand Down Expand Up @@ -186,18 +193,34 @@ defmodule Ymlr.Encode do

# Quotes `data`, preferring single quotes.
#
# Single quotes are used unless `data` itself contains a single quote; in that
# case the value is double-quoted (and escaped) instead.
defp with_quotes(data) do
  if String.contains?(data, "'") do
    with_double_quotes(data)
  else
    with_single_quotes(data)
  end
end
# Escapes `data` via `escape/1` and wraps the result in double quotes.
defp with_double_quotes(data) do
  escaped = escape(data)
  "\"#{escaped}\""
end

# Wraps `data` in single quotes without any escaping.
defp with_single_quotes(data) do
  "'#{data}'"
end

# Escapes `data` for use inside a double-quoted scalar.
#
# Walks the string one UTF-8 code point at a time and maps each code point
# through `escape_char/1`. The result is a list whose elements are whatever
# `escape_char/1` returns for each code point; callers turn it into a string
# by interpolating it.
defp escape(data) do
  for <<char::utf8 <- data>> do
    escape_char(char)
  end
end

# Short escapes: one clause per entry of @escape_char_mapping, returning a
# backslash followed by the mapped character.
for {codepoint, mnemonic} <- @escape_char_mapping do
  defp escape_char(unquote(codepoint)), do: <<?\\, unquote(mnemonic)>>
end

# Unicode escapes: one clause per entry of @unicode_char_mapping, returning
# `\u` followed by the code point as four zero-padded, upper-case hex digits.
for codepoint <- @unicode_char_mapping do
  hex = codepoint |> Integer.to_string(16) |> String.pad_leading(4, "0")
  defp escape_char(unquote(codepoint)), do: unquote("\\u" <> hex)
end

# Fallback (must stay last): any other code point is returned unchanged.
defp escape_char(codepoint), do: codepoint

# for example for map keys
defp multiline(data, nil), do: inspect(data)
# see https://yaml-multiline.info/
Expand Down
24 changes: 8 additions & 16 deletions test/ymlr/encode_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ defmodule Ymlr.EncodeTest do
test "quoted strings - avoid type confusion" do
assert_identity_and_output("yes", ~S('yes'))
assert_identity_and_output("no", ~S('no'))
assert_identity_and_output("~", "'~'")
assert_identity_and_output("true", ~S('true'))
assert_identity_and_output("false", ~S('false'))
assert_identity_and_output("True", ~S('True'))
Expand Down Expand Up @@ -115,27 +116,18 @@ defmodule Ymlr.EncodeTest do
# https://yaml.org/spec/1.2.2/#example-escaped-characters
test "quoted strings - example-escaped-characters from 1.2.2 spec" do
assert_identity_and_output("Fun with \\", "Fun with \\")
end

test "quoted strings - in map key (requires escape char)" do
assert_identity_and_output(%{"a\tb" => "value"}, ~s(a\tb: value))
end

@tag skip: "Identity test fails"
test "Special bytes" do
assert_identity_and_output(%{"a\rb" => "value"}, ~s(a\rb: value))
assert_identity_and_output("\n \r \t \u000b \u0000", "|-\n\n \r \t \v \0")

assert_identity_and_output("\r \t \u000b \u0000", "\"\\r \t \\v \\0\"")
assert_identity_and_output(
"\u0020 \u00a0 \u0085 \u2028 \u2029",
<<32, 32, 194, 160, 32, 194, 133, 32, 226, 128, 168, 32, 226, 128, 169>>
"\" \\u00A0 \\u0085 \u2028 \u2029\""
)
assert_identity_and_output("\" \u0007 \b \u001b \f", "\"\\\" \\u0007 \\b \\u001B \\f\"")
assert_identity_and_output("\r \t \u000b \u0000", "\"\\r \t \\v \\0\"")
end

@tag skip: "YamlElixir throws a parsing Error"
test "Special bytes 2" do
assert_identity_and_output("\" \u0007 \b \u001b \f", "'\" \a \b \e \f'")
assert_identity_and_output("\r \t \u000b \u0000", "'\r \t \v \0'")
# Map keys: a tab inside the key is expected to be emitted as-is (unquoted),
# whereas a carriage return is expected to force a double-quoted key with
# the `\r` escape.
test "quoted strings - in map key (requires escape char)" do
assert_identity_and_output(%{"a\tb" => "value"}, ~s(a\tb: value))
assert_identity_and_output(%{"a\rb" => "value"}, ~s("a\\rb": value))
end

test "newline in map key" do
Expand Down

0 comments on commit abb41dd

Please sign in to comment.