diff --git a/BENCHMARK.md b/BENCHMARK.md
index 8a71727..70d8678 100644
--- a/BENCHMARK.md
+++ b/BENCHMARK.md
@@ -1,6 +1,6 @@
Benchmark
-Benchmark run from 2023-11-18 14:03:41.157149Z UTC
+Benchmark run from 2023-11-23 17:53:11.749718Z UTC
## System
@@ -65,20 +65,20 @@ Run Time
Jason |
- 267.76 |
- 3.73 ms |
- ±14.12% |
- 3.45 ms |
- 4.99 ms |
+ 248.88 |
+ 4.02 ms |
+ ±14.75% |
+ 3.81 ms |
+ 5.72 ms |
Ymlr |
- 22.80 |
- 43.87 ms |
- ±2.48% |
- 43.77 ms |
- 48.11 ms |
+ 3.27 |
+ 305.72 ms |
+ ±1.14% |
+ 305.02 ms |
+ 317.72 ms |
@@ -93,14 +93,14 @@ Run Time Comparison
Slower |
Jason |
- 267.76 |
+ 248.88 |
|
Ymlr |
- 22.80 |
- 11.75x |
+ 3.27 |
+ 76.09x |
@@ -122,8 +122,8 @@ Memory Usage
Ymlr |
- 40.60 MB |
- 8.45x |
+ 68.18 MB |
+ 14.19x |
@@ -145,20 +145,20 @@ Run Time
Jason |
- 121.07 |
- 8.26 ms |
- ±14.07% |
- 7.82 ms |
- 13.76 ms |
+ 121.34 |
+ 8.24 ms |
+ ±142.36% |
+ 7.35 ms |
+ 13.61 ms |
Ymlr |
- 22.00 |
- 45.45 ms |
- ±19.87% |
- 45.58 ms |
- 59.34 ms |
+ 21.10 |
+ 47.39 ms |
+ ±62.18% |
+ 43.84 ms |
+ 319.16 ms |
@@ -173,14 +173,14 @@ Run Time Comparison
Slower |
Jason |
- 121.07 |
+ 121.34 |
|
Ymlr |
- 22.00 |
- 5.5x |
+ 21.10 |
+ 5.75x |
@@ -202,7 +202,7 @@ Memory Usage
Ymlr |
- 50.08 MB |
+ 50.09 MB |
5.43x |
@@ -225,20 +225,20 @@ Run Time
Jason |
- 349.85 |
- 2.86 ms |
- ±6.74% |
- 2.81 ms |
- 3.49 ms |
+ 368.32 |
+ 2.72 ms |
+ ±251.44% |
+ 2.57 ms |
+ 3.29 ms |
Ymlr |
- 28.33 |
- 35.30 ms |
- ±4.73% |
- 35.12 ms |
- 45.67 ms |
+ 4.76 |
+ 210.14 ms |
+ ±0.45% |
+ 209.83 ms |
+ 211.96 ms |
@@ -253,14 +253,14 @@ Run Time Comparison
Slower |
Jason |
- 349.85 |
+ 368.32 |
|
Ymlr |
- 28.33 |
- 12.35x |
+ 4.76 |
+ 77.4x |
@@ -282,7 +282,7 @@ Memory Usage
Ymlr |
- 47.48 MB |
- 18.67x |
+ 65.86 MB |
+ 25.89x |
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 971a4cc..3ebe587 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+### Added
+
+- Support for escape and unicode characters [#98](https://github.com/ufirstgroup/ymlr/pull/98)
+
### Added
diff --git a/lib/ymlr/encode.ex b/lib/ymlr/encode.ex
index 9c8ff17..a4c1ee5 100644
--- a/lib/ymlr/encode.ex
+++ b/lib/ymlr/encode.ex
@@ -1,5 +1,67 @@
defmodule Ymlr.Encode do
- @moduledoc false
+ @moduledoc ~S"""
+
+ This module implements the logic of encoding scalars.
+
+ ## Strings and Characters
+
+ ### Printable Characters
+
+ The YAML spec defines a set of printable characters `c-printable` (see
+ https://yaml.org/spec/1.2.2/#character-set). All these characters can
+ theoretically be left alone when encoding a string.
+
+ ### Escape Characters
+
+ The YAML spec also defines a set of escape characters `c-ns-esc-char` (see
+ https://yaml.org/spec/1.2.2/#57-escaped-characters). Some of these chars are
+ also in the printable range `c-printable`. Being in `c-printable` means they
+ could be left alone. I.e. there would be no need to encode them as escape
+ chars. However, we think in certain cases, escape characters are more
+ reader friendly than the actual characters. An example is the "next line"
+ character (`U+0085` or `\N`). It is part of `c-printable`. However, on the
+ screen this character cannot be distinguished from a simple "line feed"
+ character (`U+000A` or `\n`). Therefore all characters in `c-ns-esc-char` with
+ the exception of `\n` and `\t` are always encoded using their escape character.
+
+ ### Other 8-bit Unicode Characters
+
+ Any 8-bit unicode character that is neither a printable nor an escape character
+ has to be encoded using one of the three unicode escape characters \x, \u or
+ \U (i.e. \xXX, \u00XX or \U000000XX).
+
+ ### Double Quotes for Escape Characters
+
+ Printable Characters can be encoded unquoted, single-quoted or double-quoted.
+ Escape characters require double quotes.
+
+ ### Chars with Special Treatments
+
+ #### Chars `\n` and `\t`
+
+ These two characters are never converted to their escape characters.
+ One exception: If the given string is literally just a newline, we
+ encode it as "\n" (double quotes required for escape chars) rather than a
+ single newline.
+
+ #### Chars `"` and `\`
+
+ These two characters have escape characters (`\"` and `\\`) but they are also
+ part of the printable character range `c-printable` and they have a
+ well-defined presentation on the screen. Occurrences of these characters don't
+ enforce double-quotes but if they occur within a string that for other reasons
+ requires double-quotes, they need to be escaped.
+
+ ### Implemented Decision Logic
+
+ First matching rule is applied:
+
+ 1. Char is `\t` or `\n` => leave alone
+ 1. Char is `"` or `\` => if within double quotes, escape. Otherwise leave alone.
+ 1. Char has an escape character (i.e. is part of `c-ns-esc-char`) => force double quotes and encode as escape character
+ 1. Char is a printable character => leave alone
+ 1. Char is a non-printable character => force double quotes and encode as \xXX (8-bit) or \uXXXX (16-bit)
+ """
alias Ymlr.Encoder
@@ -38,6 +100,51 @@ defmodule Ymlr.Encode do
":"
]
+ # Escape chars that, if contained within, force the string to be double-quoted:
+ @escape_chars_forcing_double_quotes ~c"\a\b\e\f\r\v\0\u00a0\u0085\u2028\u2029"
+
+ # Chars that have to be escaped if within double quotes:
+ @escape_if_within_double_quotes @escape_chars_forcing_double_quotes ++ ~c"\"\\"
+
+ # mapping char => escape char.
+ @escape_if_within_double_quotes_mapping Enum.zip(
+ @escape_if_within_double_quotes,
+ ~c"abefrv0_NLP\"\\"
+ )
+
+ # Printable characters (8-bit, BMP and 32-bit ranges):
+ @printable_chars List.flatten([
+ # Tab (\t)
+ 0x09,
+ # Line feed (LF \n)
+ 0x0A,
+ # Carriage Return (CR \r)
+ # 0x0D, theoretically printable, seems to require double quotes.
+ # Next Line (NEL)
+ 0x85,
+ # Printable ASCII
+ Enum.to_list(0x20..0x7E),
+ # Basic Multilingual Plane (BMP)
+ Enum.to_list(0xA0..0xD7FF),
+ Enum.to_list(0xE000..0xFFFD),
+ # 32 bit
+ Enum.to_list(0x010000..0x10FFFF)
+ ])
+
+ @not_supported_by_elixir Enum.to_list(0xD800..0xDFFF)
+
+ # Non-printable characters - all code points minus the union of printable chars, escape chars and code points not supported by Elixir:
+ @non_printable_chars Enum.to_list(0..0x10FFFF) --
+ (@printable_chars ++
+ @escape_if_within_double_quotes ++ @not_supported_by_elixir)
+
+ # Chars that, if contained within, force the string to be double-quoted:
+ @chars_forcing_double_quotes_strings Enum.map(
+ @non_printable_chars ++
+ @escape_chars_forcing_double_quotes,
+ &<<&1::utf8>>
+ )
+
@doc ~S"""
Encodes the given data as YAML string. Raises if it cannot be encoded.
@@ -146,6 +253,7 @@ defmodule Ymlr.Encode do
defp encode_binary(data, indent_level) do
cond do
data == "" -> ~S('')
+ data == "~" -> ~S('~')
data == "\n" -> ~S("\n")
data == "null" -> ~S('null')
data == "yes" -> ~S('yes')
@@ -155,7 +263,7 @@ defmodule Ymlr.Encode do
data == "True" -> ~S('True')
data == "False" -> ~S('False')
String.contains?(data, "\n") -> multiline(data, indent_level)
- String.contains?(data, "\t") -> ~s("#{data}")
+ String.contains?(data, @chars_forcing_double_quotes_strings) -> with_double_quotes(data)
String.at(data, 0) in @quote_when_first -> with_quotes(data)
String.at(data, -1) in @quote_when_last -> with_quotes(data)
String.starts_with?(data, "- ") -> with_quotes(data)
@@ -187,16 +295,41 @@ defmodule Ymlr.Encode do
defp with_quotes(data) do
if String.contains?(data, "'") do
- ~s("#{escape(data)}")
+ with_double_quotes(data)
else
- ~s('#{data}')
+ with_single_quotes(data)
end
end
+ defp with_double_quotes(data) do
+ ~s("#{escape(data)}")
+ end
+
+ defp with_single_quotes(data), do: ~s('#{data}')
+
defp escape(data) do
- data |> String.replace("\\", "\\\\") |> String.replace(~s("), ~s(\\"))
+ for <<char::utf8 <- data>> do
+ escape_char(char)
+ end
+ end
+
+ for {char, escaped} <- @escape_if_within_double_quotes_mapping do
+ defp escape_char(unquote(char)), do: <<?\\, unquote(escaped)>>
end
+ for uchar <- @non_printable_chars do
+ unicode_sequence =
+ case uchar do
+ uchar when uchar <= 0xFF -> List.to_string(:io_lib.format("\\x~2.16.0B", [uchar]))
+ uchar when uchar <= 0xFFFF -> List.to_string(:io_lib.format("\\u~4.16.0B", [uchar]))
+ uchar -> List.to_string(:io_lib.format("\\U~6.16.0B", [uchar]))
+ end
+
+ defp escape_char(unquote(uchar)), do: unquote(unicode_sequence)
+ end
+
+ defp escape_char(char), do: char
+
# for example for map keys
defp multiline(data, nil), do: inspect(data)
# see https://yaml-multiline.info/
diff --git a/test/ymlr/encode_test.exs b/test/ymlr/encode_test.exs
index 5eeb9da..a198484 100644
--- a/test/ymlr/encode_test.exs
+++ b/test/ymlr/encode_test.exs
@@ -15,8 +15,9 @@ defmodule Ymlr.EncodeTest do
assert_identity_and_output("", "''")
end
- test "simple string" do
+ test "plain strings" do
assert_identity_and_output("hello world", "hello world")
+ assert_identity_and_output("that's it", "that's it")
end
# see http://blogs.perl.org/users/tinita/2018/03/strings-in-yaml---to-quote-or-not-to-quote.html
@@ -24,6 +25,7 @@ defmodule Ymlr.EncodeTest do
test "quoted strings - avoid type confusion" do
assert_identity_and_output("yes", ~S('yes'))
assert_identity_and_output("no", ~S('no'))
+ assert_identity_and_output("~", "'~'")
assert_identity_and_output("true", ~S('true'))
assert_identity_and_output("false", ~S('false'))
assert_identity_and_output("True", ~S('True'))
@@ -78,9 +80,9 @@ defmodule Ymlr.EncodeTest do
assert_identity_and_output("some:entry:", ~S('some:entry:'))
end
- test "quoted strings - escape seq forces double quotes (tab char)" do
- assert_identity_and_output("a\tb", ~s("a\tb"))
- assert_identity_and_output("!a\tb", ~s("!a\tb"))
+ test "quoted strings - tab char with and without quotes" do
+ assert_identity_and_output("a\tb", ~s(a\tb))
+ assert_identity_and_output("!a\tb", ~s('!a\tb'))
# Not for explicit backslash:
assert_identity_and_output(~S(!a\tb), ~S('!a\tb'))
end
@@ -89,7 +91,7 @@ defmodule Ymlr.EncodeTest do
# ... (prefer single quotes)
assert_identity_and_output("[]", ~S('[]'))
assert_identity_and_output(~S(["hello"]), ~S('["hello"]'))
- assert_identity_and_output(~S(["he|\o"]), ~S('["he|\o"]'))
+ assert_identity_and_output(~S(["he|\o"]), ~s('["he|\\o"]'))
assert_identity_and_output("{}", ~S('{}'))
assert_identity_and_output("[{}]", ~S('[{}]'))
# ... (use double quotes if string contains single quotes)
@@ -111,27 +113,75 @@ defmodule Ymlr.EncodeTest do
end
end
- @tag skip: "not sure about those => to be reviewed"
# https://yaml.org/spec/1.2.2/#example-escaped-characters
test "quoted strings - example-escaped-characters from 1.2.2 spec" do
- assert_identity_and_output("Fun with \\", ~S("Fun with \\"))
+ assert_identity_and_output(~S(Fun with \\), ~S(Fun with \\))
assert_identity_and_output("\" \u0007 \b \u001b \f", ~S("\" \a \b \e \f"))
- # assert_identity_and_output("\n \r \t \u000b \u0000", ~S("\n \r \t \v \0"))
- # or we use | when string contains newlines => rewrite the example to:
- assert_identity_and_output("\r \t \u000b \u0000", ~S("\r \t \v \0"))
- assert_identity_and_output("\u0020 \u00a0 \u0085 \u2028 \u2029", ~S("\ \_ \N \L \P"))
+ # Line breaks inside scalar content must be normalized by the YAML processor.
+ # Each such line break must be parsed into a single line feed character.
+ # The original line break format is a presentation detail and must not be
+ # used to convey content information.
+ # I.e. the following cannot be tested for identity as \r will be parsed as \n.
+ assert_output("\n\r \t \u000b \u0000", "|-\n\n \r \t \v \0")
+ assert_identity_and_output("\r \t \u000b \u0000", ~s("\\r \t \\v \\0"))
+
+ assert_identity_and_output(
+ "\u0020 \u00a0 \u0085 \u2028 \u2029",
+ ~S(" \_ \N \L \P")
+ )
+
+ # Possible formats: \x13 \u0013 \U00000013.
+ assert_identity_and_output(
+ "\u0013\uFFFD\uFFFE\u{10FFFF}",
+ "\"\\x13\uFFFD\\uFFFE\u{10FFFF}\""
+ )
end
- @tag skip: "not sure about those => review the spec"
test "quoted strings - in map key (requires escape char)" do
- assert_identity_and_output(%{"a\tb" => "value"}, ~s("a\tb": value))
- assert_identity_and_output(%{"a\rb" => "value"}, ~s("a\rb": value))
+ assert_identity_and_output(%{"a\tb" => "value"}, ~s(a\tb: value))
+ assert_identity_and_output(%{"a\rb" => "value"}, ~s("a\\rb": value))
end
test "newline in map key" do
assert_identity_and_output(%{"a\nb" => "value"}, ~S("a\nb": value))
end
+ test "backslash" do
+ # in plain string
+ assert assert_identity_and_output(~S(a\b), ~S(a\b))
+ # in single quote string
+ assert assert_identity_and_output(~S(!a\b), ~S('!a\b'))
+ # double quotes because of single quote
+ assert assert_identity_and_output(~s(!a'b\\c), ~S("!a'b\\c"))
+ # double quotes because of tab
+ assert assert_identity_and_output(~s(a\tb\\c), ~s(a\tb\\c))
+ end
+
+ test "backslash in map key" do
+ # in plain string
+ assert assert_identity_and_output(%{~S(a\b) => "value"}, ~S(a\b: value))
+ # in single quote string
+ assert assert_identity_and_output(%{~S(!a\b) => "value"}, ~S('!a\b': value))
+ # double quotes because of single quote
+ assert assert_identity_and_output(%{~s(a'b\\c) => "value"}, ~s(a'b\\c: value))
+ # double quotes because of tab
+ assert assert_identity_and_output(%{~s(a\tb\\c) => "value"}, ~s(a\tb\\c: value))
+ end
+
+ test "tab" do
+ # would be plain string without the tab
+ assert assert_identity_and_output("a\tb", ~s(a\tb))
+ # would be single quoted string without the tab
+ assert assert_identity_and_output("!a\tb", ~s('!a\tb'))
+ end
+
+ test "tab in map key" do
+ # would be plain string without the tab
+ assert assert_identity_and_output(%{"a\tb" => "value"}, ~s(a\tb: value))
+ # would be single quoted string without the tab
+ assert assert_identity_and_output(%{"!a\tb" => "value"}, ~s('!a\tb': value))
+ end
+
test "integers" do
assert_identity_and_output(1, "1")
end
@@ -275,6 +325,11 @@ defmodule Ymlr.EncodeTest do
end
# see https://yaml-multiline.info/
+ @tag skip: "still buggy"
+ test "multiline strings - starting with spaces" do
+ assert_identity_and_output("\n abc", "|-\n\n abc")
+ assert_identity_and_output(" abc\nabc", "|-\n abc\n abc")
+ end
test "multiline strings - base cases" do
assert_identity_and_output("a\n b\nc", "|-\n a\n b\n c")
@@ -406,6 +461,14 @@ defmodule Ymlr.EncodeTest do
})
end
+ test "tab(s) and newline(s) in the same string" do
+ assert_identity_and_output("a\tb\nc", "|-\n a\tb\n c")
+ # with extra whitespaces around the newline
+ assert_identity_and_output("a\tb \n c", "|-\n a\tb \n c")
+ # with backslash
+ assert_identity_and_output(~s(a\tb\nc\\w), "|-\n a\tb\n c\\w")
+ end
+
test "date" do
assert_output(~D[2016-05-24], "2016-05-24")
end