From 6eb4d7c6170bbe28a56be79e52ca10a55c79e1e9 Mon Sep 17 00:00:00 2001 From: Sam Hocevar Date: Wed, 19 Jul 2017 14:52:57 +0200 Subject: [PATCH] Add utf8::bom and utf16::bom helper rules. --- doc/Rule-Reference.md | 14 ++++++++++++++ include/tao/pegtl/utf16.hpp | 1 + include/tao/pegtl/utf8.hpp | 1 + src/test/pegtl/utf16_general.cpp | 3 +++ src/test/pegtl/utf8_general.cpp | 3 +++ 5 files changed, 22 insertions(+) diff --git a/doc/Rule-Reference.md b/doc/Rule-Reference.md index 1b6a26a42..3dce6906d 100644 --- a/doc/Rule-Reference.md +++ b/doc/Rule-Reference.md @@ -573,6 +573,12 @@ A unicode code point is considered *valid* when it is in the range `0` to `0x10f * Equivalent to `seq< one< C1 >, one< C2 >, ... >`. +###### `bom` + +* Succeeds when the input is not empty, and: +* The next 3 bytes are the UTF-8 encoding of character U+FEFF, byte order mark (BOM). +* Equivalent to `one< 0xfeff >`. + ## UTF-16 Rules These rules are in namespace `tao::pegtl::utf16`. @@ -635,6 +641,12 @@ Unaligned memory is no problem on x86 compatible processors; on some other archi * Equivalent to `seq< one< C1 >, one< C2 >, ... >`. +###### `bom` + +* Succeeds when the input is not empty, and: +* The next 2 bytes are the UTF-16 encoding of character U+FEFF, byte order mark (BOM). +* Equivalent to `one< 0xfeff >`. + ## UTF-32 Rules These rules are in namespace `tao::pegtl::utf32`. @@ -708,6 +720,8 @@ Unaligned memory is no problem on x86 compatible processors; on some other archi * [`blank`](#blank) [(ascii rules)](#ascii-rules) * [`bof`](#bof) [(atomic rules)](#atomic-rules) * [`bol`](#bol) [(atomic rules)](#atomic-rules) +* [`bom`](#bom) [(utf-8 rules)](#utf-8-rules) +* [`bom`](#bom-1) [(utf-16 rules)](#utf-16-rules) * [`bytes< Num >`](#bytes-num-) [(atomic rules)](#atomic-rules) * [`control< C, R... >`](#control-c-r-) [(meta rules)](#meta-rules) * [`digit`](#digit) [(ascii rules)](#ascii-rules) diff --git a/include/tao/pegtl/utf16.hpp b/include/tao/pegtl/utf16.hpp index faa7180c8..a3bd61215 100644 --- a/include/tao/pegtl/utf16.hpp +++ b/include/tao/pegtl/utf16.hpp @@ -24,6 +24,7 @@ namespace tao template< char32_t Lo, char32_t Hi > struct range : internal::range< internal::result_on_found::SUCCESS, internal::peek_utf16, Lo, Hi > {}; template< char32_t... Cs > struct ranges : internal::ranges< internal::peek_utf16, Cs... > {}; template< char32_t... Cs > struct string : internal::seq< internal::one< internal::result_on_found::SUCCESS, internal::peek_utf16, Cs >... > {}; + struct bom : one< 0xfeff > {}; // clang-format on } // namespace utf16 diff --git a/include/tao/pegtl/utf8.hpp b/include/tao/pegtl/utf8.hpp index 9ec046eb2..4813f4e1d 100644 --- a/include/tao/pegtl/utf8.hpp +++ b/include/tao/pegtl/utf8.hpp @@ -24,6 +24,7 @@ namespace tao template< char32_t Lo, char32_t Hi > struct range : internal::range< internal::result_on_found::SUCCESS, internal::peek_utf8, Lo, Hi > {}; template< char32_t... Cs > struct ranges : internal::ranges< internal::peek_utf8, Cs... > {}; template< char32_t... Cs > struct string : internal::seq< internal::one< internal::result_on_found::SUCCESS, internal::peek_utf8, Cs >... > {}; + struct bom : one< 0xfeff > {}; // clang-format on } // namespace utf8 diff --git a/src/test/pegtl/utf16_general.cpp b/src/test/pegtl/utf16_general.cpp index 931859bbb..a1e2b7c96 100644 --- a/src/test/pegtl/utf16_general.cpp +++ b/src/test/pegtl/utf16_general.cpp @@ -49,6 +49,9 @@ namespace tao verify_rule< utf16::one< 0x20ac > >( __LINE__, __FILE__, u16s( 0x20ac ), result_type::SUCCESS, 0 ); verify_rule< utf16::one< 0x10437 > >( __LINE__, __FILE__, u16s( 0xd801 ) + u16s( 0xdc37 ), result_type::SUCCESS, 0 ); + verify_rule< utf16::bom >( __LINE__, __FILE__, u16s( 0xfeff ), result_type::SUCCESS, 0 ); + verify_rule< utf16::bom >( __LINE__, __FILE__, u16s( 0xfffe ), result_type::LOCAL_FAILURE, 2 ); + verify_rule< utf16::string< 0x20, 0x20ac, 0x10437 > >( __LINE__, __FILE__, u16s( 0x20 ) + u16s( 0x20ac ) + u16s( 0xd801 ) + u16s( 0xdc37 ) + u16s( 0x20 ), result_type::SUCCESS, 2 ); } diff --git a/src/test/pegtl/utf8_general.cpp b/src/test/pegtl/utf8_general.cpp index 2caad339e..6091b8d72 100644 --- a/src/test/pegtl/utf8_general.cpp +++ b/src/test/pegtl/utf8_general.cpp @@ -252,6 +252,9 @@ namespace tao verify_rule< utf8::one< 0x20ac > >( __LINE__, __FILE__, "\xe2\x82\xac", result_type::SUCCESS, 0 ); verify_rule< utf8::one< 0x10348 > >( __LINE__, __FILE__, "\xf0\x90\x8d\x88", result_type::SUCCESS, 0 ); + verify_rule< utf8::bom >( __LINE__, __FILE__, "\xef\xbb\xbf", result_type::SUCCESS, 0 ); + verify_rule< utf8::bom >( __LINE__, __FILE__, "\xef\xbb\xbf ", result_type::SUCCESS, 1 ); + verify_rule< utf8::string< 0x20, 0xa2, 0x20ac, 0x10348 > >( __LINE__, __FILE__, "\x20\xc2\xa2\xe2\x82\xac\xf0\x90\x8d\x88\x20", result_type::SUCCESS, 1 ); }