Skip to content

Commit

Permalink
Merge pull request #65 from lolengine/feature-unicode-bom
Browse files Browse the repository at this point in the history
Add utf8::bom and utf16::bom helper rules.
  • Loading branch information
d-frey authored Jul 19, 2017
2 parents b47152c + 6eb4d7c commit 51b1d4e
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 0 deletions.
14 changes: 14 additions & 0 deletions doc/Rule-Reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,12 @@ A unicode code point is considered *valid* when it is in the range `0` to `0x10f

* Equivalent to `seq< one< C1 >, one< C2 >, ... >`.

###### `bom`

* Succeeds when the input is not empty, and:
* The next 3 bytes are the UTF-8 encoding (`0xEF 0xBB 0xBF`) of the byte order mark (BOM) character, U+FEFF.
* Equivalent to `one< 0xfeff >`.

## UTF-16 Rules

These rules are in namespace `tao::pegtl::utf16`.
Expand Down Expand Up @@ -635,6 +641,12 @@ Unaligned memory is no problem on x86 compatible processors; on some other archi

* Equivalent to `seq< one< C1 >, one< C2 >, ... >`.

###### `bom`

* Succeeds when the input is not empty, and:
* The next 2 bytes are the UTF-16 encoding of the byte order mark (BOM) character, U+FEFF.
* Equivalent to `one< 0xfeff >`.

## UTF-32 Rules

These rules are in namespace `tao::pegtl::utf32`.
Expand Down Expand Up @@ -708,6 +720,8 @@ Unaligned memory is no problem on x86 compatible processors; on some other archi
* [`blank`](#blank) <sup>[(ascii rules)](#ascii-rules)</sup>
* [`bof`](#bof) <sup>[(atomic rules)](#atomic-rules)</sup>
* [`bol`](#bol) <sup>[(atomic rules)](#atomic-rules)</sup>
* [`bom`](#bom) <sup>[(utf-8 rules)](#utf-8-rules)</sup>
* [`bom`](#bom-1) <sup>[(utf-16 rules)](#utf-16-rules)</sup>
* [`bytes< Num >`](#bytes-num-) <sup>[(atomic rules)](#atomic-rules)</sup>
* [`control< C, R... >`](#control-c-r-) <sup>[(meta rules)](#meta-rules)</sup>
* [`digit`](#digit) <sup>[(ascii rules)](#ascii-rules)</sup>
Expand Down
1 change: 1 addition & 0 deletions include/tao/pegtl/utf16.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ namespace tao
// range< Lo, Hi >: thin alias that forwards Lo and Hi to internal::range,
// using result_on_found::SUCCESS and internal::peek_utf16.
// NOTE(review): presumably matches one code point in [ Lo, Hi ] -- confirm against internal::range.
template< char32_t Lo, char32_t Hi > struct range : internal::range< internal::result_on_found::SUCCESS, internal::peek_utf16, Lo, Hi > {};
// ranges< Cs... >: thin alias that forwards the pack Cs... to internal::ranges
// together with internal::peek_utf16.
template< char32_t... Cs > struct ranges : internal::ranges< internal::peek_utf16, Cs... > {};
// string< Cs... >: an internal::seq holding one internal::one (SUCCESS on found,
// internal::peek_utf16) per code point in Cs, in order; the rule reference
// describes it as equivalent to seq< one< C1 >, one< C2 >, ... >.
template< char32_t... Cs > struct string : internal::seq< internal::one< internal::result_on_found::SUCCESS, internal::peek_utf16, Cs >... > {};
// bom: byte order mark helper rule; identical to one< 0xfeff >, i.e. it matches
// the single code point U+FEFF.
struct bom : one< 0xfeff > {};
// clang-format on

} // namespace utf16
Expand Down
1 change: 1 addition & 0 deletions include/tao/pegtl/utf8.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ namespace tao
// range< Lo, Hi >: thin alias that forwards Lo and Hi to internal::range,
// using result_on_found::SUCCESS and internal::peek_utf8.
// NOTE(review): presumably matches one code point in [ Lo, Hi ] -- confirm against internal::range.
template< char32_t Lo, char32_t Hi > struct range : internal::range< internal::result_on_found::SUCCESS, internal::peek_utf8, Lo, Hi > {};
// ranges< Cs... >: thin alias that forwards the pack Cs... to internal::ranges
// together with internal::peek_utf8.
template< char32_t... Cs > struct ranges : internal::ranges< internal::peek_utf8, Cs... > {};
// string< Cs... >: an internal::seq holding one internal::one (SUCCESS on found,
// internal::peek_utf8) per code point in Cs, in order; the rule reference
// describes it as equivalent to seq< one< C1 >, one< C2 >, ... >.
template< char32_t... Cs > struct string : internal::seq< internal::one< internal::result_on_found::SUCCESS, internal::peek_utf8, Cs >... > {};
// bom: byte order mark helper rule; identical to one< 0xfeff >, i.e. it matches
// the single code point U+FEFF (the 3 bytes EF BB BF in UTF-8).
struct bom : one< 0xfeff > {};
// clang-format on

} // namespace utf8
Expand Down
3 changes: 3 additions & 0 deletions src/test/pegtl/utf16_general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ namespace tao
verify_rule< utf16::one< 0x20ac > >( __LINE__, __FILE__, u16s( 0x20ac ), result_type::SUCCESS, 0 );
verify_rule< utf16::one< 0x10437 > >( __LINE__, __FILE__, u16s( 0xd801 ) + u16s( 0xdc37 ), result_type::SUCCESS, 0 );

verify_rule< utf16::bom >( __LINE__, __FILE__, u16s( 0xfeff ), result_type::SUCCESS, 0 );
verify_rule< utf16::bom >( __LINE__, __FILE__, u16s( 0xfffe ), result_type::LOCAL_FAILURE, 2 );

verify_rule< utf16::string< 0x20, 0x20ac, 0x10437 > >( __LINE__, __FILE__, u16s( 0x20 ) + u16s( 0x20ac ) + u16s( 0xd801 ) + u16s( 0xdc37 ) + u16s( 0x20 ), result_type::SUCCESS, 2 );
}

Expand Down
3 changes: 3 additions & 0 deletions src/test/pegtl/utf8_general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,9 @@ namespace tao
verify_rule< utf8::one< 0x20ac > >( __LINE__, __FILE__, "\xe2\x82\xac", result_type::SUCCESS, 0 );
verify_rule< utf8::one< 0x10348 > >( __LINE__, __FILE__, "\xf0\x90\x8d\x88", result_type::SUCCESS, 0 );

verify_rule< utf8::bom >( __LINE__, __FILE__, "\xef\xbb\xbf", result_type::SUCCESS, 0 );
verify_rule< utf8::bom >( __LINE__, __FILE__, "\xef\xbb\xbf ", result_type::SUCCESS, 1 );

verify_rule< utf8::string< 0x20, 0xa2, 0x20ac, 0x10348 > >( __LINE__, __FILE__, "\x20\xc2\xa2\xe2\x82\xac\xf0\x90\x8d\x88\x20", result_type::SUCCESS, 1 );
}

Expand Down

0 comments on commit 51b1d4e

Please sign in to comment.