diff --git a/doc/Rule-Reference.md b/doc/Rule-Reference.md
index 1b6a26a42..3dce6906d 100644
--- a/doc/Rule-Reference.md
+++ b/doc/Rule-Reference.md
@@ -573,6 +573,12 @@ A unicode code point is considered *valid* when it is in the range `0` to `0x10f
* Equivalent to `seq< one< C1 >, one< C2 >, ... >`.
+###### `bom`
+
+* Succeeds when the input is not empty, and:
+* The next 3 bytes are the UTF-8 encoding of the character U+FEFF, the byte order mark (BOM), i.e. the byte sequence `0xEF 0xBB 0xBF`.
+* Equivalent to `one< 0xfeff >`.
+
## UTF-16 Rules
These rules are in namespace `tao::pegtl::utf16`.
@@ -635,6 +641,12 @@ Unaligned memory is no problem on x86 compatible processors; on some other archi
* Equivalent to `seq< one< C1 >, one< C2 >, ... >`.
+###### `bom`
+
+* Succeeds when the input is not empty, and:
+* The next 2 bytes are the UTF-16 encoding of the character U+FEFF, the byte order mark (BOM); a code unit with the byte-swapped value `0xfffe` is not matched.
+* Equivalent to `one< 0xfeff >`.
+
## UTF-32 Rules
These rules are in namespace `tao::pegtl::utf32`.
@@ -708,6 +720,8 @@ Unaligned memory is no problem on x86 compatible processors; on some other archi
* [`blank`](#blank) [(ascii rules)](#ascii-rules)
* [`bof`](#bof) [(atomic rules)](#atomic-rules)
* [`bol`](#bol) [(atomic rules)](#atomic-rules)
+* [`bom`](#bom) [(utf-8 rules)](#utf-8-rules)
+* [`bom`](#bom-1) [(utf-16 rules)](#utf-16-rules)
* [`bytes< Num >`](#bytes-num-) [(atomic rules)](#atomic-rules)
* [`control< C, R... >`](#control-c-r-) [(meta rules)](#meta-rules)
* [`digit`](#digit) [(ascii rules)](#ascii-rules)
diff --git a/include/tao/pegtl/utf16.hpp b/include/tao/pegtl/utf16.hpp
index faa7180c8..a3bd61215 100644
--- a/include/tao/pegtl/utf16.hpp
+++ b/include/tao/pegtl/utf16.hpp
@@ -24,6 +24,7 @@ namespace tao
template< char32_t Lo, char32_t Hi > struct range : internal::range< internal::result_on_found::SUCCESS, internal::peek_utf16, Lo, Hi > {};
template< char32_t... Cs > struct ranges : internal::ranges< internal::peek_utf16, Cs... > {};
template< char32_t... Cs > struct string : internal::seq< internal::one< internal::result_on_found::SUCCESS, internal::peek_utf16, Cs >... > {};
+ struct bom : one< 0xfeff > {};  // Byte order mark: matches exactly the single code point U+FEFF via this namespace's `one`.
// clang-format on
} // namespace utf16
diff --git a/include/tao/pegtl/utf8.hpp b/include/tao/pegtl/utf8.hpp
index 9ec046eb2..4813f4e1d 100644
--- a/include/tao/pegtl/utf8.hpp
+++ b/include/tao/pegtl/utf8.hpp
@@ -24,6 +24,7 @@ namespace tao
template< char32_t Lo, char32_t Hi > struct range : internal::range< internal::result_on_found::SUCCESS, internal::peek_utf8, Lo, Hi > {};
template< char32_t... Cs > struct ranges : internal::ranges< internal::peek_utf8, Cs... > {};
template< char32_t... Cs > struct string : internal::seq< internal::one< internal::result_on_found::SUCCESS, internal::peek_utf8, Cs >... > {};
+ struct bom : one< 0xfeff > {};  // Byte order mark: matches exactly the single code point U+FEFF via this namespace's `one`.
// clang-format on
} // namespace utf8
diff --git a/src/test/pegtl/utf16_general.cpp b/src/test/pegtl/utf16_general.cpp
index 931859bbb..a1e2b7c96 100644
--- a/src/test/pegtl/utf16_general.cpp
+++ b/src/test/pegtl/utf16_general.cpp
@@ -49,6 +49,9 @@ namespace tao
verify_rule< utf16::one< 0x20ac > >( __LINE__, __FILE__, u16s( 0x20ac ), result_type::SUCCESS, 0 );
verify_rule< utf16::one< 0x10437 > >( __LINE__, __FILE__, u16s( 0xd801 ) + u16s( 0xdc37 ), result_type::SUCCESS, 0 );
+ verify_rule< utf16::bom >( __LINE__, __FILE__, u16s( 0xfeff ), result_type::SUCCESS, 0 );
+ verify_rule< utf16::bom >( __LINE__, __FILE__, u16s( 0xfffe ), result_type::LOCAL_FAILURE, 2 );
+
verify_rule< utf16::string< 0x20, 0x20ac, 0x10437 > >( __LINE__, __FILE__, u16s( 0x20 ) + u16s( 0x20ac ) + u16s( 0xd801 ) + u16s( 0xdc37 ) + u16s( 0x20 ), result_type::SUCCESS, 2 );
}
diff --git a/src/test/pegtl/utf8_general.cpp b/src/test/pegtl/utf8_general.cpp
index 2caad339e..6091b8d72 100644
--- a/src/test/pegtl/utf8_general.cpp
+++ b/src/test/pegtl/utf8_general.cpp
@@ -252,6 +252,9 @@ namespace tao
verify_rule< utf8::one< 0x20ac > >( __LINE__, __FILE__, "\xe2\x82\xac", result_type::SUCCESS, 0 );
verify_rule< utf8::one< 0x10348 > >( __LINE__, __FILE__, "\xf0\x90\x8d\x88", result_type::SUCCESS, 0 );
+ verify_rule< utf8::bom >( __LINE__, __FILE__, "\xef\xbb\xbf", result_type::SUCCESS, 0 );
+ verify_rule< utf8::bom >( __LINE__, __FILE__, "\xef\xbb\xbf ", result_type::SUCCESS, 1 );
+
verify_rule< utf8::string< 0x20, 0xa2, 0x20ac, 0x10348 > >( __LINE__, __FILE__, "\x20\xc2\xa2\xe2\x82\xac\xf0\x90\x8d\x88\x20", result_type::SUCCESS, 1 );
}