From ae70b41d4f46641dbc45c7a4f87954aea356283e Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Fri, 25 Feb 2022 12:48:15 -0500
Subject: [PATCH] security: fix denial-of-service bug in compiler

The regex compiler will happily attempt to compile '(?:){294967295}' by
compiling the empty sub-expression 294,967,295 times. Empty
sub-expressions don't use any memory in the current implementation, so
this doesn't trigger the pre-existing machinery for stopping compilation
early if the regex object gets too big. The end result is that while
compilation will eventually succeed, it takes a very long time to do so.

In this commit, we fix this problem by adding a fake amount of memory
every time we compile an empty sub-expression. It turns out we were
already tracking an additional amount of indirect heap usage via
'extra_inst_bytes' in the compiler, so we just make it look like
compiling an empty sub-expression actually adds an additional 'Inst' to
the compiled regex object.

This has the effect of causing the regex compiler to reject this sort of
regex in a reasonable amount of time by default.

Many thanks to @VTCAKAVSMoACE for reporting this, providing the valuable
test cases and continuing to test this patch as it was developed.

Fixes https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
---
 src/compile.rs        | 27 +++++++++++++++--
 tests/test_default.rs | 70 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 2 deletions(-)
diff --git a/src/compile.rs b/src/compile.rs
index 9a2ed5e92..069f445c8 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -38,6 +38,16 @@ pub struct Compiler {
     suffix_cache: SuffixCache,
     utf8_seqs: Option<Utf8Sequences>,
     byte_classes: ByteClassSet,
+    // This keeps track of extra bytes allocated while compiling the regex
+    // program. Currently, this corresponds to two things. First is the heap
+    // memory allocated by Unicode character classes ('InstRanges'). Second is
+    // a "fake" amount of memory used by empty sub-expressions, so that enough
+    // empty sub-expressions will ultimately trigger the compiler to bail
+    // because of a size limit restriction. (That empty sub-expressions don't
+    // add to heap memory usage is more-or-less an implementation detail.) In
+    // the second case, if we don't bail, then an excessively large repetition
+    // on an empty sub-expression can result in the compiler using a very large
+    // amount of CPU time.
     extra_inst_bytes: usize,
 }
 
@@ -260,7 +270,7 @@ impl Compiler {
 
         self.check_size()?;
         match *expr.kind() {
-            Empty => Ok(None),
+            Empty => self.c_empty(),
             Literal(hir::Literal::Unicode(c)) => self.c_char(c),
             Literal(hir::Literal::Byte(b)) => {
                 assert!(self.compiled.uses_bytes());
@@ -378,6 +388,19 @@ impl Compiler {
         }
     }
 
+    fn c_empty(&mut self) -> ResultOrEmpty {
+        // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+        // See: CVE-2022-24713
+        //
+        // Since 'empty' sub-expressions don't increase the size of
+        // the actual compiled object, we "fake" an increase in its
+        // size so that our 'check_size_limit' routine will eventually
+        // stop compilation if there are too many empty sub-expressions
+        // (e.g., via a large repetition).
+        self.extra_inst_bytes += std::mem::size_of::<Inst>();
+        Ok(None)
+    }
+
     fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
         if self.num_exprs > 1 || self.compiled.is_dfa {
             // Don't ever compile Save instructions for regex sets because
@@ -496,7 +519,7 @@ impl Compiler {
         let mut exprs = exprs.into_iter();
         let Patch { mut hole, entry } = loop {
             match exprs.next() {
-                None => return Ok(None),
+                None => return self.c_empty(),
                 Some(e) => {
                     if let Some(p) = self.c(e)? {
                         break p;
diff --git a/tests/test_default.rs b/tests/test_default.rs
index d4365fbb3..be627f7a6 100644
--- a/tests/test_default.rs
+++ b/tests/test_default.rs
@@ -150,3 +150,73 @@ fn regex_is_reasonably_small() {
     assert_eq!(16, size_of::<bytes::Regex>());
     assert_eq!(16, size_of::<bytes::RegexSet>());
 }
+
+// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
+// See: CVE-2022-24713
+//
+// We test that our regex compiler will correctly return a "too big" error when
+// we try to use a very large repetition on an *empty* sub-expression.
+//
+// At the time this test was written, the regex compiler does not represent
+// empty sub-expressions with any bytecode instructions. In effect, it's an
+// "optimization" to leave them out, since they would otherwise correspond
+// to an unconditional JUMP in the regex bytecode (i.e., an unconditional
+// epsilon transition in the NFA graph). Therefore, an empty sub-expression
+// represents an interesting case for the compiler's size limits. Since it
+// doesn't actually contribute any additional memory to the compiled regex
+// instructions, the size limit machinery never detects it. Instead, it just
+// dumbly tries to compile the empty sub-expression N times, where N is the
+// repetition size.
+//
+// When N is very large, this will cause the compiler to essentially spin and
+// do nothing for a decently large amount of time. It causes the regex to take
+// quite a bit of time to compile, despite the concrete syntax of the regex
+// being quite small.
+//
+// The degree to which this is actually a problem is somewhat of a judgment
+// call. Some regexes simply take a long time to compile. But in general, you
+// should be able to reasonably control this by setting lower or higher size
+// limits on the compiled object size. But this mitigation doesn't work at all
+// for this case.
+//
+// This particular test is somewhat narrow. It merely checks that regex
+// compilation will, at some point, return a "too big" error. Before the
+// fix landed, this test would eventually fail because the regex would be
+// successfully compiled (after enough time elapsed). So while this test
+// doesn't check that we exit in a reasonable amount of time, it does at least
+// check that we are properly returning an error at some point.
+#[test]
+fn big_empty_regex_fails() {
+    use regex::Regex;
+
+    let result = Regex::new("(?:){4294967295}");
+    assert!(result.is_err());
+}
+
+// Below is a "billion laughs" variant of the previous test case.
+#[test]
+fn big_empty_reps_chain_regex_fails() {
+    use regex::Regex;
+
+    let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}");
+    assert!(result.is_err());
+}
+
+// Below is another situation where a zero-length sub-expression can be
+// introduced.
+#[test]
+fn big_zero_reps_regex_fails() {
+    use regex::Regex;
+
+    let result = Regex::new(r"x{0}{4294967295}");
+    assert!(result.is_err());
+}
+
+// Testing another case for completeness.
+#[test]
+fn empty_alt_regex_fails() {
+    use regex::Regex;
+
+    let result = Regex::new(r"(?:|){4294967295}");
+    assert!(result.is_err());
+}