From ae70b41d4f46641dbc45c7a4f87954aea356283e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 25 Feb 2022 12:48:15 -0500 Subject: [PATCH] security: fix denial-of-service bug in compiler The regex compiler will happily attempt to compile '(?:){294967295}' by compiling the empty sub-expression 294,967,295 times. Empty sub-expressions don't use any memory in the current implementation, so this doesn't trigger the pre-existing machinery for stopping compilation early if the regex object gets too big. The end result is that while compilation will eventually succeed, it takes a very long time to do so. In this commit, we fix this problem by adding a fake amount of memory every time we compile an empty sub-expression. It turns out we were already tracking an additional amount of indirect heap usage via 'extra_inst_bytes' in the compiler, so we just make it look like compiling an empty sub-expression actually adds an additional 'Inst' to the compiled regex object. This has the effect of causing the regex compiler to reject this sort of regex in a reasonable amount of time by default. Many thanks to @VTCAKAVSMoACE for reporting this, providing the valuable test cases and continuing to test this patch as it was developed. Fixes https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 --- src/compile.rs | 27 +++++++++++++++-- tests/test_default.rs | 70 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/src/compile.rs b/src/compile.rs index 9a2ed5e92..069f445c8 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -38,6 +38,16 @@ pub struct Compiler { suffix_cache: SuffixCache, utf8_seqs: Option, byte_classes: ByteClassSet, + // This keeps track of extra bytes allocated while compiling the regex + // program. Currently, this corresponds to two things. First is the heap + // memory allocated by Unicode character classes ('InstRanges'). Second is + // a "fake" amount of memory used by empty sub-expressions, so that enough + // empty sub-expressions will ultimately trigger the compiler to bail + // because of a size limit restriction. (That empty sub-expressions don't + // add to heap memory usage is more-or-less an implementation detail.) In + // the second case, if we don't bail, then an excessively large repetition + // on an empty sub-expression can result in the compiler using a very large + // amount of CPU time. extra_inst_bytes: usize, } @@ -260,7 +270,7 @@ impl Compiler { self.check_size()?; match *expr.kind() { - Empty => Ok(None), + Empty => self.c_empty(), Literal(hir::Literal::Unicode(c)) => self.c_char(c), Literal(hir::Literal::Byte(b)) => { assert!(self.compiled.uses_bytes()); @@ -378,6 +388,19 @@ impl Compiler { } } + fn c_empty(&mut self) -> ResultOrEmpty { + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // Since 'empty' sub-expressions don't increase the size of + // the actual compiled object, we "fake" an increase in its + // size so that our 'check_size_limit' routine will eventually + // stop compilation if there are too many empty sub-expressions + // (e.g., via a large repetition). + self.extra_inst_bytes += std::mem::size_of::(); + Ok(None) + } + fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { if self.num_exprs > 1 || self.compiled.is_dfa { // Don't ever compile Save instructions for regex sets because @@ -496,7 +519,7 @@ impl Compiler { let mut exprs = exprs.into_iter(); let Patch { mut hole, entry } = loop { match exprs.next() { - None => return Ok(None), + None => return self.c_empty(), Some(e) => { if let Some(p) = self.c(e)? { break p; diff --git a/tests/test_default.rs b/tests/test_default.rs index d4365fbb3..be627f7a6 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -150,3 +150,73 @@ fn regex_is_reasonably_small() { assert_eq!(16, size_of::()); assert_eq!(16, size_of::()); } + +// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 +// See: CVE-2022-24713 +// +// We test that our regex compiler will correctly return a "too big" error when +// we try to use a very large repetition on an *empty* sub-expression. +// +// At the time this test was written, the regex compiler does not represent +// empty sub-expressions with any bytecode instructions. In effect, it's an +// "optimization" to leave them out, since they would otherwise correspond +// to an unconditional JUMP in the regex bytecode (i.e., an unconditional +// epsilon transition in the NFA graph). Therefore, an empty sub-expression +// represents an interesting case for the compiler's size limits. Since it +// doesn't actually contribute any additional memory to the compiled regex +// instructions, the size limit machinery never detects it. Instead, it just +// dumbly tries to compile the empty sub-expression N times, where N is the +// repetition size. +// +// When N is very large, this will cause the compiler to essentially spin and +// do nothing for a decently large amount of time. It causes the regex to take +// quite a bit of time to compile, despite the concrete syntax of the regex +// being quite small. +// +// The degree to which this is actually a problem is somewhat of a judgment +// call. Some regexes simply take a long time to compile. But in general, you +// should be able to reasonably control this by setting lower or higher size +// limits on the compiled object size. But this mitigation doesn't work at all +// for this case. +// +// This particular test is somewhat narrow. It merely checks that regex +// compilation will, at some point, return a "too big" error. Before the +// fix landed, this test would eventually fail because the regex would be +// successfully compiled (after enough time elapsed). So while this test +// doesn't check that we exit in a reasonable amount of time, it does at least +// check that we are properly returning an error at some point. +#[test] +fn big_empty_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){4294967295}"); + assert!(result.is_err()); +} + +// Below is a "billion laughs" variant of the previous test case. +#[test] +fn big_empty_reps_chain_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}"); + assert!(result.is_err()); +} + +// Below is another situation where a zero-length sub-expression can be +// introduced. +#[test] +fn big_zero_reps_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"x{0}{4294967295}"); + assert!(result.is_err()); +} + +// Testing another case for completeness. +#[test] +fn empty_alt_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"(?:|){4294967295}"); + assert!(result.is_err()); +}