From 457b1ffab6d99b4b9db0e4579c2be4624ba1b3aa Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 7 Mar 2022 16:52:38 -0500 Subject: [PATCH] Enable RegexOptions.RightToLeft and lookbehinds in compiler / source generator (#66280) * Enable RegexOptions.RightToLeft and lookbehinds in compiler / source generator For .NET 7 we rewrote RegexCompiler as we were writing the source generator, and in doing so we left out support for RegexOptions.RightToLeft as well as lookbehinds (which are implemented via RightToLeft). This adds support for both. I initially started incrementally adding in support for various constructs in lookbehinds, but from a testing perspective it made more sense to just add it all, as then all of the RightToLeft tests are used to validate the constructs that are also in lookbehinds. * Address PR feedback --- .../gen/RegexGenerator.Emitter.cs | 637 +++++++---- .../gen/RegexGenerator.Parser.cs | 4 +- .../Text/RegularExpressions/RegexCompiler.cs | 987 ++++++++++++++---- .../RegexFindOptimizations.cs | 27 +- .../RegularExpressions/RegexLWCGCompiler.cs | 2 +- .../Text/RegularExpressions/RegexNode.cs | 47 +- .../RegularExpressions/RegexTreeAnalyzer.cs | 36 +- .../FunctionalTests/Regex.Match.Tests.cs | 29 +- .../RegexGeneratorParserTests.cs | 45 - .../tests/UnitTests/RegexTreeAnalyzerTests.cs | 1 - 10 files changed, 1313 insertions(+), 502 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 2c72270daa358e..7275ad92fa85e4 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -102,49 +102,6 @@ static uint ComputeStringHash(string s) } } - /// Gets whether a given regular expression method is supported by the code generator. - private static bool SupportsCodeGeneration(RegexMethod rm, out string? 
reason) - { - RegexNode root = rm.Tree.Root; - - if (!root.SupportsCompilation(out reason)) - { - return false; - } - - if (ExceedsMaxDepthForSimpleCodeGeneration(root, allowedDepth: 40)) - { - // Deep RegexNode trees can result in emitting C# code that exceeds C# compiler - // limitations, leading to "CS8078: An expression is too long or complex to compile". - // Place an artificial limit on max tree depth in order to mitigate such issues. - // The allowed depth can be tweaked as needed;its exceedingly rare to find - // expressions with such deep trees. - reason = "the regex will result in code that may exceed C# compiler limits"; - return false; - } - - return true; - - static bool ExceedsMaxDepthForSimpleCodeGeneration(RegexNode node, int allowedDepth) - { - if (allowedDepth <= 0) - { - return true; - } - - int childCount = node.ChildCount(); - for (int i = 0; i < childCount; i++) - { - if (ExceedsMaxDepthForSimpleCodeGeneration(node.Child(i), allowedDepth - 1)) - { - return true; - } - } - - return false; - } - } - /// Generates the code for a regular expression method. private static ImmutableArray EmitRegexMethod(IndentedTextWriter writer, RegexMethod rm, string id, bool allowUnsafe) { @@ -164,11 +121,11 @@ private static ImmutableArray EmitRegexMethod(IndentedTextWriter wri writer.Write(" public static global::System.Text.RegularExpressions.Regex Instance { get; } = "); // If we can't support custom generation for this regex, spit out a Regex constructor call. - if (!SupportsCodeGeneration(rm, out string? reason)) + if (!rm.Tree.Root.SupportsCompilation(out string? 
reason)) { writer.WriteLine(); - writer.WriteLine($"// Cannot generate Regex-derived implementation because {reason}."); - writer.WriteLine($"new global::System.Text.RegularExpressions.Regex({patternExpression}, {optionsExpression}, {timeoutExpression});"); + writer.WriteLine($" // Cannot generate Regex-derived implementation because {reason}."); + writer.WriteLine($" new global::System.Text.RegularExpressions.Regex({patternExpression}, {optionsExpression}, {timeoutExpression});"); writer.WriteLine("}"); return ImmutableArray.Create(Diagnostic.Create(DiagnosticDescriptors.LimitedSourceGeneration, rm.MethodSyntax.GetLocation())); } @@ -345,6 +302,8 @@ static void AppendHashtableContents(IndentedTextWriter writer, Hashtable ht) /// Emits the body of the Scan method override. private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string id) { + bool rtl = (rm.Options & RegexOptions.RightToLeft) != 0; + using (EmitBlock(writer, "while (TryFindNextPossibleStartingPosition(text))")) { if (rm.MatchTimeout != Timeout.Infinite) @@ -354,13 +313,13 @@ private static void EmitScan(IndentedTextWriter writer, RegexMethod rm, string i } writer.WriteLine("// If we find a match on the current position, or we have reached the end of the input, we are done."); - using (EmitBlock(writer, "if (TryMatchAtCurrentPosition(text) || base.runtextpos == text.Length)")) + using (EmitBlock(writer, $"if (TryMatchAtCurrentPosition(text) || base.runtextpos == {(!rtl ? "text.Length" : "0")})")) { writer.WriteLine("return;"); } writer.WriteLine(); - writer.WriteLine("base.runtextpos++;"); + writer.WriteLine($"base.runtextpos{(!rtl ? 
"++" : "--")};"); } } @@ -371,6 +330,7 @@ private static RequiredHelperFunctions EmitTryFindNextPossibleStartingPosition(I RegexTree regexTree = rm.Tree; bool hasTextInfo = false; RequiredHelperFunctions requiredHelpers = RequiredHelperFunctions.None; + bool rtl = (options & RegexOptions.RightToLeft) != 0; // In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later. // To handle that, we build up a collection of all the declarations to include, track where they should be inserted, @@ -389,11 +349,14 @@ private static RequiredHelperFunctions EmitTryFindNextPossibleStartingPosition(I // especially since we want the "return false" code regardless. int minRequiredLength = rm.Tree.FindOptimizations.MinRequiredLength; Debug.Assert(minRequiredLength >= 0); - string clause = minRequiredLength switch + string clause = (minRequiredLength, rtl) switch { - 0 => "if (pos <= inputSpan.Length)", - 1 => "if (pos < inputSpan.Length)", - _ => $"if (pos < inputSpan.Length - {minRequiredLength - 1})" + (0, false) => "if (pos <= inputSpan.Length)", + (1, false) => "if (pos < inputSpan.Length)", + (_, false) => $"if (pos <= inputSpan.Length - {minRequiredLength})", + (0, true) => "if (pos >= 0)", + (1, true) => "if (pos > 0)", + (_, true) => $"if (pos >= {minRequiredLength})", }; using (EmitBlock(writer, clause)) { @@ -412,15 +375,26 @@ private static RequiredHelperFunctions EmitTryFindNextPossibleStartingPosition(I { case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: Debug.Assert(!string.IsNullOrEmpty(regexTree.FindOptimizations.LeadingCaseSensitivePrefix)); - EmitIndexOf(regexTree.FindOptimizations.LeadingCaseSensitivePrefix); + EmitIndexOf_LeftToRight(regexTree.FindOptimizations.LeadingCaseSensitivePrefix); + break; + + case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(regexTree.FindOptimizations.LeadingCaseSensitivePrefix)); + 
EmitIndexOf_RightToLeft(regexTree.FindOptimizations.LeadingCaseSensitivePrefix); break; - case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: - case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: + case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive: + Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + EmitFixedSet_LeftToRight(); + break; + + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: Debug.Assert(regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - EmitFixedSet(); + EmitFixedSet_RightToLeft(); break; case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive: @@ -443,7 +417,7 @@ private static RequiredHelperFunctions EmitTryFindNextPossibleStartingPosition(I const string NoStartingPositionFound = "NoStartingPositionFound"; writer.WriteLine("// No starting position found"); writer.WriteLine($"{NoStartingPositionFound}:"); - writer.WriteLine("base.runtextpos = inputSpan.Length;"); + writer.WriteLine($"base.runtextpos = {(!rtl ? "inputSpan.Length" : "0")};"); writer.WriteLine("return false;"); // We're done. Patch up any additional declarations. @@ -462,17 +436,21 @@ bool EmitAnchors() { case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: writer.WriteLine("// Beginning \\A anchor"); - using (EmitBlock(writer, "if (pos > 0)")) + using (EmitBlock(writer, "if (pos != 0)")) { + // If we're not currently at the beginning, we'll never be, so fail immediately. 
Goto(NoStartingPositionFound); } writer.WriteLine("return true;"); return true; case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: writer.WriteLine("// Start \\G anchor"); - using (EmitBlock(writer, "if (pos > base.runtextstart)")) + using (EmitBlock(writer, "if (pos != base.runtextstart)")) { + // For both left-to-right and right-to-left, if we're not currently at the start position (because + // we've already moved beyond it), then we'll never be, so fail immediately. Goto(NoStartingPositionFound); } writer.WriteLine("return true;"); @@ -482,6 +460,8 @@ bool EmitAnchors() writer.WriteLine("// Leading end \\Z anchor"); using (EmitBlock(writer, "if (pos < inputSpan.Length - 1)")) { + // If we're not currently at the end (or a newline just before it), skip ahead + // since nothing until then can possibly match. writer.WriteLine("base.runtextpos = inputSpan.Length - 1;"); } writer.WriteLine("return true;"); @@ -491,11 +471,47 @@ bool EmitAnchors() writer.WriteLine("// Leading end \\z anchor"); using (EmitBlock(writer, "if (pos < inputSpan.Length)")) { + // If we're not currently at the very end of the input, skip ahead + // since nothing until then can possibly match. writer.WriteLine("base.runtextpos = inputSpan.Length;"); } writer.WriteLine("return true;"); return true; + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: + writer.WriteLine("// Beginning \\A anchor"); + using (EmitBlock(writer, "if (pos != 0)")) + { + // If we're not currently at the beginning, skip ahead (or, rather, backwards) + // since nothing until then can possibly match. (We're iterating from the end + // to the beginning in RightToLeft mode.) 
+ writer.WriteLine("base.runtextpos = 0;"); + } + writer.WriteLine("return true;"); + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: + writer.WriteLine("// Leading end \\Z anchor"); + using (EmitBlock(writer, "if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\\n'))")) + { + // If we're not currently at the end, we'll never be (we're iterating from end to beginning), + // so fail immediately. + Goto(NoStartingPositionFound); + } + writer.WriteLine("return true;"); + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: + writer.WriteLine("// Leading end \\z anchor"); + using (EmitBlock(writer, "if (pos < inputSpan.Length)")) + { + // If we're not currently at the end, we'll never be (we're iterating from end to beginning), + // so fail immediately. + Goto(NoStartingPositionFound); + } + writer.WriteLine("return true;"); + return true; + case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ: // Jump to the end, minus the min required length, which in this case is actually the fixed length, minus 1 (for a possible ending \n). writer.WriteLine("// Trailing end \\Z anchor with fixed-length match"); @@ -576,7 +592,7 @@ bool EmitAnchors() } // Emits a case-sensitive prefix search for a string at the beginning of the pattern. - void EmitIndexOf(string prefix) + void EmitIndexOf_LeftToRight(string prefix) { writer.WriteLine($"int i = global::System.MemoryExtensions.IndexOf(inputSpan.Slice(pos), {Literal(prefix)});"); writer.WriteLine("if (i >= 0)"); @@ -586,9 +602,20 @@ void EmitIndexOf(string prefix) writer.WriteLine("}"); } + // Emits a case-sensitive right-to-left prefix search for a string at the beginning of the pattern. 
+ void EmitIndexOf_RightToLeft(string prefix) + { + writer.WriteLine($"pos = global::System.MemoryExtensions.LastIndexOf(inputSpan.Slice(0, pos), {Literal(prefix)});"); + writer.WriteLine("if (pos >= 0)"); + writer.WriteLine("{"); + writer.WriteLine($" base.runtextpos = pos + {prefix.Length};"); + writer.WriteLine(" return true;"); + writer.WriteLine("}"); + } + // Emits a search for a set at a fixed position from the start of the pattern, // and potentially other sets at other fixed positions in the pattern. - void EmitFixedSet() + void EmitFixedSet_LeftToRight() { List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = regexTree.FindOptimizations.FixedDistanceSets; (char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0]; @@ -702,6 +729,35 @@ void EmitFixedSet() loopBlock.Dispose(); } + // Emits a right-to-left search for a set at a fixed position from the start of the pattern. + // (Currently that position will always be a distance of 0, meaning the start of the pattern itself.) + void EmitFixedSet_RightToLeft() + { + (char[]? 
Chars, string Set, int Distance, bool CaseInsensitive) set = regexTree.FindOptimizations.FixedDistanceSets![0]; + Debug.Assert(set.Distance == 0); + + if (set.Chars is { Length: 1 } && !set.CaseInsensitive) + { + writer.WriteLine($"pos = global::System.MemoryExtensions.LastIndexOf(inputSpan.Slice(0, pos), {Literal(set.Chars[0])});"); + writer.WriteLine("if (pos >= 0)"); + writer.WriteLine("{"); + writer.WriteLine(" base.runtextpos = pos + 1;"); + writer.WriteLine(" return true;"); + writer.WriteLine("}"); + } + else + { + using (EmitBlock(writer, "while ((uint)--pos < (uint)inputSpan.Length)")) + { + using (EmitBlock(writer, $"if ({MatchCharacterClass(hasTextInfo, options, "inputSpan[pos]", set.Set, set.CaseInsensitive, negate: false, additionalDeclarations, ref requiredHelpers)})")) + { + writer.WriteLine("base.runtextpos = pos + 1;"); + writer.WriteLine("return true;"); + } + } + } + } + // Emits a search for a literal following a leading atomic single-character loop. void EmitLiteralAfterAtomicLoop() { @@ -833,7 +889,7 @@ private static RequiredHelperFunctions EmitTryMatchAtCurrentPosition(IndentedTex // This is the case for single and multiple characters, though the whole thing is only guaranteed // to have been validated in TryFindNextPossibleStartingPosition when doing case-sensitive comparison. writer.WriteLine($"int start = base.runtextpos;"); - writer.WriteLine($"int end = start + {(node.Kind == RegexNodeKind.Multi ? node.Str!.Length : 1)};"); + writer.WriteLine($"int end = start {((node.Options & RegexOptions.RightToLeft) == 0 ? "+" : "-")} {(node.Kind == RegexNodeKind.Multi ? node.Str!.Length : 1)};"); writer.WriteLine("base.Capture(0, start, end);"); writer.WriteLine("base.runtextpos = end;"); writer.WriteLine("return true;"); @@ -1001,13 +1057,17 @@ string SpanLengthCheck(int requiredLength, string? 
dynamicRequiredLength = null) // Adds the value of sliceStaticPos into the pos local, slices slice by the corresponding amount, // and zeros out sliceStaticPos. - void TransferSliceStaticPosToPos() + void TransferSliceStaticPosToPos(bool forceSliceReload = false) { if (sliceStaticPos > 0) { EmitAdd(writer, "pos", sliceStaticPos); - writer.WriteLine($"{sliceSpan} = {sliceSpan}.Slice({sliceStaticPos});"); sliceStaticPos = 0; + SliceInputSpan(writer); + } + else if (forceSliceReload) + { + SliceInputSpan(writer); } } @@ -1035,16 +1095,20 @@ void EmitAlternation(RegexNode node) // the whole alternation can be treated as a simple switch, so we special-case that. However, // we can't goto _into_ switch cases, which means we can't use this approach if there's any // possibility of backtracking into the alternation. - bool useSwitchedBranches = isAtomic; - if (!useSwitchedBranches) + bool useSwitchedBranches = false; + if ((node.Options & RegexOptions.RightToLeft) == 0) { - useSwitchedBranches = true; - for (int i = 0; i < childCount; i++) + useSwitchedBranches = isAtomic; + if (!useSwitchedBranches) { - if (analysis.MayBacktrack(node.Child(i))) + useSwitchedBranches = true; + for (int i = 0; i < childCount; i++) { - useSwitchedBranches = false; - break; + if (analysis.MayBacktrack(node.Child(i))) + { + useSwitchedBranches = false; + break; + } } } } @@ -1413,20 +1477,47 @@ void EmitWhenHasCapture() additionalDeclarations.Add("int matchLength = 0;"); writer.WriteLine($"matchLength = base.MatchLength({capnum});"); - if (!IsCaseInsensitive(node)) + bool caseInsensitive = IsCaseInsensitive(node); + + if ((node.Options & RegexOptions.RightToLeft) == 0) { - // If we're case-sensitive, we can simply validate that the remaining length of the slice is sufficient - // to possibly match, and then do a SequenceEqual against the matched text. 
- writer.WriteLine($"if ({sliceSpan}.Length < matchLength || "); - using (EmitBlock(writer, $" !global::System.MemoryExtensions.SequenceEqual(inputSpan.Slice(base.MatchIndex({capnum}), matchLength), {sliceSpan}.Slice(0, matchLength)))")) + if (!caseInsensitive) { - Goto(doneLabel); + // If we're case-sensitive, we can simply validate that the remaining length of the slice is sufficient + // to possibly match, and then do a SequenceEqual against the matched text. + writer.WriteLine($"if ({sliceSpan}.Length < matchLength || "); + using (EmitBlock(writer, $" !global::System.MemoryExtensions.SequenceEqual(inputSpan.Slice(base.MatchIndex({capnum}), matchLength), {sliceSpan}.Slice(0, matchLength)))")) + { + Goto(doneLabel); + } } + else + { + // For case-insensitive, we have to walk each character individually. + using (EmitBlock(writer, $"if ({sliceSpan}.Length < matchLength)")) + { + Goto(doneLabel); + } + writer.WriteLine(); + + additionalDeclarations.Add("int matchIndex = 0;"); + writer.WriteLine($"matchIndex = base.MatchIndex({capnum});"); + using (EmitBlock(writer, $"for (int i = 0; i < matchLength; i++)")) + { + using (EmitBlock(writer, $"if ({ToLower(hasTextInfo, options, $"inputSpan[matchIndex + i]")} != {ToLower(hasTextInfo, options, $"{sliceSpan}[i]")})")) + { + Goto(doneLabel); + } + } + } + + writer.WriteLine(); + writer.WriteLine($"pos += matchLength;"); + SliceInputSpan(writer); } else { - // For case-insensitive, we have to walk each character individually. 
- using (EmitBlock(writer, $"if ({sliceSpan}.Length < matchLength)")) + using (EmitBlock(writer, $"if (pos < matchLength)")) { Goto(doneLabel); } @@ -1436,16 +1527,15 @@ void EmitWhenHasCapture() writer.WriteLine($"matchIndex = base.MatchIndex({capnum});"); using (EmitBlock(writer, $"for (int i = 0; i < matchLength; i++)")) { - using (EmitBlock(writer, $"if ({ToLower(hasTextInfo, options, $"inputSpan[matchIndex + i]")} != {ToLower(hasTextInfo, options, $"{sliceSpan}[i]")})")) + using (EmitBlock(writer, $"if ({ToLowerIfNeeded(hasTextInfo, options, $"inputSpan[matchIndex + i]", caseInsensitive)} != {ToLowerIfNeeded(hasTextInfo, options, $"inputSpan[pos - matchLength + i]", caseInsensitive)})")) { Goto(doneLabel); } } - } - writer.WriteLine(); - writer.WriteLine($"pos += matchLength;"); - SliceInputSpan(writer); + writer.WriteLine(); + writer.WriteLine($"pos -= matchLength;"); + } } } @@ -1619,7 +1709,7 @@ void EmitExpressionConditional(RegexNode node) // The first child node is the condition expression. If this matches, then we branch to the "yes" branch. // If it doesn't match, then we branch to the optional "no" branch if it exists, or simply skip the "yes" - // branch, otherwise. The condition is treated as a positive lookahead. + // branch, otherwise. The condition is treated as a positive lookaround. RegexNode condition = node.Child(0); // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus @@ -1650,12 +1740,12 @@ void EmitExpressionConditional(RegexNode node) } // Emit the condition expression. Route any failures to after the yes branch. 
This code is almost - // the same as for a positive lookahead; however, a positive lookahead only needs to reset the position + // the same as for a positive lookaround; however, a positive lookaround only needs to reset the position // on a successful match, as a failed match fails the whole expression; here, we need to reset the // position on completion, regardless of whether the match is successful or not. doneLabel = expressionNotMatched; - // Save off pos. We'll need to reset this upon successful completion of the lookahead. + // Save off pos. We'll need to reset this upon successful completion of the lookaround. string startingPos = ReserveName("conditionalexpression_starting_pos"); writer.WriteLine($"int {startingPos} = pos;"); writer.WriteLine(); @@ -1669,7 +1759,7 @@ void EmitExpressionConditional(RegexNode node) doneLabel = originalDoneLabel; // After the condition completes successfully, reset the text positions. - // Do not reset captures, which persist beyond the lookahead. + // Do not reset captures, which persist beyond the lookaround. writer.WriteLine("// Condition matched:"); writer.WriteLine($"pos = {startingPos};"); SliceInputSpan(writer); @@ -1846,14 +1936,26 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) } } - // Emits the code to handle a positive lookahead assertion. - void EmitPositiveLookaheadAssertion(RegexNode node) + // Emits the code to handle a positive lookaround assertion. This is a positive lookahead + // for left-to-right and a positive lookbehind for right-to-left. + void EmitPositiveLookaroundAssertion(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.PositiveLookaround, $"Unexpected type: {node.Kind}"); Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); - // Save off pos. We'll need to reset this upon successful completion of the lookahead. 
- string startingPos = ReserveName("positivelookahead_starting_pos"); + if (analysis.HasRightToLeft) + { + // Lookarounds are the only places in the node tree where we might change direction, + // i.e. where we might go from RegexOptions.None to RegexOptions.RightToLeft, or vice + // versa. This is because lookbehinds are implemented by making the whole subgraph be + // RegexOptions.RightToLeft and reversed. Since we use static position to optimize left-to-right + // and don't use it in support of right-to-left, we need to resync the static position + // to the current position when entering a lookaround, just in case we're changing direction. + TransferSliceStaticPosToPos(forceSliceReload: true); + } + + // Save off pos. We'll need to reset this upon successful completion of the lookaround. + string startingPos = ReserveName("positivelookaround_starting_pos"); writer.WriteLine($"int {startingPos} = pos;"); writer.WriteLine(); int startingSliceStaticPos = sliceStaticPos; @@ -1871,28 +1973,40 @@ void EmitPositiveLookaheadAssertion(RegexNode node) } // After the child completes successfully, reset the text positions. - // Do not reset captures, which persist beyond the lookahead. + // Do not reset captures, which persist beyond the lookaround. writer.WriteLine(); writer.WriteLine($"pos = {startingPos};"); SliceInputSpan(writer); sliceStaticPos = startingSliceStaticPos; } - // Emits the code to handle a negative lookahead assertion. - void EmitNegativeLookaheadAssertion(RegexNode node) + // Emits the code to handle a negative lookaround assertion. This is a negative lookahead + // for left-to-right and a negative lookbehind for right-to-left. 
+ void EmitNegativeLookaroundAssertion(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}"); Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); + if (analysis.HasRightToLeft) + { + // Lookarounds are the only places in the node tree where we might change direction, + // i.e. where we might go from RegexOptions.None to RegexOptions.RightToLeft, or vice + // versa. This is because lookbehinds are implemented by making the whole subgraph be + // RegexOptions.RightToLeft and reversed. Since we use static position to optimize left-to-right + // and don't use it in support of right-to-left, we need to resync the static position + // to the current position when entering a lookaround, just in case we're changing direction. + TransferSliceStaticPosToPos(forceSliceReload: true); + } + string originalDoneLabel = doneLabel; - // Save off pos. We'll need to reset this upon successful completion of the lookahead. - string startingPos = ReserveName("negativelookahead_starting_pos"); + // Save off pos. We'll need to reset this upon successful completion of the lookaround. + string startingPos = ReserveName("negativelookaround_starting_pos"); writer.WriteLine($"int {startingPos} = pos;"); int startingSliceStaticPos = sliceStaticPos; - string negativeLookaheadDoneLabel = ReserveName("NegativeLookaheadMatch"); - doneLabel = negativeLookaheadDoneLabel; + string negativeLookaroundDoneLabel = ReserveName("NegativeLookaroundMatch"); + doneLabel = negativeLookaroundDoneLabel; // Emit the child. RegexNode child = node.Child(0); @@ -1906,16 +2020,16 @@ void EmitNegativeLookaheadAssertion(RegexNode node) EmitNode(child); } - // If the generated code ends up here, it matched the lookahead, which actually - // means failure for a _negative_ lookahead, so we need to jump to the original done. 
+ // If the generated code ends up here, it matched the lookaround, which actually + // means failure for a _negative_ lookaround, so we need to jump to the original done. writer.WriteLine(); Goto(originalDoneLabel); writer.WriteLine(); - // Failures (success for a negative lookahead) jump here. - MarkLabel(negativeLookaheadDoneLabel, emitSemicolon: false); + // Failures (success for a negative lookaround) jump here. + MarkLabel(negativeLookaroundDoneLabel, emitSemicolon: false); - // After the child completes in failure (success for negative lookahead), reset the text positions. + // After the child completes in failure (success for negative lookaround), reset the text positions. writer.WriteLine($"pos = {startingPos};"); SliceInputSpan(writer); sliceStaticPos = startingSliceStaticPos; @@ -1932,6 +2046,16 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck return; } + if ((node.Options & RegexOptions.RightToLeft) != 0) + { + // RightToLeft doesn't take advantage of static positions. While RightToLeft won't update static + // positions, a previous operation may have left us with a non-zero one. Make sure it's zero'd out + // such that pos and slice are up-to-date. Note that RightToLeft also shouldn't use the slice span, + // as it's not kept up-to-date; any RightToLeft implementation that wants to use it must first update + // it from pos. + TransferSliceStaticPosToPos(); + } + // Separate out several node types that, for conciseness, don't need a header and scope written into the source. switch (node.Kind) { @@ -2039,11 +2163,11 @@ void EmitNode(RegexNode node, RegexNode? 
subsequent = null, bool emitLengthCheck break; case RegexNodeKind.PositiveLookaround: - EmitPositiveLookaheadAssertion(node); + EmitPositiveLookaroundAssertion(node); break; case RegexNodeKind.NegativeLookaround: - EmitNegativeLookaheadAssertion(node); + EmitNegativeLookaroundAssertion(node); break; case RegexNodeKind.UpdateBumpalong: @@ -2112,7 +2236,9 @@ void EmitConcatenation(RegexNode node, RegexNode? subsequent, bool emitLengthChe // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence // and then skip the individual length checks for each. We also want to minimize the repetition of if blocks, // and so we try to emit a series of clauses all part of the same if block rather than one if block per child. - if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) + if ((node.Options & RegexOptions.RightToLeft) == 0 && + emitLengthChecksIfRequired && + node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) { bool wroteClauses = true; writer.Write($"if ({SpanLengthCheck(requiredLength)}"); @@ -2220,11 +2346,13 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset { Debug.Assert(node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily, $"Unexpected type: {node.Kind}"); - // This only emits a single check, but it's called from the looping constructs in a loop - // to generate the code for a single check, so we map those looping constructs to the - // appropriate single check. + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; + Debug.Assert(!rtl || offset is null); + Debug.Assert(!rtl || !clauseOnly); - string expr = $"{sliceSpan}[{Sum(sliceStaticPos, offset)}]"; + string expr = !rtl ? 
+ $"{sliceSpan}[{Sum(sliceStaticPos, offset)}]" : + "inputSpan[pos - 1]"; if (node.IsSetFamily) { @@ -2242,13 +2370,25 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, string? offset } else { - using (EmitBlock(writer, emitLengthCheck ? $"if ({SpanLengthCheck(1, offset)} || {expr})" : $"if ({expr})")) + string clause = + !emitLengthCheck ? $"if ({expr})" : + !rtl ? $"if ({SpanLengthCheck(1, offset)} || {expr})" : + $"if ((uint)(pos - 1) >= inputSpan.Length || {expr})"; + + using (EmitBlock(writer, clause)) { Goto(doneLabel); } } - sliceStaticPos++; + if (!rtl) + { + sliceStaticPos++; + } + else + { + writer.WriteLine("pos--;"); + } } // Emits the code to handle a boundary check on a character. @@ -2283,8 +2423,9 @@ RegexNodeKind.Boundary or void EmitAnchors(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.Bol or RegexNodeKind.End or RegexNodeKind.EndZ or RegexNodeKind.Eol, $"Unexpected type: {node.Kind}"); - + Debug.Assert((node.Options & RegexOptions.RightToLeft) == 0 || sliceStaticPos == 0); Debug.Assert(sliceStaticPos >= 0); + switch (node.Kind) { case RegexNodeKind.Beginning: @@ -2297,7 +2438,9 @@ void EmitAnchors(RegexNode node) } else { - using (EmitBlock(writer, node.Kind == RegexNodeKind.Beginning ? "if (pos != 0)" : "if (pos != base.runtextstart)")) + using (EmitBlock(writer, node.Kind == RegexNodeKind.Beginning ? + "if (pos != 0)" : + "if (pos != base.runtextstart)")) { Goto(doneLabel); } @@ -2305,48 +2448,40 @@ void EmitAnchors(RegexNode node) break; case RegexNodeKind.Bol: - if (sliceStaticPos > 0) + using (EmitBlock(writer, sliceStaticPos > 0 ? 
+ $"if ({sliceSpan}[{sliceStaticPos - 1}] != '\\n')" : + $"if (pos > 0 && inputSpan[pos - 1] != '\\n')")) { - using (EmitBlock(writer, $"if ({sliceSpan}[{sliceStaticPos - 1}] != '\\n')")) - { - Goto(doneLabel); - } - } - else - { - // We can't use our slice in this case, because we'd need to access slice[-1], so we access the inputSpan field directly: - using (EmitBlock(writer, $"if (pos > 0 && inputSpan[pos - 1] != '\\n')")) - { - Goto(doneLabel); - } + Goto(doneLabel); } break; case RegexNodeKind.End: - using (EmitBlock(writer, $"if ({IsSliceLengthGreaterThanSliceStaticPos()})")) + using (EmitBlock(writer, sliceStaticPos > 0 ? + $"if ({sliceStaticPos} < {sliceSpan}.Length)" : + "if ((uint)pos < (uint)inputSpan.Length)")) { Goto(doneLabel); } break; case RegexNodeKind.EndZ: - writer.WriteLine($"if ({sliceSpan}.Length > {sliceStaticPos + 1} || ({IsSliceLengthGreaterThanSliceStaticPos()} && {sliceSpan}[{sliceStaticPos}] != '\\n'))"); - using (EmitBlock(writer, null)) + using (EmitBlock(writer, sliceStaticPos > 0 ? + $"if ({sliceStaticPos + 1} < {sliceSpan}.Length || ({sliceStaticPos} < {sliceSpan}.Length && {sliceSpan}[{sliceStaticPos}] != '\\n'))" : + "if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\\n'))")) { Goto(doneLabel); } break; case RegexNodeKind.Eol: - using (EmitBlock(writer, $"if ({IsSliceLengthGreaterThanSliceStaticPos()} && {sliceSpan}[{sliceStaticPos}] != '\\n')")) + using (EmitBlock(writer, sliceStaticPos > 0 ? + $"if ({sliceStaticPos} < {sliceSpan}.Length && {sliceSpan}[{sliceStaticPos}] != '\\n')" : + "if ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\\n')")) { Goto(doneLabel); } break; - - string IsSliceLengthGreaterThanSliceStaticPos() => - sliceStaticPos == 0 ? 
$"!{sliceSpan}.IsEmpty" : - $"{sliceSpan}.Length > {sliceStaticPos}"; } } @@ -2355,13 +2490,33 @@ void EmitMultiChar(RegexNode node, bool emitLengthCheck) { Debug.Assert(node.Kind is RegexNodeKind.Multi, $"Unexpected type: {node.Kind}"); Debug.Assert(node.Str is not null); - EmitMultiCharString(node.Str, IsCaseInsensitive(node), emitLengthCheck); + EmitMultiCharString(node.Str, IsCaseInsensitive(node), emitLengthCheck, (node.Options & RegexOptions.RightToLeft) != 0); } - void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck) + void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, bool rightToLeft) { Debug.Assert(str.Length >= 2); + if (rightToLeft) + { + Debug.Assert(emitLengthCheck); + using (EmitBlock(writer, $"if ((uint)(pos - {str.Length}) >= inputSpan.Length)")) + { + Goto(doneLabel); + } + writer.WriteLine(); + + using (EmitBlock(writer, $"for (int i = 0; i < {str.Length}; i++)")) + { + using (EmitBlock(writer, $"if ({ToLowerIfNeeded(hasTextInfo, options, "inputSpan[--pos]", caseInsensitive)} != {Literal(str)}[{str.Length - 1} - i])")) + { + Goto(doneLabel); + } + } + + return; + } + if (caseInsensitive) // StartsWith(..., XxIgnoreCase) won't necessarily be the same as char-by-char comparison { // This case should be relatively rare. It will only occur with IgnoreCase and a series of non-ASCII characters. @@ -2420,6 +2575,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL string startingPos = ReserveName("charloop_starting_pos"); string endingPos = ReserveName("charloop_ending_pos"); additionalDeclarations.Add($"int {startingPos} = 0, {endingPos} = 0;"); + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; // We're about to enter a loop, so ensure our text position is 0. TransferSliceStaticPosToPos(); @@ -2437,7 +2593,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? 
subsequent = null, bool emitL TransferSliceStaticPosToPos(); writer.WriteLine($"{endingPos} = pos;"); - EmitAdd(writer, startingPos, node.M); + EmitAdd(writer, startingPos, !rtl ? node.M : -node.M); Goto(endLoop); writer.WriteLine(); @@ -2452,7 +2608,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL EmitStackPop(endingPos, startingPos); writer.WriteLine(); - if (subsequent?.FindStartingLiteral() is ValueTuple literal) + if (!rtl && subsequent?.FindStartingLiteral() is ValueTuple literal) { writer.WriteLine($"if ({startingPos} >= {endingPos} ||"); using (EmitBlock(writer, @@ -2472,14 +2628,17 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL } else { - using (EmitBlock(writer, $"if ({startingPos} >= {endingPos})")) + using (EmitBlock(writer, $"if ({startingPos} {(!rtl ? ">=" : "<=")} {endingPos})")) { Goto(doneLabel); } - writer.WriteLine($"pos = --{endingPos};"); + writer.WriteLine(!rtl ? $"pos = --{endingPos};" : $"pos = ++{endingPos};"); } - SliceInputSpan(writer); + if (!rtl) + { + SliceInputSpan(writer); + } writer.WriteLine(); MarkLabel(endLoop, emitSemicolon: false); @@ -2586,54 +2745,57 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL // Now that we've appropriately advanced by one character and are set for what comes after the loop, // see if we can skip ahead more iterations by doing a search for a following literal. - if (iterationCount is null && - node.Kind is RegexNodeKind.Notonelazy && - !IsCaseInsensitive(node) && - subsequent?.FindStartingLiteral(4) is ValueTuple literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch - (literal.Item3 is not null ? !literal.Item3.Contains(node.Ch) : (literal.Item2?[0] ?? literal.Item1) != node.Ch)) // no overlap between node.Ch and the start of the literal + if ((node.Options & RegexOptions.RightToLeft) == 0) { - // e.g. 
"<[^>]*?>" - // This lazy loop will consume all characters other than node.Ch until the subsequent literal. - // We can implement it to search for either that char or the literal, whichever comes first. - // If it ends up being that node.Ch, the loop fails (we're only here if we're backtracking). - writer.WriteLine( - literal.Item2 is not null ? $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(node.Ch)}, {Literal(literal.Item2[0])});" : - literal.Item3 is null ? $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(node.Ch)}, {Literal(literal.Item1)});" : - literal.Item3.Length switch - { - 2 => $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(node.Ch)}, {Literal(literal.Item3[0])}, {Literal(literal.Item3[1])});", - _ => $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(node.Ch + literal.Item3)});", - }); - using (EmitBlock(writer, $"if ((uint){startingPos} >= (uint){sliceSpan}.Length || {sliceSpan}[{startingPos}] == {Literal(node.Ch)})")) + if (iterationCount is null && + node.Kind is RegexNodeKind.Notonelazy && + !IsCaseInsensitive(node) && + subsequent?.FindStartingLiteral(4) is ValueTuple literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch + (literal.Item3 is not null ? !literal.Item3.Contains(node.Ch) : (literal.Item2?[0] ?? literal.Item1) != node.Ch)) // no overlap between node.Ch and the start of the literal { - Goto(doneLabel); - } - writer.WriteLine($"pos += {startingPos};"); - SliceInputSpan(writer); - } - else if (iterationCount is null && - node.Kind is RegexNodeKind.Setlazy && - node.Str == RegexCharClass.AnyClass && - subsequent?.FindStartingLiteral() is ValueTuple literal2) - { - // e.g. ".*?string" with RegexOptions.Singleline - // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal - // isn't found, the loop fails. 
We can implement it to just search for that literal. - writer.WriteLine( - literal2.Item2 is not null ? $"{startingPos} = global::System.MemoryExtensions.IndexOf({sliceSpan}, {Literal(literal2.Item2)});" : - literal2.Item3 is null ? $"{startingPos} = global::System.MemoryExtensions.IndexOf({sliceSpan}, {Literal(literal2.Item1)});" : - literal2.Item3.Length switch + // e.g. "<[^>]*?>" + // This lazy loop will consume all characters other than node.Ch until the subsequent literal. + // We can implement it to search for either that char or the literal, whichever comes first. + // If it ends up being that node.Ch, the loop fails (we're only here if we're backtracking). + writer.WriteLine( + literal.Item2 is not null ? $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(node.Ch)}, {Literal(literal.Item2[0])});" : + literal.Item3 is null ? $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(node.Ch)}, {Literal(literal.Item1)});" : + literal.Item3.Length switch + { + 2 => $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(node.Ch)}, {Literal(literal.Item3[0])}, {Literal(literal.Item3[1])});", + _ => $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(node.Ch + literal.Item3)});", + }); + using (EmitBlock(writer, $"if ((uint){startingPos} >= (uint){sliceSpan}.Length || {sliceSpan}[{startingPos}] == {Literal(node.Ch)})")) { - 2 => $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(literal2.Item3[0])}, {Literal(literal2.Item3[1])});", - 3 => $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(literal2.Item3[0])}, {Literal(literal2.Item3[1])}, {Literal(literal2.Item3[2])});", - _ => $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(literal2.Item3)});", - }); - using (EmitBlock(writer, $"if ({startingPos} < 0)")) + Goto(doneLabel); + } + writer.WriteLine($"pos 
+= {startingPos};"); + SliceInputSpan(writer); + } + else if (iterationCount is null && + node.Kind is RegexNodeKind.Setlazy && + node.Str == RegexCharClass.AnyClass && + subsequent?.FindStartingLiteral() is ValueTuple literal2) { - Goto(doneLabel); + // e.g. ".*?string" with RegexOptions.Singleline + // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal + // isn't found, the loop fails. We can implement it to just search for that literal. + writer.WriteLine( + literal2.Item2 is not null ? $"{startingPos} = global::System.MemoryExtensions.IndexOf({sliceSpan}, {Literal(literal2.Item2)});" : + literal2.Item3 is null ? $"{startingPos} = global::System.MemoryExtensions.IndexOf({sliceSpan}, {Literal(literal2.Item1)});" : + literal2.Item3.Length switch + { + 2 => $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(literal2.Item3[0])}, {Literal(literal2.Item3[1])});", + 3 => $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(literal2.Item3[0])}, {Literal(literal2.Item3[1])}, {Literal(literal2.Item3[2])});", + _ => $"{startingPos} = global::System.MemoryExtensions.IndexOfAny({sliceSpan}, {Literal(literal2.Item3)});", + }); + using (EmitBlock(writer, $"if ({startingPos} < 0)")) + { + Goto(doneLabel); + } + writer.WriteLine($"pos += {startingPos};"); + SliceInputSpan(writer); } - writer.WriteLine($"pos += {startingPos};"); - SliceInputSpan(writer); } // Store the position we've left off at in case we need to iterate again. 
@@ -2887,6 +3049,8 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) Debug.Assert(node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily, $"Unexpected type: {node.Kind}"); int iterations = node.M; + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; + switch (iterations) { case 0: @@ -2901,11 +3065,20 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) case <= RegexNode.MultiVsRepeaterLimit when node.IsOneFamily && !IsCaseInsensitive(node): // This is a repeated case-sensitive character; emit it as a multi in order to get all the optimizations // afforded to a multi, e.g. unrolling the loop with multi-char reads/comparisons at a time. - EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthCheck); + EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthCheck, rtl); return; } - if (iterations <= MaxUnrollSize) + if (rtl) + { + TransferSliceStaticPosToPos(); // we don't use static position with rtl + using (EmitBlock(writer, $"for (int i = 0; i < {iterations}; i++)")) + { + EmitTimeoutCheck(writer, hasTimeout); + EmitSingleChar(node); + } + } + else if (iterations <= MaxUnrollSize) { // if ((uint)(sliceStaticPos + iterations - 1) >= (uint)slice.Length || // slice[sliceStaticPos] != c1 || @@ -2981,12 +3154,38 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = Debug.Assert(node.N > node.M); int minIterations = node.M; int maxIterations = node.N; + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; Span setChars = stackalloc char[5]; // 5 is max optimized by IndexOfAny today int numSetChars = 0; - string iterationLocal = ReserveName("iteration"); - if (node.IsNotoneFamily && + + if (rtl) + { + TransferSliceStaticPosToPos(); // we don't use static position for rtl + + string expr = $"inputSpan[pos - {iterationLocal} - 1]"; + if (node.IsSetFamily) + { + expr = MatchCharacterClass(hasTextInfo, options, 
expr, node.Str!, IsCaseInsensitive(node), negate: false, additionalDeclarations, ref requiredHelpers); + } + else + { + expr = ToLowerIfNeeded(hasTextInfo, options, expr, IsCaseInsensitive(node)); + expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; + } + + writer.WriteLine($"int {iterationLocal} = 0;"); + + string maxClause = maxIterations != int.MaxValue ? $"{CountIsLessThan(iterationLocal, maxIterations)} && " : ""; + using (EmitBlock(writer, $"while ({maxClause}pos > {iterationLocal} && {expr})")) + { + EmitTimeoutCheck(writer, hasTimeout); + writer.WriteLine($"{iterationLocal}++;"); + } + writer.WriteLine(); + } + else if (node.IsNotoneFamily && maxIterations == int.MaxValue && (!IsCaseInsensitive(node))) { @@ -3097,8 +3296,15 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = // Now that we've completed our optional iterations, advance the text span // and pos by the number of iterations completed. - writer.WriteLine($"{sliceSpan} = {sliceSpan}.Slice({iterationLocal});"); - writer.WriteLine($"pos += {iterationLocal};"); + if (!rtl) + { + writer.WriteLine($"{sliceSpan} = {sliceSpan}.Slice({iterationLocal});"); + writer.WriteLine($"pos += {iterationLocal};"); + } + else + { + writer.WriteLine($"pos -= {iterationLocal};"); + } } // Emits the code to handle a non-backtracking optional zero-or-one loop. @@ -3107,7 +3313,16 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) Debug.Assert(node.Kind is RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic, $"Unexpected type: {node.Kind}"); Debug.Assert(node.M == 0 && node.N == 1); - string expr = $"{sliceSpan}[{sliceStaticPos}]"; + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; + if (rtl) + { + TransferSliceStaticPosToPos(); // we don't use static pos for rtl + } + + string expr = !rtl ? 
+ $"{sliceSpan}[{sliceStaticPos}]" : + "inputSpan[pos - 1]"; + if (node.IsSetFamily) { expr = MatchCharacterClass(hasTextInfo, options, expr, node.Str!, IsCaseInsensitive(node), negate: false, additionalDeclarations, ref requiredHelpers); @@ -3118,11 +3333,22 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; } - string spaceAvailable = sliceStaticPos != 0 ? $"(uint){sliceSpan}.Length > (uint){sliceStaticPos}" : $"!{sliceSpan}.IsEmpty"; + string spaceAvailable = + rtl ? "pos > 0" : + sliceStaticPos != 0 ? $"(uint){sliceSpan}.Length > (uint){sliceStaticPos}" : + $"!{sliceSpan}.IsEmpty"; + using (EmitBlock(writer, $"if ({spaceAvailable} && {expr})")) { - writer.WriteLine($"{sliceSpan} = {sliceSpan}.Slice(1);"); - writer.WriteLine($"pos++;"); + if (!rtl) + { + writer.WriteLine($"{sliceSpan} = {sliceSpan}.Slice(1);"); + writer.WriteLine($"pos++;"); + } + else + { + writer.WriteLine($"pos--;"); + } } } @@ -3726,8 +3952,10 @@ private static string Literal(RegexOptions options) } /// Gets a textual description of the node fit for rendering in a comment in source. - private static string DescribeNode(RegexNode node, AnalysisResults analysis) => - node.Kind switch + private static string DescribeNode(RegexNode node, AnalysisResults analysis) + { + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; + return node.Kind switch { RegexNodeKind.Alternate => $"Match with {node.ChildCount()} alternative expressions{(analysis.IsAtomicByAncestor(node) ? ", atomically" : "")}.", RegexNodeKind.Atomic => $"Atomic group.", @@ -3744,18 +3972,18 @@ private static string DescribeNode(RegexNode node, AnalysisResults analysis) => RegexNodeKind.EndZ => "Match if at the end of the string or if before an ending newline.", RegexNodeKind.Eol => "Match if at the end of a line.", RegexNodeKind.Loop or RegexNodeKind.Lazyloop => node.M == 0 && node.N == 1 ? $"Optional ({(node.Kind is RegexNodeKind.Loop ? 
"greedy" : "lazy")})." : $"Loop {DescribeLoop(node, analysis)}.", - RegexNodeKind.Multi => $"Match the string {Literal(node.Str!)}.", + RegexNodeKind.Multi => $"Match the string {Literal(node.Str!)}{(rtl ? " backwards" : "")}.", RegexNodeKind.NonBoundary => $"Match if at anything other than a word boundary.", RegexNodeKind.NonECMABoundary => $"Match if at anything other than a word boundary (according to ECMAScript rules).", RegexNodeKind.Nothing => $"Fail to match.", - RegexNodeKind.Notone => $"Match any character other than {Literal(node.Ch)}.", + RegexNodeKind.Notone => $"Match any character other than {Literal(node.Ch)}{(rtl ? " backwards" : "")}.", RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy => $"Match a character other than {Literal(node.Ch)} {DescribeLoop(node, analysis)}.", - RegexNodeKind.One => $"Match {Literal(node.Ch)}.", + RegexNodeKind.One => $"Match {Literal(node.Ch)}{(rtl ? " backwards" : "")}.", RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy => $"Match {Literal(node.Ch)} {DescribeLoop(node, analysis)}.", - RegexNodeKind.NegativeLookaround => $"Zero-width negative lookahead assertion.", + RegexNodeKind.NegativeLookaround => $"Zero-width negative {(rtl ? "lookbehind" : "lookahead")}.", RegexNodeKind.Backreference => $"Match the same text as matched by the {DescribeCapture(node.M, analysis)}.", - RegexNodeKind.PositiveLookaround => $"Zero-width positive lookahead assertion.", - RegexNodeKind.Set => $"Match {DescribeSet(node.Str!)}.", + RegexNodeKind.PositiveLookaround => $"Zero-width positive {(rtl ? "lookbehind" : "lookahead")}.", + RegexNodeKind.Set => $"Match {DescribeSet(node.Str!)}{(rtl ? 
" backwards" : "")}.", RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy => $"Match {DescribeSet(node.Str!)} {DescribeLoop(node, analysis)}.", RegexNodeKind.Start => "Match if at the start position.", RegexNodeKind.ExpressionConditional => $"Conditionally match one of two expressions depending on whether an initial expression matches.", @@ -3763,6 +3991,7 @@ private static string DescribeNode(RegexNode node, AnalysisResults analysis) => RegexNodeKind.UpdateBumpalong => $"Advance the next matching position.", _ => $"Unknown node type {node.Kind}", }; + } /// Gets an identifer to describe a capture group. private static string DescribeCapture(int capNum, AnalysisResults analysis) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs index 6b601c8d859001..7c3121ee2e7d3e 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Parser.cs @@ -143,7 +143,7 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M return Diagnostic.Create(DiagnosticDescriptors.InvalidLangVersion, methodSyntax.GetLocation()); } - RegexOptions regexOptions = RegexOptions.Compiled | (options is not null ? (RegexOptions)options : RegexOptions.None); + RegexOptions regexOptions = options is not null ? (RegexOptions)options : RegexOptions.None; // TODO: This is going to include the culture that's current at the time of compilation. // What should we do about that? 
We could: @@ -181,7 +181,7 @@ private static bool IsSemanticTargetForGeneration(SemanticModel semanticModel, M RegexTree tree; try { - tree = RegexParser.Parse(pattern, regexOptions, culture); + tree = RegexParser.Parse(pattern, regexOptions | RegexOptions.Compiled, culture); // make sure Compiled is included to get all optimizations applied to it } catch (Exception e) { diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 670b76f8dd1b44..717923c8227857 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -208,6 +208,9 @@ private void Mvfldloc(FieldInfo ft, LocalBuilder lt) /// A macro for _ilg.Emit(OpCodes.Call, mt). protected void Call(MethodInfo mt) => _ilg!.Emit(OpCodes.Call, mt); + /// A macro for _ilg.Emit(OpCodes.Brfalse) (short jump). + private void Brfalse(Label l) => _ilg!.Emit(OpCodes.Brfalse_S, l); + /// A macro for _ilg.Emit(OpCodes.Brfalse) (long form). 
private void BrfalseFar(Label l) => _ilg!.Emit(OpCodes.Brfalse, l); @@ -370,6 +373,7 @@ protected void EmitTryFindNextPossibleStartingPosition() LocalBuilder inputSpan = DeclareReadOnlySpanChar(); LocalBuilder pos = DeclareInt32(); + bool rtl = (_options & RegexOptions.RightToLeft) != 0; _textInfo = null; if ((_options & RegexOptions.CultureInvariant) == 0) @@ -407,33 +411,47 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or Label returnFalse = DefineLabel(); Label finishedLengthCheck = DefineLabel(); - // if (pos > inputSpan.Length - _code.Tree.MinRequiredLength) + // if (pos > inputSpan.Length - minRequiredLength) // or pos < minRequiredLength for rtl // { - // base.runtextpos = inputSpan.Length; + // base.runtextpos = inputSpan.Length; // or 0 for rtl // return false; // } Ldloc(pos); - Ldloca(inputSpan); - Call(s_spanGetLengthMethod); - if (minRequiredLength > 0) + if (!rtl) + { + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + if (minRequiredLength > 0) + { + Ldc(minRequiredLength); + Sub(); + } + Ble(finishedLengthCheck); + } + else { Ldc(minRequiredLength); - Sub(); + Bge(finishedLengthCheck); } - Ble(finishedLengthCheck); MarkLabel(returnFalse); Ldthis(); - Ldloca(inputSpan); - Call(s_spanGetLengthMethod); - + if (!rtl) + { + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + } + else + { + Ldc(0); + } Stfld(s_runtextposField); Ldc(0); Ret(); MarkLabel(finishedLengthCheck); // Emit any anchors. 
- if (GenerateAnchors()) + if (EmitAnchors()) { return; } @@ -446,6 +464,11 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or EmitIndexOf_LeftToRight(_regexTree.FindOptimizations.LeadingCaseSensitivePrefix); break; + case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: + Debug.Assert(!string.IsNullOrEmpty(_regexTree.FindOptimizations.LeadingCaseSensitivePrefix)); + EmitIndexOf_RightToLeft(_regexTree.FindOptimizations.LeadingCaseSensitivePrefix); + break; + case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: @@ -454,6 +477,12 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or EmitFixedSet_LeftToRight(); break; + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseSensitive: + case FindNextStartingPositionMode.LeadingSet_RightToLeft_CaseInsensitive: + Debug.Assert(_regexTree.FindOptimizations.FixedDistanceSets is { Count: > 0 }); + EmitFixedSet_RightToLeft(); + break; + case FindNextStartingPositionMode.LiteralAfterLoop_LeftToRight_CaseSensitive: Debug.Assert(_regexTree.FindOptimizations.LiteralAfterLoop is not null); EmitLiteralAfterAtomicLoop(); @@ -472,7 +501,7 @@ FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive or // Emits any anchors. Returns true if the anchor roots any match to a specific location and thus no further // searching is required; otherwise, false. 
- bool GenerateAnchors() + bool EmitAnchors() { Label label; @@ -480,28 +509,29 @@ bool GenerateAnchors() switch (_regexTree.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: - label = DefineLabel(); + // if (pos != 0) goto returnFalse; + // return true; Ldloc(pos); Ldc(0); - Ble(label); - Br(returnFalse); - MarkLabel(label); + Bne(returnFalse); Ldc(1); Ret(); return true; case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: - label = DefineLabel(); + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: + // if (pos != base.runtextstart) goto returnFalse; + // return true; Ldloc(pos); Ldthisfld(s_runtextstartField); - Ble(label); - Br(returnFalse); - MarkLabel(label); + Bne(returnFalse); Ldc(1); Ret(); return true; case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ: + // if (pos < inputSpan.Length - 1) base.runtextpos = inputSpan.Length - 1; + // return true; label = DefineLabel(); Ldloc(pos); Ldloca(inputSpan); @@ -521,6 +551,8 @@ bool GenerateAnchors() return true; case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End: + // if (pos < inputSpan.Length) base.runtextpos = inputSpan.Length; + // return true; label = DefineLabel(); Ldloc(pos); Ldloca(inputSpan); @@ -535,6 +567,57 @@ bool GenerateAnchors() Ret(); return true; + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: + // if (pos != 0) base.runtextpos = 0; + // return true; + label = DefineLabel(); + Ldloc(pos); + Ldc(0); + Beq(label); + Ldthis(); + Ldc(0); + Stfld(s_runtextposField); + MarkLabel(label); + Ldc(1); + Ret(); + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: + // if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n')) goto returnFalse; + // return true; + label = DefineLabel(); + Ldloc(pos); + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + Ldc(1); + Sub(); + Blt(returnFalse); + Ldloc(pos); + 
Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + BgeUn(label); + Ldloca(inputSpan); + Ldloc(pos); + Call(s_spanGetItemMethod); + LdindU2(); + Ldc('\n'); + Bne(returnFalse); + MarkLabel(label); + Ldc(1); + Ret(); + return true; + + case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: + // if (pos < inputSpan.Length) goto returnFalse; + // return true; + Ldloc(pos); + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + Blt(returnFalse); + Ldc(1); + Ret(); + return true; + case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_End: case FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ: // Jump to the end, minus the min required length, which in this case is actually the fixed length. @@ -561,112 +644,115 @@ bool GenerateAnchors() } // Now handle anchors that boost the position but don't determine immediate success or failure. - - switch (_regexTree.FindOptimizations.LeadingAnchor) + if (!rtl) // we haven't done the work to validate these optimizations for RightToLeft { - case RegexNodeKind.Bol: - { - // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike - // other anchors like Beginning, there are potentially multiple places a BOL can match. So unlike - // the other anchors, which all skip all subsequent processing if found, with BOL we just use it - // to boost our position to the next line, and then continue normally with any prefix or char class searches. - - label = DefineLabel(); - - // if (pos > 0... - Ldloc(pos!); - Ldc(0); - Ble(label); + switch (_regexTree.FindOptimizations.LeadingAnchor) + { + case RegexNodeKind.Bol: + { + // Optimize the handling of a Beginning-Of-Line (BOL) anchor. BOL is special, in that unlike + // other anchors like Beginning, there are potentially multiple places a BOL can match. 
So unlike + // the other anchors, which all skip all subsequent processing if found, with BOL we just use it + // to boost our position to the next line, and then continue normally with any prefix or char class searches. - // ... && inputSpan[pos - 1] != '\n') { ... } - Ldloca(inputSpan); - Ldloc(pos); - Ldc(1); - Sub(); - Call(s_spanGetItemMethod); - LdindU2(); - Ldc('\n'); - Beq(label); + label = DefineLabel(); - // int tmp = inputSpan.Slice(pos).IndexOf('\n'); - Ldloca(inputSpan); - Ldloc(pos); - Call(s_spanSliceIntMethod); - Ldc('\n'); - Call(s_spanIndexOfChar); - using (RentedLocalBuilder newlinePos = RentInt32Local()) - { - Stloc(newlinePos); - - // if (newlinePos < 0 || newlinePos + pos + 1 > inputSpan.Length) - // { - // base.runtextpos = inputSpan.Length; - // return false; - // } - Ldloc(newlinePos); + // if (pos > 0... + Ldloc(pos!); Ldc(0); - Blt(returnFalse); - Ldloc(newlinePos); - Ldloc(pos); - Add(); - Ldc(1); - Add(); - Ldloca(inputSpan); - Call(s_spanGetLengthMethod); - Bgt(returnFalse); + Ble(label); - // pos += newlinePos + 1; + // ... && inputSpan[pos - 1] != '\n') { ... } + Ldloca(inputSpan); Ldloc(pos); - Ldloc(newlinePos); - Add(); Ldc(1); - Add(); - Stloc(pos); + Sub(); + Call(s_spanGetItemMethod); + LdindU2(); + Ldc('\n'); + Beq(label); - // We've updated the position. Make sure there's still enough room in the input for a possible match. 
- // if (pos > inputSpan.Length - minRequiredLength) returnFalse; + // int tmp = inputSpan.Slice(pos).IndexOf('\n'); Ldloca(inputSpan); - Call(s_spanGetLengthMethod); - if (minRequiredLength != 0) + Ldloc(pos); + Call(s_spanSliceIntMethod); + Ldc('\n'); + Call(s_spanIndexOfChar); + using (RentedLocalBuilder newlinePos = RentInt32Local()) { - Ldc(minRequiredLength); - Sub(); + Stloc(newlinePos); + + // if (newlinePos < 0 || newlinePos + pos + 1 > inputSpan.Length) + // { + // base.runtextpos = inputSpan.Length; + // return false; + // } + Ldloc(newlinePos); + Ldc(0); + Blt(returnFalse); + Ldloc(newlinePos); + Ldloc(pos); + Add(); + Ldc(1); + Add(); + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + Bgt(returnFalse); + + // pos += newlinePos + 1; + Ldloc(pos); + Ldloc(newlinePos); + Add(); + Ldc(1); + Add(); + Stloc(pos); + + // We've updated the position. Make sure there's still enough room in the input for a possible match. + // if (pos > inputSpan.Length - minRequiredLength) returnFalse; + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + if (minRequiredLength != 0) + { + Ldc(minRequiredLength); + Sub(); + } + Ldloc(pos); + BltFar(returnFalse); } - Ldloc(pos); - BltFar(returnFalse); - } - - MarkLabel(label); - } - break; - } - switch (_regexTree.FindOptimizations.TrailingAnchor) - { - case RegexNodeKind.End or RegexNodeKind.EndZ when _regexTree.FindOptimizations.MaxPossibleLength is int maxLength: - // Jump to the end, minus the max allowed length. - { - int extraNewlineBump = _regexTree.FindOptimizations.FindMode == FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ ? 
1 : 0; - label = DefineLabel(); - Ldloc(pos); - Ldloca(inputSpan); - Call(s_spanGetLengthMethod); - Ldc(maxLength + extraNewlineBump); - Sub(); - Bge(label); - Ldloca(inputSpan); - Call(s_spanGetLengthMethod); - Ldc(maxLength + extraNewlineBump); - Sub(); - Stloc(pos); - MarkLabel(label); + MarkLabel(label); + } break; - } + } + + switch (_regexTree.FindOptimizations.TrailingAnchor) + { + case RegexNodeKind.End or RegexNodeKind.EndZ when _regexTree.FindOptimizations.MaxPossibleLength is int maxLength: + // Jump to the end, minus the max allowed length. + { + int extraNewlineBump = _regexTree.FindOptimizations.FindMode == FindNextStartingPositionMode.TrailingAnchor_FixedLength_LeftToRight_EndZ ? 1 : 0; + label = DefineLabel(); + Ldloc(pos); + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + Ldc(maxLength + extraNewlineBump); + Sub(); + Bge(label); + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + Ldc(maxLength + extraNewlineBump); + Sub(); + Stloc(pos); + MarkLabel(label); + break; + } + } } return false; } + // Emits a case-sensitive prefix search for a string at the beginning of the pattern. void EmitIndexOf_LeftToRight(string prefix) { using RentedLocalBuilder i = RentInt32Local(); @@ -696,6 +782,37 @@ void EmitIndexOf_LeftToRight(string prefix) Ret(); } + // Emits a case-sensitive right-to-left prefix search for a string at the beginning of the pattern. 
+ void EmitIndexOf_RightToLeft(string prefix) + { + // pos = inputSpan.Slice(0, pos).LastIndexOf(prefix); + Ldloca(inputSpan); + Ldc(0); + Ldloc(pos); + Call(s_spanSliceIntIntMethod); + Ldstr(prefix); + Call(s_stringAsSpanMethod); + Call(s_spanLastIndexOfSpan); + Stloc(pos); + + // if (pos < 0) goto returnFalse; + Ldloc(pos); + Ldc(0); + BltFar(returnFalse); + + // base.runtextpos = pos + prefix.Length; + // return true; + Ldthis(); + Ldloc(pos); + Ldc(prefix.Length); + Add(); + Stfld(s_runtextposField); + Ldc(1); + Ret(); + } + + // Emits a search for a set at a fixed position from the start of the pattern, + // and potentially other sets at other fixed positions in the pattern. void EmitFixedSet_LeftToRight() { List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = _regexTree.FindOptimizations.FixedDistanceSets; @@ -891,6 +1008,74 @@ void EmitFixedSet_LeftToRight() } } + // Emits a right-to-left search for a set at a fixed position from the start of the pattern. + // (Currently that position will always be a distance of 0, meaning the start of the pattern itself.) + void EmitFixedSet_RightToLeft() + { + (char[]? 
Chars, string Set, int Distance, bool CaseInsensitive) set = _regexTree.FindOptimizations.FixedDistanceSets![0]; + Debug.Assert(set.Distance == 0); + + if (set.Chars is { Length: 1 } && !set.CaseInsensitive) + { + // pos = inputSpan.Slice(0, pos).LastIndexOf(set.Chars[0]); + Ldloca(inputSpan); + Ldc(0); + Ldloc(pos); + Call(s_spanSliceIntIntMethod); + Ldc(set.Chars[0]); + Call(s_spanLastIndexOfChar); + Stloc(pos); + + // if (pos < 0) goto returnFalse; + Ldloc(pos); + Ldc(0); + BltFar(returnFalse); + + // base.runtextpos = pos + 1; + // return true; + Ldthis(); + Ldloc(pos); + Ldc(1); + Add(); + Stfld(s_runtextposField); + Ldc(1); + Ret(); + } + else + { + Label condition = DefineLabel(); + + // while ((uint)--pos < (uint)inputSpan.Length) + MarkLabel(condition); + Ldloc(pos); + Ldc(1); + Sub(); + Stloc(pos); + Ldloc(pos); + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + BgeUnFar(returnFalse); + + // if (!MatchCharacterClass(inputSpan[i], set.Set, set.CaseInsensitive)) goto condition; + Ldloca(inputSpan); + Ldloc(pos); + Call(s_spanGetItemMethod); + LdindU2(); + EmitMatchCharacterClass(set.Set, set.CaseInsensitive); + Brfalse(condition); + + // base.runtextpos = pos + 1; + // return true; + Ldthis(); + Ldloc(pos); + Ldc(1); + Add(); + Stfld(s_runtextposField); + Ldc(1); + Ret(); + } + } + // Emits a search for a literal following a leading atomic single-character loop. void EmitLiteralAfterAtomicLoop() { @@ -1079,24 +1264,29 @@ protected void EmitTryMatchAtCurrentPosition() // base.Capture(0, base.runtextpos, base.runtextpos + node.Str.Length); // base.runtextpos = base.runtextpos + node.Str.Length; // return true; + int length = node.Kind == RegexNodeKind.Multi ? node.Str!.Length : 1; + if ((node.Options & RegexOptions.RightToLeft) != 0) + { + length = -length; + } Ldthis(); Dup(); Ldc(0); Ldthisfld(s_runtextposField); Dup(); - Ldc(node.Kind == RegexNodeKind.Multi ? 
node.Str!.Length : 1); + Ldc(length); Add(); Call(s_captureMethod); Ldthisfld(s_runtextposField); - Ldc(node.Kind == RegexNodeKind.Multi ? node.Str!.Length : 1); + Ldc(length); Add(); Stfld(s_runtextposField); Ldc(1); Ret(); return; - // The source generator special-cases RegexNode.Empty, for purposes of code learning rather than - // performance. Since that's not applicable to RegexCompiler, that code isn't mirrored here. + // The source generator special-cases RegexNode.Empty, for purposes of code learning rather than + // performance. Since that's not applicable to RegexCompiler, that code isn't mirrored here. } AnalysisResults analysis = RegexTreeAnalyzer.Analyze(_regexTree); @@ -1268,26 +1458,27 @@ void EmitTextSpanOffset() } } - // Adds the value of sliceStaticPos into the pos local, slices textspan by the corresponding amount, - // and zeros out sliceStaticPos. - void TransferSliceStaticPosToPos() + // Adds the value of sliceStaticPos into the pos local, zeros out sliceStaticPos, + // and resets slice to be inputSpan.Slice(pos). 
+ void TransferSliceStaticPosToPos(bool forceSliceReload = false) { if (sliceStaticPos > 0) { // pos += sliceStaticPos; + // sliceStaticPos = 0; Ldloc(pos); Ldc(sliceStaticPos); Add(); Stloc(pos); - - // slice = slice.Slice(sliceStaticPos); - Ldloca(slice); - Ldc(sliceStaticPos); - Call(s_spanSliceIntMethod); - Stloc(slice); - - // sliceStaticPos = 0; sliceStaticPos = 0; + + // slice = inputSpan.Slice(pos); + SliceInputSpan(); + } + else if (forceSliceReload) + { + // slice = inputSpan.Slice(pos); + SliceInputSpan(); } } @@ -1474,6 +1665,7 @@ void EmitBackreference(RegexNode node) Debug.Assert(node.Kind is RegexNodeKind.Backreference, $"Unexpected type: {node.Kind}"); int capnum = RegexParser.MapCaptureNumber(node.M, _regexTree!.CaptureNumberSparseMapping); + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; TransferSliceStaticPosToPos(); @@ -1495,9 +1687,17 @@ void EmitBackreference(RegexNode node) Call(s_matchLengthMethod); Stloc(matchLength); - // if (slice.Length < matchLength) goto doneLabel; - Ldloca(slice); - Call(s_spanGetLengthMethod); + if (!rtl) + { + // if (slice.Length < matchLength) goto doneLabel; + Ldloca(slice); + Call(s_spanGetLengthMethod); + } + else + { + // if (pos < matchLength) goto doneLabel; + Ldloc(pos); + } Ldloc(matchLength); BltFar(doneLabel); @@ -1517,7 +1717,7 @@ void EmitBackreference(RegexNode node) MarkLabel(body); - // if (inputSpan[matchIndex + i] != slice[i]) goto doneLabel; + // if (inputSpan[matchIndex + i] != slice[i]) goto doneLabel; // for rtl, instead of slice[i] using inputSpan[pos - matchLength + i] Ldloca(inputSpan); Ldloc(matchIndex); Ldloc(i); @@ -1528,8 +1728,20 @@ void EmitBackreference(RegexNode node) { CallToLower(); } - Ldloca(slice); - Ldloc(i); + if (!rtl) + { + Ldloca(slice); + Ldloc(i); + } + else + { + Ldloca(inputSpan); + Ldloc(pos); + Ldloc(matchLength); + Sub(); + Ldloc(i); + Add(); + } Call(s_spanGetItemMethod); LdindU2(); if (IsCaseInsensitive(node)) @@ -1550,12 +1762,23 @@ void 
EmitBackreference(RegexNode node) Ldloc(matchLength); Blt(body); - // pos += matchLength; + // pos += matchLength; // or -= for rtl Ldloc(pos); Ldloc(matchLength); - Add(); + if (!rtl) + { + Add(); + } + else + { + Sub(); + } Stloc(pos); - SliceInputSpan(); + + if (!rtl) + { + SliceInputSpan(); + } MarkLabel(backreferenceEnd); } @@ -1719,7 +1942,7 @@ void EmitExpressionConditional(RegexNode node) // The first child node is the condition expression. If this matches, then we branch to the "yes" branch. // If it doesn't match, then we branch to the optional "no" branch if it exists, or simply skip the "yes" - // branch, otherwise. The condition is treated as a positive lookahead. + // branch, otherwise. The condition is treated as a positive lookaround. RegexNode condition = node.Child(0); // Get the "yes" branch and the "no" branch. The "no" branch is optional in syntax and is thus @@ -1753,12 +1976,12 @@ void EmitExpressionConditional(RegexNode node) } // Emit the condition expression. Route any failures to after the yes branch. This code is almost - // the same as for a positive lookahead; however, a positive lookahead only needs to reset the position + // the same as for a positive lookaround; however, a positive lookaround only needs to reset the position // on a successful match, as a failed match fails the whole expression; here, we need to reset the // position on completion, regardless of whether the match is successful or not. doneLabel = expressionNotMatched; - // Save off pos. We'll need to reset this upon successful completion of the lookahead. + // Save off pos. We'll need to reset this upon successful completion of the lookaround. // startingPos = pos; LocalBuilder startingPos = DeclareInt32(); Ldloc(pos); @@ -1771,7 +1994,7 @@ void EmitExpressionConditional(RegexNode node) doneLabel = originalDoneLabel; // After the condition completes successfully, reset the text positions. - // Do not reset captures, which persist beyond the lookahead. 
+ // Do not reset captures, which persist beyond the lookaround. // pos = startingPos; // slice = inputSpan.Slice(pos); Ldloc(startingPos); @@ -2012,13 +2235,25 @@ void EmitUncaptureUntil(LocalBuilder startingCapturePos) Bgt(body); } - // Emits the code to handle a positive lookahead assertion. - void EmitPositiveLookaheadAssertion(RegexNode node) + // Emits the code to handle a positive lookaround assertion. This is a positive lookahead + // for left-to-right and a positive lookbehind for right-to-left. + void EmitPositiveLookaroundAssertion(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.PositiveLookaround, $"Unexpected type: {node.Kind}"); Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); - // Save off pos. We'll need to reset this upon successful completion of the lookahead. + if (analysis.HasRightToLeft) + { + // Lookarounds are the only places in the node tree where we might change direction, + // i.e. where we might go from RegexOptions.None to RegexOptions.RightToLeft, or vice + // versa. This is because lookbehinds are implemented by making the whole subgraph be + // RegexOptions.RightToLeft and reversed. Since we use static position to optimize left-to-right + // and don't use it in support of right-to-left, we need to resync the static position + // to the current position when entering a lookaround, just in case we're changing direction. + TransferSliceStaticPosToPos(forceSliceReload: true); + } + + // Save off pos. We'll need to reset this upon successful completion of the lookaround. // startingPos = pos; LocalBuilder startingPos = DeclareInt32(); Ldloc(pos); @@ -2038,7 +2273,7 @@ void EmitPositiveLookaheadAssertion(RegexNode node) } // After the child completes successfully, reset the text positions. - // Do not reset captures, which persist beyond the lookahead. + // Do not reset captures, which persist beyond the lookaround. 
// pos = startingPos; // slice = inputSpan.Slice(pos); Ldloc(startingPos); @@ -2047,15 +2282,27 @@ void EmitPositiveLookaheadAssertion(RegexNode node) sliceStaticPos = startingTextSpanPos; } - // Emits the code to handle a negative lookahead assertion. - void EmitNegativeLookaheadAssertion(RegexNode node) + // Emits the code to handle a negative lookaround assertion. This is a negative lookahead + // for left-to-right and a negative lookbehind for right-to-left. + void EmitNegativeLookaroundAssertion(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.NegativeLookaround, $"Unexpected type: {node.Kind}"); Debug.Assert(node.ChildCount() == 1, $"Expected 1 child, found {node.ChildCount()}"); + if (analysis.HasRightToLeft) + { + // Lookarounds are the only places in the node tree where we might change direction, + // i.e. where we might go from RegexOptions.None to RegexOptions.RightToLeft, or vice + // versa. This is because lookbehinds are implemented by making the whole subgraph be + // RegexOptions.RightToLeft and reversed. Since we use static position to optimize left-to-right + // and don't use it in support of right-to-left, we need to resync the static position + // to the current position when entering a lookaround, just in case we're changing direction. + TransferSliceStaticPosToPos(forceSliceReload: true); + } + Label originalDoneLabel = doneLabel; - // Save off pos. We'll need to reset this upon successful completion of the lookahead. + // Save off pos. We'll need to reset this upon successful completion of the lookaround. // startingPos = pos; LocalBuilder startingPos = DeclareInt32(); Ldloc(pos); @@ -2077,19 +2324,19 @@ void EmitNegativeLookaheadAssertion(RegexNode node) EmitNode(child); } - // If the generated code ends up here, it matched the lookahead, which actually - // means failure for a _negative_ lookahead, so we need to jump to the original done. 
+ // If the generated code ends up here, it matched the lookaround, which actually + // means failure for a _negative_ lookaround, so we need to jump to the original done. // goto originalDoneLabel; BrFar(originalDoneLabel); - // Failures (success for a negative lookahead) jump here. + // Failures (success for a negative lookaround) jump here. MarkLabel(negativeLookaheadDoneLabel); if (doneLabel == negativeLookaheadDoneLabel) { doneLabel = originalDoneLabel; } - // After the child completes in failure (success for negative lookahead), reset the text positions. + // After the child completes in failure (success for negative lookaround), reset the text positions. // pos = startingPos; Ldloc(startingPos); Stloc(pos); @@ -2108,7 +2355,17 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck return; } - switch (node.Kind) + // RightToLeft doesn't take advantage of static positions. While RightToLeft won't update static + // positions, a previous operation may have left us with a non-zero one. Make sure it's zero'd out + // such that pos and slice are up-to-date. Note that RightToLeft also shouldn't use the slice span, + // as it's not kept up-to-date; any RightToLeft implementation that wants to use it must first update + // it from pos. + if ((node.Options & RegexOptions.RightToLeft) != 0) + { + TransferSliceStaticPosToPos(); + } + + switch (node.Kind) { case RegexNodeKind.Beginning: case RegexNodeKind.Start: @@ -2191,11 +2448,11 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck break; case RegexNodeKind.PositiveLookaround: - EmitPositiveLookaheadAssertion(node); + EmitPositiveLookaroundAssertion(node); break; case RegexNodeKind.NegativeLookaround: - EmitNegativeLookaheadAssertion(node); + EmitNegativeLookaroundAssertion(node); break; case RegexNodeKind.Nothing: @@ -2284,13 +2541,15 @@ void EmitConcatenation(RegexNode node, RegexNode? 
subsequent, bool emitLengthChe Debug.Assert(node.Kind is RegexNodeKind.Concatenate, $"Unexpected type: {node.Kind}"); Debug.Assert(node.ChildCount() >= 2, $"Expected at least 2 children, found {node.ChildCount()}"); + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; + // Emit the code for each child one after the other. int childCount = node.ChildCount(); for (int i = 0; i < childCount; i++) { // If we can find a subsequence of fixed-length children, we can emit a length check once for that sequence // and then skip the individual length checks for each. - if (emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) + if (!rtl && emitLengthChecksIfRequired && node.TryGetJoinableLengthCheckChildRange(i, out int requiredLength, out int exclusiveEnd)) { EmitSpanLengthCheck(requiredLength); for (; i < exclusiveEnd; i++) @@ -2327,19 +2586,44 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? o { Debug.Assert(node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily, $"Unexpected type: {node.Kind}"); - // This only emits a single check, but it's called from the looping constructs in a loop - // to generate the code for a single check, so we check for each "family" (one, notone, set) - // rather than only for the specific single character nodes. 
+ bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; + Debug.Assert(!rtl || offset is null); - // if ((uint)(sliceStaticPos + offset) >= slice.Length || slice[sliceStaticPos + offset] != ch) goto Done; if (emitLengthCheck) { - EmitSpanLengthCheck(1, offset); + if (!rtl) + { + // if ((uint)(sliceStaticPos + offset) >= slice.Length) goto Done; + EmitSpanLengthCheck(1, offset); + } + else + { + // if ((uint)(pos - 1) >= inputSpan.Length) goto Done; + Ldloc(pos); + Ldc(1); + Sub(); + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + BgeUnFar(doneLabel); + } + } + + if (!rtl) + { + // slice[staticPos + offset] + Ldloca(slice); + EmitSum(sliceStaticPos, offset); + } + else + { + // inputSpan[pos - 1] + Ldloca(inputSpan); + EmitSum(-1, pos); } - Ldloca(slice); - EmitSum(sliceStaticPos, offset); Call(s_spanGetItemMethod); LdindU2(); + + // if (loadedChar != ch) goto doneLabel; if (node.IsSetFamily) { EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); @@ -2362,7 +2646,18 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? o } } - sliceStaticPos++; + if (!rtl) + { + sliceStaticPos++; + } + else + { + // pos--; + Ldloc(pos); + Ldc(1); + Sub(); + Stloc(pos); + } } // Emits the code to handle a boundary check on a character. @@ -2370,6 +2665,12 @@ void EmitBoundary(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary or RegexNodeKind.ECMABoundary or RegexNodeKind.NonECMABoundary, $"Unexpected type: {node.Kind}"); + if ((node.Options & RegexOptions.RightToLeft) != 0) + { + // RightToLeft doesn't use static position. This ensures it's 0. 
+ TransferSliceStaticPosToPos(); + } + // if (!IsBoundary(inputSpan, pos + sliceStaticPos)) goto doneLabel; Ldthis(); Ldloc(inputSpan); @@ -2408,6 +2709,8 @@ void EmitBoundary(RegexNode node) void EmitAnchors(RegexNode node) { Debug.Assert(node.Kind is RegexNodeKind.Beginning or RegexNodeKind.Start or RegexNodeKind.Bol or RegexNodeKind.End or RegexNodeKind.EndZ or RegexNodeKind.Eol, $"Unexpected type: {node.Kind}"); + Debug.Assert((node.Options & RegexOptions.RightToLeft) == 0 || sliceStaticPos == 0); + Debug.Assert(sliceStaticPos >= 0); Debug.Assert(sliceStaticPos >= 0); switch (node.Kind) @@ -2449,8 +2752,8 @@ void EmitAnchors(RegexNode node) } else { - // We can't use our slice in this case, because we'd need to access slice[-1], so we access the runtext field directly: - // if (pos > 0 && base.runtext[pos - 1] != '\n') goto doneLabel; + // We can't use our slice in this case, because we'd need to access slice[-1], so we access the inputSpan directly: + // if (pos > 0 && inputSpan[pos - 1] != '\n') goto doneLabel; Label success = DefineLabel(); Ldloc(pos); Ldc(0); @@ -2468,17 +2771,35 @@ void EmitAnchors(RegexNode node) break; case RegexNodeKind.End: - // if (sliceStaticPos < slice.Length) goto doneLabel; - Ldc(sliceStaticPos); - Ldloca(slice); + if (sliceStaticPos > 0) + { + // if (sliceStaticPos < slice.Length) goto doneLabel; + Ldc(sliceStaticPos); + Ldloca(slice); + } + else + { + // if (pos < inputSpan.Length) goto doneLabel; + Ldloc(pos); + Ldloca(inputSpan); + } Call(s_spanGetLengthMethod); BltUnFar(doneLabel); break; case RegexNodeKind.EndZ: - // if (sliceStaticPos < slice.Length - 1) goto doneLabel; - Ldc(sliceStaticPos); - Ldloca(slice); + if (sliceStaticPos > 0) + { + // if (sliceStaticPos < slice.Length - 1) goto doneLabel; + Ldc(sliceStaticPos); + Ldloca(slice); + } + else + { + // if (pos < inputSpan.Length - 1) goto doneLabel + Ldloc(pos); + Ldloca(inputSpan); + } Call(s_spanGetLengthMethod); Ldc(1); Sub(); @@ -2486,8 +2807,9 @@ void 
EmitAnchors(RegexNode node) goto case RegexNodeKind.Eol; case RegexNodeKind.Eol: - // if (sliceStaticPos < slice.Length && slice[sliceStaticPos] != '\n') goto doneLabel; + if (sliceStaticPos > 0) { + // if (sliceStaticPos < slice.Length && slice[sliceStaticPos] != '\n') goto doneLabel; Label success = DefineLabel(); Ldc(sliceStaticPos); Ldloca(slice); @@ -2501,6 +2823,22 @@ void EmitAnchors(RegexNode node) BneFar(doneLabel); MarkLabel(success); } + else + { + // if ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n') goto doneLabel; + Label success = DefineLabel(); + Ldloc(pos); + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + BgeUn(success); + Ldloca(inputSpan); + Ldloc(pos); + Call(s_spanGetItemMethod); + LdindU2(); + Ldc('\n'); + BneFar(doneLabel); + MarkLabel(success); + } break; } } @@ -2509,13 +2847,48 @@ void EmitAnchors(RegexNode node) void EmitMultiChar(RegexNode node, bool emitLengthCheck) { Debug.Assert(node.Kind is RegexNodeKind.Multi, $"Unexpected type: {node.Kind}"); - EmitMultiCharString(node.Str!, IsCaseInsensitive(node), emitLengthCheck); + EmitMultiCharString(node.Str!, IsCaseInsensitive(node), emitLengthCheck, (node.Options & RegexOptions.RightToLeft) != 0); } - void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck) + void EmitMultiCharString(string str, bool caseInsensitive, bool emitLengthCheck, bool rightToLeft) { Debug.Assert(str.Length >= 2); + if (rightToLeft) + { + Debug.Assert(emitLengthCheck); + TransferSliceStaticPosToPos(); + + // if ((uint)(pos - str.Length) >= inputSpan.Length) goto doneLabel; + Ldloc(pos); + Ldc(str.Length); + Sub(); + Ldloca(inputSpan); + Call(s_spanGetLengthMethod); + BgeUnFar(doneLabel); + + for (int i = str.Length - 1; i >= 0; i--) + { + // if (inputSpan[--pos] != str[str.Length - 1 - i]) goto doneLabel + Ldloc(pos); + Ldc(1); + Sub(); + Stloc(pos); + Ldloca(inputSpan); + Ldloc(pos); + Call(s_spanGetItemMethod); + LdindU2(); + if (caseInsensitive) + { + CallToLower(); + } 
+ Ldc(str[i]); + BneFar(doneLabel); + } + + return; + } + if (caseInsensitive) // StartsWith(..., XxIgnoreCase) won't necessarily be the same as char-by-char comparison { // This case should be relatively rare. It will only occur with IgnoreCase and a series of non-ASCII characters. @@ -2579,6 +2952,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL LocalBuilder startingPos = DeclareInt32(); LocalBuilder endingPos = DeclareInt32(); LocalBuilder? capturepos = expressionHasCaptures ? DeclareInt32() : null; + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; // We're about to enter a loop, so ensure our text position is 0. TransferSliceStaticPosToPos(); @@ -2595,7 +2969,6 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL EmitSingleCharAtomicLoop(node); - // pos += sliceStaticPos; // int endingPos = pos; TransferSliceStaticPosToPos(); Ldloc(pos); @@ -2609,11 +2982,11 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL Stloc(capturepos); } - // startingPos += node.M; + // startingPos += node.M; // or -= for rtl if (node.M > 0) { Ldloc(startingPos); - Ldc(node.M); + Ldc(!rtl ? node.M : -node.M); Add(); Stloc(startingPos); } @@ -2642,12 +3015,19 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? 
subsequent = null, bool emitL EmitStackPop(); Stloc(startingPos); - // if (startingPos >= endingPos) goto doneLabel; + // if (startingPos >= endingPos) goto doneLabel; // or <= for rtl Ldloc(startingPos); Ldloc(endingPos); - BgeFar(doneLabel); + if (!rtl) + { + BgeFar(doneLabel); + } + else + { + BleFar(doneLabel); + } - if (subsequent?.FindStartingLiteral() is ValueTuple literal) + if (!rtl && subsequent?.FindStartingLiteral() is ValueTuple literal) { // endingPos = inputSpan.Slice(startingPos, Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos).LastIndexOf(literal); // if (endingPos < 0) @@ -2720,9 +3100,9 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL } else { - // endingPos--; + // endingPos--; // or ++ for rtl Ldloc(endingPos); - Ldc(1); + Ldc(!rtl ? 1 : -1); Sub(); Stloc(endingPos); } @@ -2731,8 +3111,11 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL Ldloc(endingPos); Stloc(pos); - // slice = inputSpan.Slice(pos); - SliceInputSpan(); + if (!rtl) + { + // slice = inputSpan.Slice(pos); + SliceInputSpan(); + } MarkLabel(endLoop); EmitStackResizeIfNeeded(expressionHasCaptures ? 3 : 2); @@ -2766,6 +3149,7 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL } Debug.Assert(node.M < node.N); + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; // We now need to match one character at a time, each time allowing the remainder of the expression // to try to match, and only matching another character if the subsequent expression fails to match. @@ -2843,7 +3227,8 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL // Now that we've appropriately advanced by one character and are set for what comes after the loop, // see if we can skip ahead more iterations by doing a search for a following literal. 
- if (iterationCount is null && + if (!rtl && + iterationCount is null && node.Kind is RegexNodeKind.Notonelazy && !IsCaseInsensitive(node) && subsequent?.FindStartingLiteral(4) is ValueTuple literal && // 5 == max optimized by IndexOfAny, and we need to reserve 1 for node.Ch @@ -2904,7 +3289,8 @@ node.Kind is RegexNodeKind.Notonelazy && Stloc(pos); SliceInputSpan(); } - else if (iterationCount is null && + else if (!rtl && + iterationCount is null && node.Kind is RegexNodeKind.Setlazy && node.Str == RegexCharClass.AnyClass && subsequent?.FindStartingLiteral() is ValueTuple literal2) @@ -3293,6 +3679,8 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthChecksIfRequired = tr Debug.Assert(node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily, $"Unexpected type: {node.Kind}"); int iterations = node.M; + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; + switch (iterations) { case 0: @@ -3307,10 +3695,43 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthChecksIfRequired = tr case <= RegexNode.MultiVsRepeaterLimit when node.IsOneFamily && !IsCaseInsensitive(node): // This is a repeated case-sensitive character; emit it as a multi in order to get all the optimizations // afforded to a multi, e.g. unrolling the loop with multi-char reads/comparisons at a time. - EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthChecksIfRequired); + EmitMultiCharString(new string(node.Ch, iterations), caseInsensitive: false, emitLengthChecksIfRequired, rtl); return; } + if (rtl) + { + TransferSliceStaticPosToPos(); // we don't use static position with rtl + Label conditionLabel = DefineLabel(); + Label bodyLabel = DefineLabel(); + + // for (int i = 0; ...) 
+ using RentedLocalBuilder iterationLocal = RentInt32Local(); + Ldc(0); + Stloc(iterationLocal); + BrFar(conditionLabel); + + // TimeoutCheck(); + // HandleSingleChar(); + MarkLabel(bodyLabel); + EmitTimeoutCheck(); + EmitSingleChar(node); + + // for (...; ...; i++) + Ldloc(iterationLocal); + Ldc(1); + Add(); + Stloc(iterationLocal); + + // for (...; i < iterations; ...) + MarkLabel(conditionLabel); + Ldloc(iterationLocal); + Ldc(iterations); + BltFar(bodyLabel); + + return; + } + // if ((uint)(sliceStaticPos + iterations - 1) >= (uint)slice.Length) goto doneLabel; if (emitLengthChecksIfRequired) { @@ -3405,6 +3826,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) Debug.Assert(node.N > node.M); int minIterations = node.M; int maxIterations = node.N; + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; using RentedLocalBuilder iterationLocal = RentInt32Local(); @@ -3413,7 +3835,79 @@ void EmitSingleCharAtomicLoop(RegexNode node) Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today int numSetChars = 0; - if (node.IsNotoneFamily && + if (rtl) + { + TransferSliceStaticPosToPos(); // we don't use static position for rtl + + Label conditionLabel = DefineLabel(); + Label bodyLabel = DefineLabel(); + + // int i = 0; + Ldc(0); + Stloc(iterationLocal); + BrFar(conditionLabel); + + // Body: + // TimeoutCheck(); + MarkLabel(bodyLabel); + EmitTimeoutCheck(); + + // if (pos <= iterationLocal) goto atomicLoopDoneLabel; + Ldloc(pos); + Ldloc(iterationLocal); + BleFar(atomicLoopDoneLabel); + + // if (inputSpan[pos - i - 1] != ch) goto atomicLoopDoneLabel; + Ldloca(inputSpan); + Ldloc(pos); + Ldloc(iterationLocal); + Sub(); + Ldc(1); + Sub(); + Call(s_spanGetItemMethod); + LdindU2(); + if (node.IsSetFamily) + { + EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node)); + BrfalseFar(atomicLoopDoneLabel); + } + else + { + if (IsCaseInsensitive(node)) + { + CallToLower(); + } + Ldc(node.Ch); + if (node.IsOneFamily) + { + BneFar(atomicLoopDoneLabel); + } 
+ else // IsNotoneFamily + { + BeqFar(atomicLoopDoneLabel); + } + } + + // i++; + Ldloc(iterationLocal); + Ldc(1); + Add(); + Stloc(iterationLocal); + + // if (i >= maxIterations) goto atomicLoopDoneLabel; + MarkLabel(conditionLabel); + if (maxIterations != int.MaxValue) + { + Ldloc(iterationLocal); + Ldc(maxIterations); + BltFar(bodyLabel); + } + else + { + BrFar(bodyLabel); + } + } + else if (node.IsNotoneFamily && maxIterations == int.MaxValue && (!IsCaseInsensitive(node))) { @@ -3613,17 +4107,28 @@ void EmitSingleCharAtomicLoop(RegexNode node) // Now that we've completed our optional iterations, advance the text span // and pos by the number of iterations completed. - // slice = slice.Slice(i); - Ldloca(slice); - Ldloc(iterationLocal); - Call(s_spanSliceIntMethod); - Stloc(slice); + if (!rtl) + { + // slice = slice.Slice(i); + Ldloca(slice); + Ldloc(iterationLocal); + Call(s_spanSliceIntMethod); + Stloc(slice); - // pos += i; - Ldloc(pos); - Ldloc(iterationLocal); - Add(); - Stloc(pos); + // pos += i; + Ldloc(pos); + Ldloc(iterationLocal); + Add(); + Stloc(pos); + } + else + { + // pos -= i; + Ldloc(pos); + Ldloc(iterationLocal); + Sub(); + Stloc(pos); + } } // Emits the code to handle a non-backtracking optional zero-or-one loop. 
@@ -3632,17 +4137,44 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) Debug.Assert(node.Kind is RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic, $"Unexpected type: {node.Kind}"); Debug.Assert(node.M == 0 && node.N == 1); + bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; + if (rtl) + { + TransferSliceStaticPosToPos(); // we don't use static pos for rtl + } + Label skipUpdatesLabel = DefineLabel(); - // if ((uint)sliceStaticPos >= (uint)slice.Length) goto skipUpdatesLabel; - Ldc(sliceStaticPos); - Ldloca(slice); - Call(s_spanGetLengthMethod); - BgeUnFar(skipUpdatesLabel); + if (!rtl) + { + // if ((uint)sliceStaticPos >= (uint)slice.Length) goto skipUpdatesLabel; + Ldc(sliceStaticPos); + Ldloca(slice); + Call(s_spanGetLengthMethod); + BgeUnFar(skipUpdatesLabel); + } + else + { + // if (pos == 0) goto skipUpdatesLabel; + Ldloc(pos); + Ldc(0); + BeqFar(skipUpdatesLabel); + } - // if (slice[sliceStaticPos] != ch) goto skipUpdatesLabel; - Ldloca(slice); - Ldc(sliceStaticPos); + if (!rtl) + { + // if (slice[sliceStaticPos] != ch) goto skipUpdatesLabel; + Ldloca(slice); + Ldc(sliceStaticPos); + } + else + { + // if (inputSpan[pos - 1] != ch) goto skipUpdatesLabel; + Ldloca(inputSpan); + Ldloc(pos); + Ldc(1); + Sub(); + } Call(s_spanGetItemMethod); LdindU2(); if (node.IsSetFamily) @@ -3667,17 +4199,28 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) } } - // slice = slice.Slice(1); - Ldloca(slice); - Ldc(1); - Call(s_spanSliceIntMethod); - Stloc(slice); + if (!rtl) + { + // slice = slice.Slice(1); + Ldloca(slice); + Ldc(1); + Call(s_spanSliceIntMethod); + Stloc(slice); - // pos++; - Ldloc(pos); - Ldc(1); - Add(); - Stloc(pos); + // pos++; + Ldloc(pos); + Ldc(1); + Add(); + Stloc(pos); + } + else + { + // pos--; + Ldloc(pos); + Ldc(1); + Sub(); + Stloc(pos); + } MarkLabel(skipUpdatesLabel); } @@ -4026,8 +4569,9 @@ void EmitStackPop() 
} } - protected void EmitScan(DynamicMethod tryFindNextStartingPositionMethod, DynamicMethod tryMatchAtCurrentPositionMethod) + protected void EmitScan(RegexOptions options, DynamicMethod tryFindNextStartingPositionMethod, DynamicMethod tryMatchAtCurrentPositionMethod) { + bool rtl = (options & RegexOptions.RightToLeft) != 0; Label returnLabel = DefineLabel(); // while (TryFindNextPossibleStartingPosition(text)) @@ -4045,22 +4589,29 @@ protected void EmitScan(DynamicMethod tryFindNextStartingPositionMethod, Dynamic Call(s_checkTimeoutMethod); } - // if (TryMatchAtCurrentPosition(text) || runtextpos == text.length) + // if (TryMatchAtCurrentPosition(text) || runtextpos == text.length) // or == 0 for rtl // return; Ldthis(); Ldarg_1(); Call(tryMatchAtCurrentPositionMethod); BrtrueFar(returnLabel); Ldthisfld(s_runtextposField); - Ldarga_s(1); - Call(s_spanGetLengthMethod); + if (!rtl) + { + Ldarga_s(1); + Call(s_spanGetLengthMethod); + } + else + { + Ldc(0); + } Ceq(); BrtrueFar(returnLabel); - // runtextpos += 1 + // runtextpos += 1 // or -1 for rtl Ldthis(); Ldthisfld(s_runtextposField); - Ldc(1); + Ldc(!rtl ? 1 : -1); Add(); Stfld(s_runtextposField); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index fafa72042dede2..6a0b41a6c6cf35 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -50,7 +50,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options, CultureInfo // Compute any anchor trailing the expression. If there is one, and we can also compute a fixed length // for the whole expression, we can use that to quickly jump to the right location in the input. 
- if (!_rightToLeft) // haven't added FindNextStartingPositionMode support for RTL + if (!_rightToLeft) // haven't added FindNextStartingPositionMode trailing anchor support for RTL { bool triedToComputeMaxLength = false; @@ -297,16 +297,18 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos // For others, we can jump to the relevant location. case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning: - if (pos > 0) + if (pos != 0) { + // If we're not currently at the beginning, we'll never be, so fail immediately. pos = textSpan.Length; return false; } return true; case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Start: - if (pos > start) + if (pos != start) { + // If we're not currently at the start, we'll never be, so fail immediately. pos = textSpan.Length; return false; } @@ -315,6 +317,8 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_EndZ: if (pos < textSpan.Length - 1) { + // If we're not currently at the end (or a newline just before it), skip ahead + // since nothing until then can possibly match. pos = textSpan.Length - 1; } return true; @@ -322,20 +326,29 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos case FindNextStartingPositionMode.LeadingAnchor_LeftToRight_End: if (pos < textSpan.Length) { + // If we're not currently at the very end of the input, skip ahead + // since nothing until then can possibly match. pos = textSpan.Length; } return true; case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning: - if (pos > 0) + if (pos != 0) { + // If we're not currently at the beginning, skip ahead (or, rather, backwards) + // since nothing until then can possibly match. (We're iterating from the end + // to the beginning in RightToLeft mode.)
pos = 0; } return true; case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Start: - if (pos < start) + if (pos != start) { + // If we're not currently at the starting position, we'll never be, so fail immediately. + // This is different from beginning, since beginning is the fixed location of 0 whereas + // start is wherever the iteration actually starts from; in left-to-right, that's often + // the same as beginning, but in RightToLeft it rarely is. pos = 0; return false; } @@ -344,6 +357,8 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: if (pos < textSpan.Length - 1 || ((uint)pos < (uint)textSpan.Length && textSpan[pos] != '\n')) { + // If we're not currently at the end, we'll never be (we're iterating from end to beginning), + // so fail immediately. pos = 0; return false; } @@ -352,6 +367,8 @@ public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_End: if (pos < textSpan.Length) { + // If we're not currently at the end, we'll never be (we're iterating from end to beginning), + // so fail immediately. 
pos = 0; return false; } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs index 145d708d0c08a7..0a1ab218062634 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexLWCGCompiler.cs @@ -59,7 +59,7 @@ internal sealed class RegexLWCGCompiler : RegexCompiler EmitTryMatchAtCurrentPosition(); DynamicMethod scanMethod = DefineDynamicMethod($"Regex{regexNum}_Scan{description}", null, typeof(CompiledRegexRunner), new[] { typeof(RegexRunner), typeof(ReadOnlySpan) }); - EmitScan(tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod); + EmitScan(options, tryfindNextPossibleStartPositionMethod, tryMatchAtCurrentPositionMethod); return new CompiledRegexRunnerFactory(scanMethod); } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 70d36507af96a2..3a0ead32a35d5b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -2558,39 +2558,46 @@ public int ChildCount() // there's no need to localize). internal bool SupportsCompilation([NotNullWhen(false)] out string? reason) { - if (!StackHelper.TryEnsureSufficientExecutionStack()) - { - reason = "run-time limits were exceeded"; - return false; - } - - // NonBacktracking isn't supported, nor RightToLeft. The latter applies to both the top-level - // options as well as when used to specify positive and negative lookbehinds. 
if ((Options & RegexOptions.NonBacktracking) != 0) { - reason = "RegexOptions.NonBacktracking was specified"; + reason = "RegexOptions.NonBacktracking isn't supported"; return false; } - if ((Options & RegexOptions.RightToLeft) != 0) + if (ExceedsMaxDepthAllowedDepth(this, allowedDepth: 40)) { - reason = "RegexOptions.RightToLeft or a positive/negative lookbehind was used"; + // For the source generator, deep RegexNode trees can result in emitting C# code that exceeds C# compiler + // limitations, leading to "CS8078: An expression is too long or complex to compile". As such, we place + // an artificial limit on max tree depth in order to mitigate such issues. The allowed depth can be tweaked + // as needed; it's exceedingly rare to find expressions with such deep trees. And while RegexCompiler doesn't + // have to deal with C# compiler limitations, we still want to limit max tree depth as we want to limit + // how deep recursion we'll employ as part of code generation. + reason = "the expression may result exceeding run-time or compiler limits"; return false; } - int childCount = ChildCount(); - for (int i = 0; i < childCount; i++) + // Supported. + reason = null; + return true; + + static bool ExceedsMaxDepthAllowedDepth(RegexNode node, int allowedDepth) { - // The node isn't supported if any of its children aren't supported. - if (!Child(i).SupportsCompilation(out reason)) + if (allowedDepth <= 0) { - return false; + return true; } - } - // Supported. - reason = null; - return true; + int childCount = node.ChildCount(); + for (int i = 0; i < childCount; i++) + { + if (ExceedsMaxDepthAllowedDepth(node.Child(i), allowedDepth - 1)) + { + return true; + } + } + + return false; + } } /// Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs index 4a7db7591490fc..51c16ba651e60f 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexTreeAnalyzer.cs @@ -23,8 +23,9 @@ static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByA return false; } - // Track whether we've seen any node with IgnoreCase set. + // Track whether we've seen any nodes with various options set. results._hasIgnoreCase |= (node.Options & RegexOptions.IgnoreCase) != 0; + results._hasRightToLeft |= (node.Options & RegexOptions.RightToLeft) != 0; if (isAtomicByAncestor) { @@ -149,6 +150,8 @@ internal sealed class AnalysisResults internal HashSet? _mayBacktrack; /// Whether any node has set. internal bool _hasIgnoreCase; + /// Whether any node has set. + internal bool _hasRightToLeft; /// Initializes the instance. /// The code being analyzed. @@ -158,9 +161,19 @@ internal sealed class AnalysisResults public RegexTree RegexTree { get; } /// Gets whether a node is considered atomic based on its ancestry. + /// + /// If the whole tree couldn't be examined, this returns false. That could lead to additional + /// code being output as nodes that could have been made atomic aren't, but functionally it's + /// the safe choice. + /// public bool IsAtomicByAncestor(RegexNode node) => _isAtomicByAncestor.Contains(node); /// Gets whether a node directly or indirectly contains captures. + /// + /// If the whole tree couldn't be examined, this returns true. That could lead to additional + /// code being emitted to deal with captures that can't occur, but functionally it's the + /// safe choice. 
+ /// public bool MayContainCapture(RegexNode node) => !_complete || _containsCapture.Contains(node); /// Gets whether a node is or directory or indirectly contains a backtracking construct that isn't hidden by an internal atomic construct. @@ -168,13 +181,28 @@ internal sealed class AnalysisResults /// In most code generation situations, we only need to know after we emit the child code whether /// the child may backtrack, and that we can see with 100% certainty. This method is useful in situations /// where we need to predict without traversing the child at code generation time whether it may - /// incur backtracking. This method may have (few) false positives, but won't have any false negatives, + /// incur backtracking. This method may have (few) false positives (return true when it could have + /// returned false), but won't have any false negatives (return false when it should have returned true), /// meaning it might claim a node requires backtracking even if it doesn't, but it will always return - /// true for any node that requires backtracking. + /// true for any node that requires backtracking. In that vein, if the whole tree couldn't be examined, + /// this returns true. /// public bool MayBacktrack(RegexNode node) => !_complete || (_mayBacktrack?.Contains(node) ?? false); /// Gets whether a node might have set. - public bool HasIgnoreCase => _complete && _hasIgnoreCase; + /// + /// If the whole tree couldn't be examined, this returns true. That could lead to additional + /// code being emitted to support case-insensitivity in expressions that don't actually need + /// it, but functionally it's the safe choice. + /// + public bool HasIgnoreCase => !_complete || _hasIgnoreCase; + + /// Gets whether a node might have set. + /// + /// If the whole tree couldn't be examined, this returns true. 
That could lead to additional + /// code being emitted to support expressions that don't actually contain any RightToLeft + /// nodes, but functionally it's the safe choice. + /// + public bool HasRightToLeft => !_complete || _hasRightToLeft; } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs index 9df6db78c4e9a4..2375d8f57f2887 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs @@ -84,11 +84,36 @@ public static IEnumerable Match_MemberData() if (!RegexHelpers.IsNonBacktracking(engine)) { - // Zero-width negative lookahead assertion: Actual - "abc(?!XXX)\\w+" + // Zero-width negative lookahead assertion yield return (@"abc(?!XXX)\w+", "abcXXXdef", RegexOptions.None, 0, 9, false, string.Empty); - // Zero-width positive lookbehind assertion: Actual - "(\\w){6}(?<=XXX)def" + // Zero-width positive lookbehind assertion yield return (@"(\w){6}(?<=XXX)def", "abcXXXdef", RegexOptions.None, 0, 9, true, "abcXXXdef"); + yield return (@"(?<=c)def", "123abcdef", RegexOptions.None, 0, 9, true, "def"); + yield return (@"(?<=abc)def", "123abcdef", RegexOptions.None, 0, 9, true, "def"); + yield return (@"(?<=a\wc)def", "123abcdef", RegexOptions.None, 0, 9, true, "def"); + yield return (@"(?<=\ba\wc)def", "123 abcdef", RegexOptions.None, 0, 10, true, "def"); + yield return (@"(?<=.\ba\wc\B)def", "123 abcdef", RegexOptions.None, 0, 10, true, "def"); + yield return (@"(?<=^123 abc)def", "123 abcdef", RegexOptions.None, 0, 10, true, "def"); + yield return (@"(?<=^123 abc)def", "123 abcdef", RegexOptions.Multiline, 0, 10, true, "def"); + yield return (@"(?<=123$\nabc)def", "123\nabcdef", RegexOptions.Multiline, 0, 10, true, "def"); + yield return (@"123(? 
diagnostics = await RegexGeneratorHelper.RunGenerator(@" - using System.Text.RegularExpressions; - partial class C - { - [RegexGenerator(""ab"", RegexOptions.RightToLeft)] - private static partial Regex RightToLeftNotSupported(); - } - "); - - Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id); - } - [Fact] public async Task Diagnostic_NonBacktracking_LimitedSupport() { @@ -227,36 +212,6 @@ partial class C Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id); } - [Fact] - public async Task Diagnostic_PositiveLookbehind_LimitedSupport() - { - IReadOnlyList diagnostics = await RegexGeneratorHelper.RunGenerator(@" - using System.Text.RegularExpressions; - partial class C - { - [RegexGenerator(""(?<=\b20)\d{2}\b"")] - private static partial Regex PositiveLookbehindNotSupported(); - } - "); - - Assert.Equal("SYSLIB1045", Assert.Single(diagnostics).Id); - } - - [Fact] - public async Task Diagnostic_NegativeLookbehind_LimitedSupport() - { - IReadOnlyList diagnostics = await RegexGeneratorHelper.RunGenerator(@" - using System.Text.RegularExpressions; - partial class C - { - [RegexGenerator(""(?