From 6af8b243c90a01631a99b7e2722a89c0a5bbc8ae Mon Sep 17 00:00:00 2001 From: Jose Perez Rodriguez Date: Tue, 30 Nov 2021 20:55:59 -0800 Subject: [PATCH] Have RegexInterpreter work over ReadOnlySpan instead of strings. (#62165) --- .../RegexFindOptimizations.cs | 56 ++++++++++--------- .../RegularExpressions/RegexInterpreter.cs | 55 +++++++++--------- 2 files changed, 57 insertions(+), 54 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index f1b285818e93e9..de2fb1619e0cb0 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -181,13 +181,13 @@ public RegexFindOptimizations(RegexTree tree, CultureInfo culture) public List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? FixedDistanceSets { get; } /// Try to advance to the next starting position that might be a location for a match. - /// The text to search. - /// The position in . This is updated with the found position. - /// The index in to consider the beginning for beginning anchor purposes. - /// The index in to consider the start for start anchor purposes. - /// The index in to consider the non-inclusive end of the string. + /// The text to search. + /// The position in . This is updated with the found position. + /// The index in to consider the beginning for beginning anchor purposes. + /// The index in to consider the start for start anchor purposes. + /// The index in to consider the non-inclusive end of the string. /// true if a position to attempt a match was found; false if none was found. - public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, int start, int end) + public bool TryFindNextStartingPosition(ReadOnlySpan textSpan, ref int pos, int beginning, int start, int end) { // Return early if we know there's not enough input left to match. if (!_rightToLeft) @@ -217,16 +217,16 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, // the beginning of the string or just after a line feed), find the next // newline and position just after it. Debug.Assert(!_rightToLeft); - if (pos > beginning && text[pos - 1] != '\n') + if (pos > beginning && textSpan[pos - 1] != '\n') { - int newline = text.IndexOf('\n', pos); - if (newline == -1 || newline + 1 > end) + int newline = textSpan.Slice(pos).IndexOf('\n'); + if (newline == -1 || newline + 1 + pos > end) { pos = end; return false; } - pos = newline + 1; + pos = newline + 1 + pos; } } @@ -281,7 +281,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, return true; case FindNextStartingPositionMode.LeadingAnchor_RightToLeft_EndZ: - if (pos < end - 1 || (pos == end - 1 && text[pos] != '\n')) + if (pos < end - 1 || (pos == end - 1 && textSpan[pos] != '\n')) { pos = beginning; return false; @@ -300,7 +300,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: { - int i = text.AsSpan(pos, end - pos).IndexOf(LeadingCaseSensitivePrefix.AsSpan()); + int i = textSpan.Slice(pos, end - pos).IndexOf(LeadingCaseSensitivePrefix.AsSpan()); if (i >= 0) { pos += i; @@ -313,7 +313,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, case FindNextStartingPositionMode.LeadingPrefix_RightToLeft_CaseSensitive: { - int i = text.AsSpan(beginning, pos - beginning).LastIndexOf(LeadingCaseSensitivePrefix.AsSpan()); + int i = textSpan.Slice(beginning, pos - beginning).LastIndexOf(LeadingCaseSensitivePrefix.AsSpan()); if (i >= 0) { pos = beginning + i + LeadingCaseSensitivePrefix.Length; @@ -328,7 +328,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, case FindNextStartingPositionMode.LeadingLiteral_RightToLeft_CaseSensitive: { - int i = text.AsSpan(beginning, pos - beginning).LastIndexOf(FixedDistanceLiteral.Literal); + int i = textSpan.Slice(beginning, pos - beginning).LastIndexOf(FixedDistanceLiteral.Literal); if (i >= 0) { pos = beginning + i + 1; @@ -344,7 +344,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, char ch = FixedDistanceLiteral.Literal; TextInfo ti = _textInfo; - ReadOnlySpan span = text.AsSpan(beginning, pos - beginning); + ReadOnlySpan span = textSpan.Slice(beginning, pos - beginning); for (int i = span.Length - 1; i >= 0; i--) { if (ti.ToLower(span[i]) == ch) @@ -364,7 +364,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, { (char[]? chars, string set, _, _) = FixedDistanceSets![0]; - ReadOnlySpan span = text.AsSpan(pos, end - pos); + ReadOnlySpan span = textSpan.Slice(pos, end - pos); if (chars is not null) { int i = span.IndexOfAny(chars); @@ -397,7 +397,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, string set = FixedDistanceSets![0].Set; TextInfo ti = _textInfo; - ReadOnlySpan span = text.AsSpan(pos, end - pos); + ReadOnlySpan span = textSpan.Slice(pos, end - pos); for (int i = 0; i < span.Length; i++) { if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref startingAsciiLookup)) @@ -416,7 +416,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, ref uint[]? startingAsciiLookup = ref _asciiLookups![0]; string set = FixedDistanceSets![0].Set; - ReadOnlySpan span = text.AsSpan(beginning, pos - beginning); + ReadOnlySpan span = textSpan.Slice(beginning, pos - beginning); for (int i = span.Length - 1; i >= 0; i--) { if (RegexCharClass.CharInClass(span[i], set, ref startingAsciiLookup)) @@ -436,7 +436,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, string set = FixedDistanceSets![0].Set; TextInfo ti = _textInfo; - ReadOnlySpan span = text.AsSpan(beginning, pos - beginning); + ReadOnlySpan span = textSpan.Slice(beginning, pos - beginning); for (int i = span.Length - 1; i >= 0; i--) { if (RegexCharClass.CharInClass(ti.ToLower(span[i]), set, ref startingAsciiLookup)) @@ -456,7 +456,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, { Debug.Assert(FixedDistanceLiteral.Distance <= _minRequiredLength); - int i = text.AsSpan(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Literal); + int i = textSpan.Slice(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance).IndexOf(FixedDistanceLiteral.Literal); if (i >= 0) { pos += i; @@ -474,7 +474,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, char ch = FixedDistanceLiteral.Literal; TextInfo ti = _textInfo; - ReadOnlySpan span = text.AsSpan(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance); + ReadOnlySpan span = textSpan.Slice(pos + FixedDistanceLiteral.Distance, end - pos - FixedDistanceLiteral.Distance); for (int i = 0; i < span.Length; i++) { if (ti.ToLower(span[i]) == ch) @@ -501,12 +501,14 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) { int offset = inputPosition + primaryDistance; - int index = text.IndexOfAny(primaryChars, offset, end - offset); + int index = textSpan.Slice(offset, end - offset).IndexOfAny(primaryChars); if (index < 0) { break; } + index += offset; // The index here will be offset indexed due to the use of span, so we add offset to get + // real position on the string. inputPosition = index - primaryDistance; if (inputPosition > endMinusRequiredLength) { @@ -516,7 +518,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, for (int i = 1; i < sets.Count; i++) { (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i]; - char c = text[inputPosition + nextDistance]; + char c = textSpan[inputPosition + nextDistance]; if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i])) { goto Bumpalong; @@ -535,7 +537,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) { - char c = text[inputPosition + primaryDistance]; + char c = textSpan[inputPosition + primaryDistance]; if (!RegexCharClass.CharInClass(c, primarySet, ref startingAsciiLookup)) { goto Bumpalong; @@ -544,7 +546,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, for (int i = 1; i < sets.Count; i++) { (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i]; - c = text[inputPosition + nextDistance]; + c = textSpan[inputPosition + nextDistance]; if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i])) { goto Bumpalong; @@ -573,7 +575,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++) { - char c = text[inputPosition + primaryDistance]; + char c = textSpan[inputPosition + primaryDistance]; if (!RegexCharClass.CharInClass(ti.ToLower(c), primarySet, ref startingAsciiLookup)) { goto Bumpalong; @@ -582,7 +584,7 @@ public bool TryFindNextStartingPosition(string text, ref int pos, int beginning, for (int i = 1; i < sets.Count; i++) { (_, string nextSet, int nextDistance, bool nextCaseInsensitive) = sets[i]; - c = text[inputPosition + nextDistance]; + c = textSpan[inputPosition + nextDistance]; if (!RegexCharClass.CharInClass(nextCaseInsensitive ? _textInfo.ToLower(c) : c, nextSet, ref _asciiLookups![i])) { goto Bumpalong; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs index 4351473b96fdb0..5c28f58a168a25 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs @@ -207,14 +207,14 @@ private void StackPush(int i1, int i2) private int Forwardchars() => _rightToLeft ? runtextpos - runtextbeg : runtextend - runtextpos; - private char Forwardcharnext() + private char Forwardcharnext(ReadOnlySpan runtextSpan) { - char ch = _rightToLeft ? runtext![--runtextpos] : runtext![runtextpos++]; - + int i = _rightToLeft ? --runtextpos : runtextpos++; + char ch = runtextSpan[i]; return _caseInsensitive ? _textInfo.ToLower(ch) : ch; } - private bool MatchString(string str) + private bool MatchString(string str, ReadOnlySpan runtextSpan) { int c = str.Length; int pos; @@ -242,7 +242,7 @@ private bool MatchString(string str) { while (c != 0) { - if (str[--c] != runtext![--pos]) + if (str[--c] != runtextSpan[--pos]) { return false; } @@ -253,7 +253,7 @@ private bool MatchString(string str) TextInfo ti = _textInfo; while (c != 0) { - if (str[--c] != ti.ToLower(runtext![--pos])) + if (str[--c] != ti.ToLower(runtextSpan[--pos])) { return false; } @@ -270,7 +270,7 @@ private bool MatchString(string str) return true; } - private bool MatchRef(int index, int length) + private bool MatchRef(int index, int length, ReadOnlySpan runtextSpan) { int pos; if (!_rightToLeft) @@ -299,7 +299,7 @@ private bool MatchRef(int index, int length) { while (c-- != 0) { - if (runtext![--cmpos] != runtext[--pos]) + if (runtextSpan[--cmpos] != runtextSpan[--pos]) { return false; } @@ -310,7 +310,7 @@ private bool MatchRef(int index, int length) TextInfo ti = _textInfo; while (c-- != 0) { - if (ti.ToLower(runtext![--cmpos]) != ti.ToLower(runtext[--pos])) + if (ti.ToLower(runtextSpan[--cmpos]) != ti.ToLower(runtextSpan[--pos])) { return false; } @@ -337,6 +337,7 @@ protected override void Go() SetOperator(_code.Codes[0]); _codepos = 0; int advance = -1; + ReadOnlySpan runtextSpan = runtext; while (true) { @@ -699,7 +700,7 @@ protected override void Go() break; case RegexCode.Bol: - if (Leftchars() > 0 && runtext![runtextpos - 1] != '\n') + if (Leftchars() > 0 && runtextSpan[runtextpos - 1] != '\n') { break; } @@ -707,7 +708,7 @@ protected override void Go() continue; case RegexCode.Eol: - if (Rightchars() > 0 && runtext![runtextpos] != '\n') + if (Rightchars() > 0 && runtextSpan[runtextpos] != '\n') { break; } @@ -763,7 +764,7 @@ protected override void Go() continue; case RegexCode.EndZ: - if (Rightchars() > 1 || Rightchars() == 1 && runtext![runtextpos] != '\n') + if (Rightchars() > 1 || Rightchars() == 1 && runtextSpan[runtextpos] != '\n') { break; } @@ -779,7 +780,7 @@ protected override void Go() continue; case RegexCode.One: - if (Forwardchars() < 1 || Forwardcharnext() != (char)Operand(0)) + if (Forwardchars() < 1 || Forwardcharnext(runtextSpan) != (char)Operand(0)) { break; } @@ -787,7 +788,7 @@ protected override void Go() continue; case RegexCode.Notone: - if (Forwardchars() < 1 || Forwardcharnext() == (char)Operand(0)) + if (Forwardchars() < 1 || Forwardcharnext(runtextSpan) == (char)Operand(0)) { break; } @@ -802,7 +803,7 @@ protected override void Go() else { int operand = Operand(0); - if (!RegexCharClass.CharInClass(Forwardcharnext(), _code.Strings[operand], ref _code.StringsAsciiLookup[operand])) + if (!RegexCharClass.CharInClass(Forwardcharnext(runtextSpan), _code.Strings[operand], ref _code.StringsAsciiLookup[operand])) { break; } @@ -811,7 +812,7 @@ protected override void Go() continue; case RegexCode.Multi: - if (!MatchString(_code.Strings[Operand(0)])) + if (!MatchString(_code.Strings[Operand(0)], runtextSpan)) { break; } @@ -823,7 +824,7 @@ protected override void Go() int capnum = Operand(0); if (IsMatched(capnum)) { - if (!MatchRef(MatchIndex(capnum), MatchLength(capnum))) + if (!MatchRef(MatchIndex(capnum), MatchLength(capnum), runtextSpan)) { break; } @@ -850,7 +851,7 @@ protected override void Go() char ch = (char)Operand(0); while (c-- > 0) { - if (Forwardcharnext() != ch) + if (Forwardcharnext(runtextSpan) != ch) { goto BreakBackward; } @@ -870,7 +871,7 @@ protected override void Go() char ch = (char)Operand(0); while (c-- > 0) { - if (Forwardcharnext() == ch) + if (Forwardcharnext(runtextSpan) == ch) { goto BreakBackward; } @@ -899,7 +900,7 @@ protected override void Go() CheckTimeout(); } - if (!RegexCharClass.CharInClass(Forwardcharnext(), set, ref setLookup)) + if (!RegexCharClass.CharInClass(Forwardcharnext(runtextSpan), set, ref setLookup)) { goto BreakBackward; } @@ -917,7 +918,7 @@ protected override void Go() for (i = len; i > 0; i--) { - if (Forwardcharnext() != ch) + if (Forwardcharnext(runtextSpan) != ch) { Backwardnext(); break; @@ -943,7 +944,7 @@ protected override void Go() { // We're left-to-right and case-sensitive, so we can employ the vectorized IndexOf // to search for the character. - i = runtext!.AsSpan(runtextpos, len).IndexOf(ch); + i = runtextSpan.Slice(runtextpos, len).IndexOf(ch); if (i == -1) { runtextpos += len; @@ -959,7 +960,7 @@ protected override void Go() { for (i = len; i > 0; i--) { - if (Forwardcharnext() == ch) + if (Forwardcharnext(runtextSpan) == ch) { Backwardnext(); break; @@ -992,7 +993,7 @@ protected override void Go() CheckTimeout(); } - if (!RegexCharClass.CharInClass(Forwardcharnext(), set, ref setLookup)) + if (!RegexCharClass.CharInClass(Forwardcharnext(runtextSpan), set, ref setLookup)) { Backwardnext(); break; @@ -1042,7 +1043,7 @@ protected override void Go() int pos = TrackPeek(1); runtextpos = pos; - if (Forwardcharnext() != (char)Operand(0)) + if (Forwardcharnext(runtextSpan) != (char)Operand(0)) { break; } @@ -1062,7 +1063,7 @@ protected override void Go() int pos = TrackPeek(1); runtextpos = pos; - if (Forwardcharnext() == (char)Operand(0)) + if (Forwardcharnext(runtextSpan) == (char)Operand(0)) { break; } @@ -1083,7 +1084,7 @@ protected override void Go() runtextpos = pos; int operand0 = Operand(0); - if (!RegexCharClass.CharInClass(Forwardcharnext(), _code.Strings[operand0], ref _code.StringsAsciiLookup[operand0])) + if (!RegexCharClass.CharInClass(Forwardcharnext(runtextSpan), _code.Strings[operand0], ref _code.StringsAsciiLookup[operand0])) { break; }