swiftlang · rctcwyvrn · Aug 3, 2022 · Jul 5, 2022 · Jul 5, 2022 · Jul 5, 2022
diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift
@@ -15,27 +15,39 @@
 @available(SwiftStdlib 5.7, *)
 public struct CharacterClass {
   internal var ccc: DSLTree.CustomCharacterClass
+  /// The builtin character class, if this CharacterClass is representable by one
+  internal var builtin: DSLTree.Atom.CharacterClass?
 
   init(_ ccc: DSLTree.CustomCharacterClass) {
     self.ccc = ccc
+    self.builtin = nil
   }
 
-  init(unconverted atom: DSLTree._AST.Atom) {
-    self.ccc = .init(members: [.atom(.unconverted(atom))])
+  init(builtin: DSLTree.Atom.CharacterClass) {
+    self.ccc = .init(members: [.atom(.characterClass(builtin))])
+    self.builtin = builtin
   }
 }
 
 @available(SwiftStdlib 5.7, *)
 extension CharacterClass: RegexComponent {
   public var regex: Regex<Substring> {
-    _RegexFactory().customCharacterClass(ccc)
+    if let cc = builtin {
+      return _RegexFactory().characterClass(cc)
+    } else {
+      return _RegexFactory().customCharacterClass(ccc)
+    }
   }
 }
 
 @available(SwiftStdlib 5.7, *)
 extension CharacterClass {
   public var inverted: CharacterClass {
-    CharacterClass(ccc.inverted)
+    if let inv = builtin?.inverted {
+      return CharacterClass(builtin: inv)
+    } else {
+      return CharacterClass(ccc.inverted)
+    }
   }
 }
 
@@ -50,15 +62,15 @@ extension RegexComponent where Self == CharacterClass {
   }
 
   public static var anyGraphemeCluster: CharacterClass {
-    .init(unconverted: ._anyGrapheme)
+    .init(builtin: .anyGrapheme)
   }
 
   public static var whitespace: CharacterClass {
-    .init(unconverted: ._whitespace)
+    .init(builtin: .whitespace)
   }
 
   public static var digit: CharacterClass {
-    .init(unconverted: ._digit)
+    .init(builtin: .digit)
   }
 
   public static var hexDigit: CharacterClass {
@@ -70,19 +82,19 @@ extension RegexComponent where Self == CharacterClass {
   }
 
   public static var horizontalWhitespace: CharacterClass {
-    .init(unconverted: ._horizontalWhitespace)
+    .init(builtin: .horizontalWhitespace)
   }
 
   public static var newlineSequence: CharacterClass {
-    .init(unconverted: ._newlineSequence)
+    .init(builtin: .newlineSequence)
   }
 
   public static var verticalWhitespace: CharacterClass {
-    .init(unconverted: ._verticalWhitespace)
+    .init(builtin: .verticalWhitespace)
   }
 
   public static var word: CharacterClass {
-    .init(unconverted: ._word)
+    .init(builtin: .word)
   }
 }
 

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -74,6 +74,9 @@ fileprivate extension Compiler.ByteCodeGen {
         emitMatchScalar(s)
       }
 
+    case let .characterClass(cc):
+      emitCharacterClass(cc)
+
     case let .assertion(kind):
       try emitAssertion(kind)
 
@@ -148,147 +151,27 @@ fileprivate extension Compiler.ByteCodeGen {
     }
   }
 
-  mutating func emitStartOfLine() {
-    builder.buildAssert { [semanticLevel = options.semanticLevel]
-        (_, _, input, pos, subjectBounds) in
-      if pos == subjectBounds.lowerBound { return true }
-      switch semanticLevel {
-      case .graphemeCluster:
-        return input[input.index(before: pos)].isNewline
-      case .unicodeScalar:
-        return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline
-      }
-    }
-  }
-
-  mutating func emitEndOfLine() {
-    builder.buildAssert { [semanticLevel = options.semanticLevel]
-      (_, _, input, pos, subjectBounds) in
-      if pos == subjectBounds.upperBound { return true }
-      switch semanticLevel {
-      case .graphemeCluster:
-        return input[pos].isNewline
-      case .unicodeScalar:
-        return input.unicodeScalars[pos].isNewline
-      }
-    }
-  }
-
   mutating func emitAssertion(
     _ kind: DSLTree.Atom.Assertion
   ) throws {
-    // FIXME: Depends on API model we have... We may want to
-    // think through some of these with API interactions in mind
-    //
-    // This might break how we use `bounds` for both slicing
-    // and things like `firstIndex`, that is `firstIndex` may
-    // need to supply both a slice bounds and a per-search bounds.
-    switch kind {
-    case .startOfSubject:
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in
-        pos == subjectBounds.lowerBound
-      }
-
-    case .endOfSubjectBeforeNewline:
-      builder.buildAssert { [semanticLevel = options.semanticLevel]
-          (_, _, input, pos, subjectBounds) in
-        if pos == subjectBounds.upperBound { return true }
-        switch semanticLevel {
-        case .graphemeCluster:
-          return input.index(after: pos) == subjectBounds.upperBound
-           && input[pos].isNewline
-        case .unicodeScalar:
-          return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound
-           && input.unicodeScalars[pos].isNewline
-        }
-      }
-
-    case .endOfSubject:
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in
-        pos == subjectBounds.upperBound
-      }
-
-    case .resetStartOfMatch:
-      // FIXME: Figure out how to communicate this out
+    if kind == .resetStartOfMatch {
       throw Unsupported(#"\K (reset/keep assertion)"#)
-
-    case .firstMatchingPositionInSubject:
-      // TODO: We can probably build a nice model with API here
-
-      // FIXME: This needs to be based on `searchBounds`,
-      // not the `subjectBounds` given as an argument here
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in false }
-
-    case .textSegment:
-      builder.buildAssert { (_, _, input, pos, _) in
-        // FIXME: Grapheme or word based on options
-        input.isOnGraphemeClusterBoundary(pos)
-      }
-
-    case .notTextSegment:
-      builder.buildAssert { (_, _, input, pos, _) in
-        // FIXME: Grapheme or word based on options
-        !input.isOnGraphemeClusterBoundary(pos)
-      }
-
-    case .startOfLine:
-      emitStartOfLine()
-
-    case .endOfLine:
-      emitEndOfLine()
-
-    case .caretAnchor:
-      if options.anchorsMatchNewlines {
-        emitStartOfLine()
-      } else {
-        builder.buildAssert { (_, _, input, pos, subjectBounds) in
-          pos == subjectBounds.lowerBound
-        }
-      }
-
-    case .dollarAnchor:
-      if options.anchorsMatchNewlines {
-        emitEndOfLine()
-      } else {
-        builder.buildAssert { (_, _, input, pos, subjectBounds) in
-          pos == subjectBounds.upperBound
-        }
-      }
-
-    case .wordBoundary:
-      builder.buildAssert { [options]
-          (cache, maxIndex, input, pos, subjectBounds) in
-        if options.usesSimpleUnicodeBoundaries {
-          // TODO: How should we handle bounds?
-          return _CharacterClassModel.word.isBoundary(
-            input,
-            at: pos,
-            bounds: subjectBounds,
-            with: options
-          )
-        } else {
-          return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
-        }
-      }
-
-    case .notWordBoundary:
-      builder.buildAssert { [options]
-          (cache, maxIndex, input, pos, subjectBounds) in
-        if options.usesSimpleUnicodeBoundaries {
-          // TODO: How should we handle bounds?
-          return !_CharacterClassModel.word.isBoundary(
-            input,
-            at: pos,
-            bounds: subjectBounds,
-            with: options
-          )
-        } else {
-          return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
-        }
-      }
     }
+    builder.buildAssert(
+      by: kind,
+      options.anchorsMatchNewlines,
+      options.usesSimpleUnicodeBoundaries,
+      options.usesASCIIWord,
+      options.semanticLevel)
   }
-
+
+  mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) {
+    builder.buildMatchBuiltin(
+      cc.model,
+      cc.model.isStrictAscii(options: options),
+      isScalar: options.semanticLevel == .unicodeScalar)
+  }
+
   mutating func emitMatchScalar(_ s: UnicodeScalar) {
     assert(options.semanticLevel == .unicodeScalar)
     if options.isCaseInsensitive && s.properties.isCased {
@@ -591,6 +474,10 @@ fileprivate extension Compiler.ByteCodeGen {
     let minTrips = low
     assert((extraTrips ?? 1) >= 0)
 
+    if tryEmitFastQuant(child, updatedKind, minTrips, extraTrips) {
+      return
+    }
+
     // The below is a general algorithm for bounded and unbounded
     // quantification. It can be specialized when the min
     // is 0 or 1, or when extra trips is 1 or unbounded.
@@ -775,6 +662,83 @@ fileprivate extension Compiler.ByteCodeGen {
     builder.label(exit)
   }
 
+  /// Tries to emit a specialized quantification instruction for `child`; returns `true` if one was emitted.
+  /// Applies only with optimizations on, grapheme-cluster semantics, a non-reluctant kind, and storable trips.
+  /// Accepted nodes (also when wrapped in a converted regex literal or a plain non-capturing group):
+  /// - a `.char` that is a single ASCII scalar, or an ASCII-only `.customCharacterClass`
+  /// - a built-in character class that consumes a single grapheme
+  /// - `.any`, `.anyNonNewline`, `.dot`
+  mutating func tryEmitFastQuant(
+    _ child: DSLTree.Node,
+    _ kind: AST.Quantification.Kind,
+    _ minTrips: Int,
+    _ extraTrips: Int?
+  ) -> Bool {
+    guard optimizationsEnabled
+            && minTrips <= QuantifyPayload.maxStorableTrips
+            && extraTrips ?? 0 <= QuantifyPayload.maxStorableTrips
+            && options.semanticLevel == .graphemeCluster
+            && kind != .reluctant else {
+      return false
+    }
+
+    switch child {
+    case .customCharacterClass(let ccc):
+      // ASCII-only custom character class
+      guard let bitset = ccc.asAsciiBitset(options) else {
+        return false
+      }
+      builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips)
+
+    case .atom(let atom):
+      switch atom {
+      case .char(let c):
+        // Character made of a single ASCII scalar
+        guard let val = c._singleScalarAsciiValue else {
+          return false
+        }
+        builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips)
+
+      case .any:
+        builder.buildQuantifyAny(
+          matchesNewlines: true, kind, minTrips, extraTrips)
+      case .anyNonNewline:
+        builder.buildQuantifyAny(
+          matchesNewlines: false, kind, minTrips, extraTrips)
+      case .dot:
+        builder.buildQuantifyAny(
+          matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips)
+
+      case .characterClass(let cc):
+        // Built-in character class that consumes a single grapheme
+        let model = cc.model
+        guard model.consumesSingleGrapheme else {
+          return false
+        }
+        builder.buildQuantify(
+          builtin: model.cc,
+          isStrict: model.isStrictAscii(options: options),
+          isInverted: model.isInverted,
+          kind,
+          minTrips,
+          extraTrips)
+      default:
+        return false
+      }
+    case .convertedRegexLiteral(let node, _):
+      return tryEmitFastQuant(node, kind, minTrips, extraTrips)
+    case .nonCapturingGroup(let groupKind, let node):
+      // .nonCapture nonCapturingGroups are ignored during compilation
+      guard groupKind.ast == .nonCapture else {
+        return false
+      }
+      return tryEmitFastQuant(node, kind, minTrips, extraTrips)
+    default:
+      return false
+    }
+    return true
+  }
+
   mutating func emitCustomCharacterClass(
     _ ccc: DSLTree.CustomCharacterClass
   ) throws {
@@ -785,10 +749,10 @@ fileprivate extension Compiler.ByteCodeGen {
       } else {
         builder.buildMatchAsciiBitset(asciiBitset)
       }
-    } else {
-      let consumer = try ccc.generateConsumer(options)
-      builder.buildConsume(by: consumer)
+      return
     }
+    let consumer = try ccc.generateConsumer(options)
+    builder.buildConsume(by: consumer)
   }
 
   @discardableResult