Skip to content

Commit

Permalink
[Optimization] Specialized quantification instruction (#577)
Browse files Browse the repository at this point in the history
Implements a specialized quantification instruction for repeated matching of a character, dot, character class, or custom character class
  • Loading branch information
rctcwyvrn authored Aug 3, 2022
1 parent 405fbcb commit 1acca94
Show file tree
Hide file tree
Showing 13 changed files with 598 additions and 40 deletions.
78 changes: 78 additions & 0 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,10 @@ fileprivate extension Compiler.ByteCodeGen {
let minTrips = low
assert((extraTrips ?? 1) >= 0)

if tryEmitFastQuant(child, updatedKind, minTrips, extraTrips) {
return
}

// The below is a general algorithm for bounded and unbounded
// quantification. It can be specialized when the min
// is 0 or 1, or when extra trips is 1 or unbounded.
Expand Down Expand Up @@ -655,6 +659,80 @@ fileprivate extension Compiler.ByteCodeGen {
builder.label(exit)
}

/// Attempts to compile a quantification into a single specialized quantify
/// instruction instead of the general quantification loop.
///
/// The fast path is only taken when optimizations are enabled, the semantic
/// level is grapheme cluster, the quantifier is not reluctant, and both trip
/// counts are no larger than `QuantifyPayload.maxStorableTrips`. The
/// quantified node must be one of:
/// - a `.char` that has a single-scalar ASCII value
/// - a `.customCharacterClass` that can be expressed as an ASCII bitset
/// - a built-in character class that consumes a single grapheme
/// - `.any`, `.anyNonNewline`, or `.dot`
///
/// `.convertedRegexLiteral` wrappers and `.nonCapture` groups are looked
/// through to the node they contain.
///
/// - Returns: `true` if the specialized instruction was emitted, `false` if
///   nothing was emitted.
mutating func tryEmitFastQuant(
  _ child: DSLTree.Node,
  _ kind: AST.Quantification.Kind,
  _ minTrips: Int,
  _ extraTrips: Int?
) -> Bool {
  guard optimizationsEnabled,
        minTrips <= QuantifyPayload.maxStorableTrips,
        extraTrips ?? 0 <= QuantifyPayload.maxStorableTrips,
        options.semanticLevel == .graphemeCluster,
        kind != .reluctant
  else {
    return false
  }

  switch child {
  case .convertedRegexLiteral(let inner, _):
    // Look through the literal wrapper at the node it carries.
    return tryEmitFastQuant(inner, kind, minTrips, extraTrips)

  case .nonCapturingGroup(let groupKind, let inner):
    // .nonCapture nonCapturingGroups are ignored during compilation
    guard groupKind.ast == .nonCapture else { return false }
    return tryEmitFastQuant(inner, kind, minTrips, extraTrips)

  case .customCharacterClass(let ccc):
    // Only custom character classes representable as an ASCII bitset qualify.
    guard let bitset = ccc.asAsciiBitset(options) else { return false }
    builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips)
    return true

  case .atom(.char(let c)):
    // Only a character with a single-scalar ASCII value qualifies.
    guard let val = c._singleScalarAsciiValue else { return false }
    builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips)
    return true

  case .atom(.any):
    builder.buildQuantifyAny(
      matchesNewlines: true, kind, minTrips, extraTrips)
    return true

  case .atom(.anyNonNewline):
    builder.buildQuantifyAny(
      matchesNewlines: false, kind, minTrips, extraTrips)
    return true

  case .atom(.dot):
    // Whether dot matches newlines depends on the current options.
    builder.buildQuantifyAny(
      matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips)
    return true

  case .atom(.characterClass(let cc)):
    // Built-in character class; it must consume exactly one grapheme.
    let model = cc.asRuntimeModel(options)
    guard model.consumesSingleGrapheme else { return false }
    builder.buildQuantify(
      model: model,
      kind,
      minTrips,
      extraTrips)
    return true

  default:
    return false
  }
}

/// Coalesce any adjacent scalar members in a custom character class together.
/// This is required in order to produce correct grapheme matching behavior.
func coalescingCustomCharacterClassMembers(
Expand Down
53 changes: 47 additions & 6 deletions Sources/_StringProcessing/Engine/Backtracking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,12 @@
//===----------------------------------------------------------------------===//

extension Processor {

// TODO: What all do we want to save? Configurable?
// TODO: Do we need to save any registers?
// TODO: Is this the right place to do function stack unwinding?
struct SavePoint {
var pc: InstructionAddress
var pos: Position?

// Quantifiers may store a range of positions to restore to
var rangeStart: Position?
var rangeEnd: Position?
// The end of the call stack, so we can slice it off
// when failing inside a call.
//
Expand All @@ -43,7 +41,35 @@ extension Processor {
intRegisters: [Int],
PositionRegister: [Input.Index]
) {
(pc, pos, stackEnd, captureEnds, intRegisters, posRegisters)
return (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters)
}

/// Whether this save point currently stores no range of positions.
/// The range counts as empty exactly when `rangeEnd` is `nil`.
var rangeIsEmpty: Bool {
  return rangeEnd == nil
}

/// Extends the stored range so that it ends at `newEnd`.
///
/// If no range has been started yet, `newEnd` also becomes the start.
mutating func updateRange(newEnd: Input.Index) {
  // Keep an existing start; otherwise the range begins at the new end.
  rangeStart = rangeStart ?? newEnd
  rangeEnd = newEnd
}

/// Moves the end of the stored range into `pos` and removes that position
/// from the range.
///
/// The range must not be empty.
mutating func takePositionFromRange(_ input: Input) {
  assert(!rangeIsEmpty)
  // Read the end position first: shrinking the range changes `rangeEnd`.
  pos = rangeEnd!
  shrinkRange(input)
}

/// Drops the last index from the stored range.
///
/// When the range holds a single position (`rangeEnd == rangeStart`) it
/// becomes empty; otherwise `rangeEnd` moves back by one index in `input`.
/// The range must not be empty.
mutating func shrinkRange(_ input: Input) {
  assert(!rangeIsEmpty)
  guard rangeEnd != rangeStart else {
    // That was the only position left: the range is now empty.
    rangeStart = nil
    rangeEnd = nil
    return
  }
  input.formIndex(before: &rangeEnd!)
}
}

Expand All @@ -54,6 +80,21 @@ extension Processor {
SavePoint(
pc: pc,
pos: addressOnly ? nil : currentPosition,
rangeStart: nil,
rangeEnd: nil,
stackEnd: .init(callStack.count),
captureEnds: storedCaptures,
intRegisters: registers.ints,
posRegisters: registers.positions)
}

func startQuantifierSavePoint() -> SavePoint {
// Restores to the instruction AFTER the current quantifier instruction
SavePoint(
pc: controller.pc + 1,
pos: nil,
rangeStart: nil,
rangeEnd: nil,
stackEnd: .init(callStack.count),
captureEnds: storedCaptures,
intRegisters: registers.ints,
Expand Down
164 changes: 163 additions & 1 deletion Sources/_StringProcessing/Engine/InstPayload.swift
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
//
//===----------------------------------------------------------------------===//

@_implementationOnly import _RegexParser

extension Instruction {
/// An instruction's payload packs operands and destination
/// registers.
Expand Down Expand Up @@ -330,7 +332,9 @@ extension Instruction.Payload {
) {
interpretPair()
}

// MARK: Struct payloads

/// Creates a payload from a character class model by packing the model into
/// a `CharacterClassPayload` and storing its raw value.
init(_ model: _CharacterClassModel) {
  let packed = CharacterClassPayload(model)
  self.init(packed.rawValue)
}
Expand All @@ -342,11 +346,169 @@ extension Instruction.Payload {
self.init(rawValue: payload.rawValue)
}
/// The payload bits (masked with `_payloadMask`) viewed as an
/// `AssertionPayload`.
var assertion: AssertionPayload {
  // Only the post-change expression is kept; the getter previously also
  // carried the superseded `self.rawValue` form of the same expression.
  AssertionPayload.init(rawValue: rawValue & _payloadMask)
}
/// Creates a payload that stores the raw bits of a `QuantifyPayload`.
init(quantify: QuantifyPayload) {
  let bits = quantify.rawValue
  self.init(bits)
}
/// The payload bits (masked with `_payloadMask`) viewed as a
/// `QuantifyPayload`.
var quantify: QuantifyPayload {
  let payloadBits = rawValue & _payloadMask
  return QuantifyPayload(rawValue: payloadBits)
}
}

// MARK: Struct definitions
/// The operands of a `quantify` instruction, packed into a single `UInt64`.
///
/// A payload records what is being repeated (an ASCII bitset register, a
/// single ASCII character, "any" character, or a built-in character class),
/// the quantification kind, and the minimum and extra trip counts.
struct QuantifyPayload: RawRepresentable {
  let rawValue: UInt64

  /// Selects how the low 16 bits (the payload value) are interpreted.
  enum PayloadType: UInt64 {
    case bitset = 0
    case asciiChar = 1
    case any = 2
    case builtin = 4
  }

  // Future work: optimize this layout -> payload type should be a fast switch
  // The top 8 bits are reserved for the opcode so we have 56 bits to work with
  // Bit ranges below are inclusive on both ends.
  // b55-b38 - Unused
  // b37-b35 - Payload type (one of 4 types, stored on 3 bits)
  // b34-b27 - minTrips (8 bit int)
  // b26-b18 - extraTrips (9 bits: the 8 bit value shifted left by one;
  //           a field value of exactly 1 encodes nil)
  // b17-b16 - Quantification type (one of three types)
  // b15-b0  - Payload value (depends on payload type)
  //           For the builtin type: b7-b0 hold the character class raw value,
  //           b9 is set when inverted, b10 is set when strict ASCII.
  static var quantKindShift: UInt64 { 16 }
  static var extraTripsShift: UInt64 { 18 }
  static var minTripsShift: UInt64 { 27 }
  static var typeShift: UInt64 { 35 }
  static var maxStorableTrips: UInt64 { (1 << 8) - 1 }

  /// Mask of the 16-bit payload value field. Static so that initializers can
  /// validate against it before `rawValue` is assigned.
  private static var valueFieldMask: UInt64 { 0xFF_FF }

  var quantKindMask: UInt64 { 3 }
  var extraTripsMask: UInt64 { 0x1FF }
  var minTripsMask: UInt64 { 0xFF }
  var typeMask: UInt64 { 7 }
  var payloadMask: UInt64 { QuantifyPayload.valueFieldMask }

  /// Packs everything except the payload value (quantification kind, trip
  /// counts, and payload type) into their bit fields.
  ///
  /// - Parameters:
  ///   - kind: The quantification kind, stored as 0 (eager), 1 (reluctant),
  ///     or 2 (possessive).
  ///   - minTrips: Minimum trip count; must be in `0...maxStorableTrips`.
  ///   - extraTrips: Extra trip count, or `nil`; when non-nil it must be in
  ///     `0...maxStorableTrips`.
  ///   - type: How the payload value field is to be interpreted.
  /// - Returns: The packed bits, with the payload value field left as zero.
  static func packInfoValues(
    _ kind: AST.Quantification.Kind,
    _ minTrips: Int,
    _ extraTrips: Int?,
    _ type: PayloadType
  ) -> UInt64 {
    // A trip count wider than its field would spill into a neighbouring one.
    let storableTrips = 0...Int(QuantifyPayload.maxStorableTrips)
    assert(storableTrips.contains(minTrips))
    if let extraTrips = extraTrips {
      assert(storableTrips.contains(extraTrips))
    }

    let kindVal: UInt64
    switch kind {
    case .eager:
      kindVal = 0
    case .reluctant:
      kindVal = 1
    case .possessive:
      kindVal = 2
    }
    // nil is encoded as 1; a real count is shifted left so its low bit is 0.
    let extraTripsVal: UInt64 = extraTrips.map { UInt64($0) << 1 } ?? 1
    return (kindVal << QuantifyPayload.quantKindShift) +
      (extraTripsVal << QuantifyPayload.extraTripsShift) +
      (UInt64(minTrips) << QuantifyPayload.minTripsShift) +
      (type.rawValue << QuantifyPayload.typeShift)
  }

  /// Wraps already-packed bits. The opcode bits must be clear.
  init(rawValue: UInt64) {
    self.rawValue = rawValue
    assert(rawValue & _opcodeMask == 0)
  }

  /// Creates a payload that quantifies an ASCII bitset stored in a register.
  init(
    bitset: AsciiBitsetRegister,
    _ kind: AST.Quantification.Kind,
    _ minTrips: Int,
    _ extraTrips: Int?
  ) {
    // The register must fit in the 16-bit payload value field; anything
    // larger would overwrite the quantification kind and trip count bits.
    assert(bitset.bits <= QuantifyPayload.valueFieldMask)
    self.rawValue = bitset.bits
      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset)
  }

  /// Creates a payload that quantifies a single ASCII character.
  init(
    asciiChar: UInt8,
    _ kind: AST.Quantification.Kind,
    _ minTrips: Int,
    _ extraTrips: Int?
  ) {
    self.rawValue = UInt64(asciiChar)
      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar)
  }

  /// Creates a payload that quantifies "any" character; the lowest bit
  /// records whether newlines are matched.
  init(
    matchesNewlines: Bool,
    _ kind: AST.Quantification.Kind,
    _ minTrips: Int,
    _ extraTrips: Int?
  ) {
    self.rawValue = (matchesNewlines ? 1 : 0)
      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any)
  }

  /// Creates a payload that quantifies a built-in character class.
  ///
  /// The class raw value goes in the low byte, bit 9 records inversion, and
  /// bit 10 records strict-ASCII matching.
  init(
    model: _CharacterClassModel,
    _ kind: AST.Quantification.Kind,
    _ minTrips: Int,
    _ extraTrips: Int?
  ) {
    assert(model.cc.rawValue < 0xFF)
    assert(model.matchLevel != .unicodeScalar)
    let packedModel = model.cc.rawValue
      + (model.isInverted ? 1 << 9 : 0)
      + (model.isStrictASCII ? 1 << 10 : 0)
    self.rawValue = packedModel
      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin)
  }

  /// The kind of payload value stored in the low 16 bits.
  var type: PayloadType {
    PayloadType(rawValue: (rawValue >> QuantifyPayload.typeShift) & typeMask)!
  }

  /// The quantification kind decoded from its two-bit field.
  var quantKind: AST.Quantification.Kind {
    switch (rawValue >> QuantifyPayload.quantKindShift) & quantKindMask {
    case 0: return .eager
    case 1: return .reluctant
    case 2: return .possessive
    default:
      fatalError("Unreachable")
    }
  }

  /// The minimum trip count.
  var minTrips: UInt64 {
    (rawValue >> QuantifyPayload.minTripsShift) & minTripsMask
  }

  /// The extra trip count, or `nil` when the field holds the nil marker.
  var extraTrips: UInt64? {
    let val = (rawValue >> QuantifyPayload.extraTripsShift) & extraTripsMask
    if val == 1 {
      return nil
    } else {
      return val >> 1
    }
  }

  /// The payload value read as an ASCII bitset register.
  var bitset: AsciiBitsetRegister {
    TypedInt(rawValue & payloadMask)
  }

  /// The payload value read as an ASCII character.
  var asciiChar: UInt8 {
    UInt8(asserting: rawValue & payloadMask)
  }

  /// For the `any` payload type: whether newlines are matched.
  var anyMatchesNewline: Bool {
    (rawValue & 1) == 1
  }

  /// For the `builtin` payload type: the character class stored in the low
  /// byte.
  var builtin: _CharacterClassModel.Representation {
    _CharacterClassModel.Representation(rawValue: rawValue & 0xFF)!
  }

  /// For the `builtin` payload type: whether the class is inverted (bit 9).
  var builtinIsInverted: Bool {
    (rawValue >> 9) & 1 == 1
  }

  /// For the `builtin` payload type: whether matching is strict ASCII
  /// (bit 10).
  var builtinIsStrict: Bool {
    (rawValue >> 10) & 1 == 1
  }
}

struct CharacterClassPayload: RawRepresentable {
let rawValue: UInt64
// Layout:
Expand Down
7 changes: 7 additions & 0 deletions Sources/_StringProcessing/Engine/Instruction.swift
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,13 @@ extension Instruction {
///
case splitSaving

/// Fused quantify, execute, and save instruction.
/// Quantifies the stored instruction in an inner loop instead of looping through instructions in the processor.
/// Only quantifies specific nodes.
///
/// quantify(_:QuantifyPayload)
///
case quantify
/// Begin the given capture
///
/// beginCapture(_:CapReg)
Expand Down
Loading

0 comments on commit 1acca94

Please sign in to comment.