From 33acdeb6bf05d68cf0ab78bd0ff453d2d54a4cad Mon Sep 17 00:00:00 2001 From: Lily Date: Mon, 11 Jul 2022 13:59:21 -0700 Subject: [PATCH] Break out of quantification loop if there is no forward progress (#560) This fixes infinite loops when we loop over an internal node that does not have any forward progress. Also included is an optimization to only emit the check/break instructions if we have a case that might result in an infinite loop (possibly non-progressing inner node + unlimited quantification) --- Sources/_StringProcessing/ByteCodeGen.swift | 65 +++++++++++++++++++ .../Engine/Backtracking.swift | 10 ++- .../Engine/InstPayload.swift | 6 +- .../Engine/Instruction.swift | 18 +++++ .../_StringProcessing/Engine/MEBuilder.swift | 23 ++++++- .../_StringProcessing/Engine/Processor.swift | 16 ++++- .../_StringProcessing/Engine/Registers.swift | 14 ++++ Tests/RegexTests/CompileTests.swift | 36 ++++++++++ Tests/RegexTests/MatchTests.swift | 28 +++++++- 9 files changed, 205 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d18d50aa0..5eb38518e 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -543,7 +543,12 @@ fileprivate extension Compiler.ByteCodeGen { decrement %minTrips and fallthrough loop-body: + : + mov currentPosition %pos evaluate the subexpression + : + if %pos is currentPosition: + goto exit goto min-trip-count control block exit-policy control block: @@ -646,7 +651,28 @@ fileprivate extension Compiler.ByteCodeGen { // // branch min-trip-count builder.label(loopBody) + + // if we aren't sure if the child node will have forward progress and + // we have an unbounded quantification + let startPosition: PositionRegister? + let emitPositionChecking = + (!optimizationsEnabled || !child.guaranteesForwardProgress) && + extraTrips == nil + + if emitPositionChecking { + startPosition = builder.makePositionRegister() + builder.buildMoveCurrentPosition(into: startPosition!) + } else { + startPosition = nil + } try emitNode(child) + if emitPositionChecking { + // in all quantifier cases, no matter what minTrips or extraTrips is, + // if we have a successful non-advancing match, branch to exit because it + // can match an arbitrary number of times + builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!) + } + if minTrips <= 1 { // fallthrough } else { @@ -832,3 +858,42 @@ fileprivate extension Compiler.ByteCodeGen { return nil } } + +extension DSLTree.Node { + var guaranteesForwardProgress: Bool { + switch self { + case .orderedChoice(let children): + return children.allSatisfy { $0.guaranteesForwardProgress } + case .concatenation(let children): + return children.contains(where: { $0.guaranteesForwardProgress }) + case .capture(_, _, let node, _): + return node.guaranteesForwardProgress + case .nonCapturingGroup(let kind, let child): + switch kind.ast { + case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind: + return false + default: return child.guaranteesForwardProgress + } + case .atom(let atom): + switch atom { + case .changeMatchingOptions, .assertion: return false + default: return true + } + case .trivia, .empty: + return false + case .quotedLiteral(let string): + return !string.isEmpty + case .convertedRegexLiteral(let node, _): + return node.guaranteesForwardProgress + case .consumer, .matcher: + // Allow zero width consumers and matchers + return false + case .customCharacterClass: + return true + case .quantification(let amount, _, let child): + let (atLeast, _) = amount.ast.bounds + return atLeast ?? 0 > 0 && child.guaranteesForwardProgress + default: return false + } + } +} diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index 8fcdf9312..355702ac1 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -32,15 +32,18 @@ extension Processor { // The int registers store values that can be relevant to // backtracking, such as the number of trips in a quantification. var intRegisters: [Int] + // Same with position registers + var posRegisters: [Input.Index] var destructure: ( pc: InstructionAddress, pos: Position?, stackEnd: CallStackAddress, captureEnds: [_StoredCapture], - intRegisters: [Int] + intRegisters: [Int], + PositionRegister: [Input.Index] ) { - (pc, pos, stackEnd, captureEnds, intRegisters) + (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } } @@ -53,7 +56,8 @@ extension Processor { pos: addressOnly ? nil : currentPosition, stackEnd: .init(callStack.count), captureEnds: storedCaptures, - intRegisters: registers.ints) + intRegisters: registers.ints, + posRegisters: registers.positions) } } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index c614e10fd..21c647a3b 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -284,10 +284,10 @@ extension Instruction.Payload { interpretPair() } - init(pos: PositionRegister, pos2: PositionRegister) { - self.init(pos, pos2) + init(addr: InstructionAddress, position: PositionRegister) { + self.init(addr, position) } - var pairedPosPos: (PositionRegister, PositionRegister) { + var pairedAddrPos: (InstructionAddress, PositionRegister) { interpretPair() } diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 4e715ad9d..4cc810138 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -37,6 +37,14 @@ extension Instruction { /// case moveImmediate + /// Move the current position into a register + /// + /// moveCurrentPosition(into: PositionRegister) + /// + /// Operands: + /// - Position register to move into + case moveCurrentPosition + // MARK: General Purpose: Control flow /// Branch to a new instruction @@ -57,6 +65,16 @@ extension Instruction { /// case condBranchZeroElseDecrement + /// Conditionally branch if the current position is the same as the register + /// + /// condBranch( + /// to: InstAddr, ifSamePositionAs: PositionRegister) + /// + /// Operands: + /// - Instruction address to branch to, if the position in the register is the same as currentPosition + /// - Position register to check against + case condBranchSamePosition + // TODO: Function calls // MARK: - Matching diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 676b21473..84b80489f 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -32,6 +32,7 @@ extension MEProgram { var nextIntRegister = IntRegister(0) var nextCaptureRegister = CaptureRegister(0) var nextValueRegister = ValueRegister(0) + var nextPositionRegister = PositionRegister(0) // Special addresses or instructions var failAddressToken: AddressToken? = nil @@ -105,6 +106,14 @@ extension MEProgram.Builder { fixup(to: t) } + mutating func buildCondBranch( + to t: AddressToken, + ifSamePositionAs r: PositionRegister + ) { + instructions.append(.init(.condBranchSamePosition, .init(position: r))) + fixup(to: t) + } + mutating func buildSave(_ t: AddressToken) { instructions.append(.init(.save)) fixup(to: t) @@ -211,6 +220,10 @@ extension MEProgram.Builder { .init(value: value, capture: capture))) } + mutating func buildMoveCurrentPosition(into r: PositionRegister) { + instructions.append(.init(.moveCurrentPosition, .init(position: r))) + } + mutating func buildBackreference( _ cap: CaptureRegister ) { @@ -257,7 +270,8 @@ extension MEProgram.Builder { switch inst.opcode { case .condBranchZeroElseDecrement: payload = .init(addr: addr, int: inst.payload.int) - + case .condBranchSamePosition: + payload = .init(addr: addr, position: inst.payload.position) case .branch, .save, .saveAddress, .clearThrough: payload = .init(addr: addr) @@ -281,6 +295,7 @@ extension MEProgram.Builder { regInfo.sequences = sequences.count regInfo.ints = nextIntRegister.rawValue regInfo.values = nextValueRegister.rawValue + regInfo.positions = nextPositionRegister.rawValue regInfo.bitsets = asciiBitsets.count regInfo.consumeFunctions = consumeFunctions.count regInfo.assertionFunctions = assertionFunctions.count @@ -421,6 +436,12 @@ extension MEProgram.Builder { return r } + mutating func makePositionRegister() -> PositionRegister { + let r = nextPositionRegister + defer { nextPositionRegister.rawValue += 1 } + return r + } + // TODO: A register-mapping helper struct, which could release // registers without monotonicity required diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index f7b3a65a2..d19da01e5 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -245,7 +245,7 @@ extension Processor { } mutating func signalFailure() { - guard let (pc, pos, stackEnd, capEnds, intRegisters) = + guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints.popLast()?.destructure else { state = .fail @@ -259,6 +259,7 @@ extension Processor { callStack.removeLast(callStack.count - stackEnd.rawValue) storedCaptures = capEnds registers.ints = intRegisters + registers.positions = posRegisters } mutating func abort(_ e: Error? = nil) { @@ -315,7 +316,10 @@ extension Processor { registers[reg] = int controller.step() - + case .moveCurrentPosition: + let reg = payload.position + registers[reg] = currentPosition + controller.step() case .branch: controller.pc = payload.addr @@ -327,7 +331,13 @@ extension Processor { registers[int] -= 1 controller.step() } - + case .condBranchSamePosition: + let (addr, pos) = payload.pairedAddrPos + if registers[pos] == currentPosition { + controller.pc = addr + } else { + controller.step() + } case .save: let resumeAddr = payload.addr let sp = makeSavePoint(resumeAddr) diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index c76413383..e5d33af8b 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -47,6 +47,8 @@ extension Processor { var ints: [Int] var values: [Any] + + var positions: [Input.Index] } } @@ -66,6 +68,12 @@ extension Processor.Registers { values[i.rawValue] = newValue } } + subscript(_ i: PositionRegister) -> Input.Index { + get { positions[i.rawValue] } + set { + positions[i.rawValue] = newValue + } + } subscript(_ i: ElementRegister) -> Input.Element { elements[i.rawValue] } @@ -89,6 +97,8 @@ extension Processor.Registers { } extension Processor.Registers { + static let sentinelIndex = "".startIndex + init( _ program: MEProgram, _ sentinel: String.Index @@ -120,11 +130,15 @@ extension Processor.Registers { self.values = Array( repeating: SentinelValue(), count: info.values) + self.positions = Array( + repeating: Processor.Registers.sentinelIndex, + count: info.positions) } mutating func reset(sentinel: Input.Index) { self.ints._setAll(to: 0) self.values._setAll(to: SentinelValue()) + self.positions._setAll(to: Processor.Registers.sentinelIndex) } } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 4e64f7335..712808184 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -208,4 +208,40 @@ extension RegexTests { expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, doesNotContain: [.matchBitset]) expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, contains: [.consumeBy]) } + + func testQuantificationForwardProgressCompile() { + // Unbounded quantification + non forward progressing inner nodes + // Expect to emit the position checking instructions + expectProgram(for: #"(?:(?=a)){1,}"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\b)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + + // Bounded quantification, don't emit position checking + expectProgram(for: #"(?:(?=a)){1,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\b)?"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + + // Inner node is a quantification that does not guarantee forward progress + expectProgram(for: #"(a*)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a?)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a{,5})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((\b){,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((\b){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((|){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + // Inner node is a quantification that guarantees forward progress + expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a{1,})*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c9f5107e8..a1bce20fd 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1895,5 +1895,31 @@ extension RegexTests { XCTAssertEqual(matches.count, 3) } } -} + func expectCompletion(regex: String, in target: String) { + let expectation = XCTestExpectation(description: "Run the given regex to completion") + Task.init { + let r = try! Regex(regex) + let val = target.matches(of: r).isEmpty + expectation.fulfill() + return val + } + wait(for: [expectation], timeout: 3.0) + } + + func testQuantificationForwardProgress() { + expectCompletion(regex: #"(?:(?=a)){1,}"#, in: "aa") + expectCompletion(regex: #"(?:\b)+"#, in: "aa") + expectCompletion(regex: #"(?:(?#comment))+"#, in: "aa") + expectCompletion(regex: #"(?:|)+"#, in: "aa") + expectCompletion(regex: #"(?:\w|)+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?i-i:))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?#comment))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?#comment)(?i-i:))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?i))+"#, in: "aa") + expectCompletion(regex: #"(a*)*"#, in: "aa") + expectCompletion(regex: #"(a?)*"#, in: "aa") + expectCompletion(regex: #"(a{,4})*"#, in: "aa") + expectCompletion(regex: #"((|)+)*"#, in: "aa") + } +}