Skip to content
32 changes: 23 additions & 9 deletions Sources/RegexBuilder/Anchor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,30 @@ public struct Anchor {

@available(SwiftStdlib 5.7, *)
extension Anchor: RegexComponent {
/// The DSL tree assertion selected by this anchor's `kind`.
///
/// NOTE(review): this span contains two declaration lines and two sets of
/// `case` arms covering the same kinds — it looks like both sides of a diff
/// were captured together. Confirm which side is current before relying on it.
var baseAssertion: DSLTree._AST.AssertionKind {
var baseAssertion: DSLTree.Atom.Assertion {
switch kind {
// Arms that forward `isInverted` as an associated value.
case .startOfSubject: return .startOfSubject(isInverted)
case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline(isInverted)
case .endOfSubject: return .endOfSubject(isInverted)
case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject(isInverted)
case .textSegmentBoundary: return .textSegmentBoundary(isInverted)
case .startOfLine: return .startOfLine(isInverted)
case .endOfLine: return .endOfLine(isInverted)
case .wordBoundary: return .wordBoundary(isInverted)
// Arms that return payload-free assertions. Only the text-segment and
// word-boundary arms consult `isInverted`; the others return the same
// assertion regardless of it (see the FIXMEs).
case .startOfSubject:
// FIXME: Inverted? `isInverted` is not consulted in this arm.
return .startOfSubject
case .endOfSubjectBeforeNewline:
// FIXME: Inverted? `isInverted` is not consulted in this arm.
return .endOfSubjectBeforeNewline
case .endOfSubject:
// FIXME: Inverted? `isInverted` is not consulted in this arm.
return .endOfSubject
case .firstMatchingPositionInSubject:
// FIXME: Inverted? `isInverted` is not consulted in this arm.
return .firstMatchingPositionInSubject
case .textSegmentBoundary:
// Inversion selects the dedicated negated assertion.
return isInverted ? .notTextSegment : .textSegment
case .startOfLine:
// FIXME: Inverted? `isInverted` is not consulted in this arm.
return .startOfLine
case .endOfLine:
// FIXME: Inverted? `isInverted` is not consulted in this arm.
return .endOfLine
case .wordBoundary:
// Inversion selects the dedicated negated assertion.
return isInverted ? .notWordBoundary : .wordBoundary
}
}

Expand Down
4 changes: 4 additions & 0 deletions Sources/RegexBuilder/CharacterClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ extension RegexComponent where Self == CharacterClass {
.init(DSLTree.CustomCharacterClass(members: [.atom(.any)]))
}

/// A character class whose only member is the `anyNonNewline` atom.
public static var anyNonNewline: CharacterClass {
  // Build the single-member custom class first, then wrap it.
  let customClass = DSLTree.CustomCharacterClass(members: [.atom(.anyNonNewline)])
  return CharacterClass(customClass)
}

/// A character class created from the unconverted `_anyGrapheme` value.
public static var anyGraphemeCluster: CharacterClass {
  return CharacterClass(unconverted: ._anyGrapheme)
}
Expand Down
83 changes: 11 additions & 72 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,13 @@ extension AST {
case namedCharacter(String)

/// .
case any
case dot

/// ^
case startOfLine
case caretAnchor

/// $
case endOfLine
case dollarAnchor

// References
case backreference(Reference)
Expand Down Expand Up @@ -104,9 +104,9 @@ extension AST.Atom {
case .callout(let v): return v
case .backtrackingDirective(let v): return v
case .changeMatchingOptions(let v): return v
case .any: return nil
case .startOfLine: return nil
case .endOfLine: return nil
case .dot: return nil
case .caretAnchor: return nil
case .dollarAnchor: return nil
case .invalid: return nil
}
}
Expand Down Expand Up @@ -511,67 +511,6 @@ extension AST.Atom.CharacterProperty {
}
}

extension AST.Atom {
  /// Anchors and other built-in zero-width assertions.
  ///
  /// The raw value of each case is the regex syntax that spells it.
  public enum AssertionKind: String, Hashable {
    /// `\A`
    case startOfSubject = #"\A"#

    /// `\Z`
    case endOfSubjectBeforeNewline = #"\Z"#

    /// `\z`
    case endOfSubject = #"\z"#

    /// `\K`
    case resetStartOfMatch = #"\K"#

    /// `\G`
    case firstMatchingPositionInSubject = #"\G"#

    /// `\y`
    case textSegment = #"\y"#

    /// `\Y`
    case notTextSegment = #"\Y"#

    /// `^`
    case startOfLine = #"^"#

    /// `$`
    case endOfLine = #"$"#

    /// `\b` (from outside a custom character class)
    case wordBoundary = #"\b"#

    /// `\B`
    case notWordBoundary = #"\B"#
  }

  /// The assertion this atom represents, or `nil` when the atom's kind is
  /// neither a line anchor nor one of the escaped assertions listed below.
  public var assertionKind: AssertionKind? {
    switch kind {
    // Unescaped line anchors.
    case .startOfLine:
      return .startOfLine
    case .endOfLine:
      return .endOfLine

    // Escaped builtins: only the assertion-like escapes map to a kind.
    case .escaped(let builtin):
      switch builtin {
      case .wordBoundary:
        return .wordBoundary
      case .notWordBoundary:
        return .notWordBoundary
      case .startOfSubject:
        return .startOfSubject
      case .endOfSubject:
        return .endOfSubject
      case .textSegment:
        return .textSegment
      case .notTextSegment:
        return .notTextSegment
      case .endOfSubjectBeforeNewline:
        return .endOfSubjectBeforeNewline
      case .firstMatchingPositionInSubject:
        return .firstMatchingPositionInSubject
      case .resetStartOfMatch:
        return .resetStartOfMatch
      default:
        return nil
      }

    default:
      return nil
    }
  }
}

extension AST.Atom {
public enum Callout: Hashable {
/// A PCRE callout written `(?C...)`
Expand Down Expand Up @@ -806,9 +745,9 @@ extension AST.Atom {
// the AST? Or defer for the matching engine?
return nil

case .scalarSequence, .property, .any, .startOfLine, .endOfLine,
.backreference, .subpattern, .callout, .backtrackingDirective,
.changeMatchingOptions, .invalid:
case .scalarSequence, .property, .dot, .caretAnchor,
.dollarAnchor, .backreference, .subpattern, .callout,
.backtrackingDirective, .changeMatchingOptions, .invalid:
return nil
}
}
Expand Down Expand Up @@ -858,7 +797,7 @@ extension AST.Atom {
case .keyboardMetaControl(let x):
return "\\M-\\C-\(x)"

case .property, .escaped, .any, .startOfLine, .endOfLine,
case .property, .escaped, .dot, .caretAnchor, .dollarAnchor,
.backreference, .subpattern, .namedCharacter, .callout,
.backtrackingDirective, .changeMatchingOptions, .invalid:
return nil
Expand All @@ -874,7 +813,7 @@ extension AST.Atom {
// TODO: Are callouts quantifiable?
case .escaped(let esc):
return esc.isQuantifiable
case .startOfLine, .endOfLine:
case .caretAnchor, .dollarAnchor:
return false
default:
return true
Expand Down
6 changes: 3 additions & 3 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2073,9 +2073,9 @@ extension Parser {
p.unreachable("Should have lexed a group or group-like atom")

// (sometimes) special metacharacters
case ".": return customCC ? .char(".") : .any
case "^": return customCC ? .char("^") : .startOfLine
case "$": return customCC ? .char("$") : .endOfLine
case ".": return customCC ? .char(".") : .dot
case "^": return customCC ? .char("^") : .caretAnchor
case "$": return customCC ? .char("$") : .dollarAnchor

// Escaped
case "\\": return p.expectEscaped().value
Expand Down
4 changes: 2 additions & 2 deletions Sources/_RegexParser/Regex/Parse/Sema.swift
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ extension RegexValidator {
) {
switch esc {
case .resetStartOfMatch, .singleDataUnit, .trueAnychar,
// '\N' needs to be emitted using 'emitAny'.
// '\N' needs to be emitted using 'emitDot'.
.notNewline:
error(.unsupported("'\\\(esc.character)'"), at: loc)

Expand Down Expand Up @@ -288,7 +288,7 @@ extension RegexValidator {
at: atom.location)
}

case .char, .scalar, .startOfLine, .endOfLine, .any:
case .char, .scalar, .caretAnchor, .dollarAnchor, .dot:
break

case .invalid:
Expand Down
6 changes: 3 additions & 3 deletions Sources/_RegexParser/Regex/Printing/DumpAST.swift
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,9 @@ extension AST.Atom {
case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
fatalError("TODO")

case .any: return "."
case .startOfLine: return "^"
case .endOfLine: return "$"
case .dot: return "."
case .caretAnchor: return "^"
case .dollarAnchor: return "$"

case .backreference(let r), .subpattern(let r):
return "\(r._dumpBase)"
Expand Down
7 changes: 4 additions & 3 deletions Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift
Original file line number Diff line number Diff line change
Expand Up @@ -237,9 +237,6 @@ extension AST.Atom.Number {

extension AST.Atom {
var _canonicalBase: String {
if let anchor = self.assertionKind {
return anchor.rawValue
}
if let lit = self.literalStringValue {
// FIXME: We may have to re-introduce escapes
// For example, `\.` will come back as "." instead
Expand All @@ -248,6 +245,10 @@ extension AST.Atom {
return lit
}
switch self.kind {
case .caretAnchor:
return "^"
case .dollarAnchor:
return "$"
case .escaped(let e):
return "\\\(e.character)"
case .backreference(let br):
Expand Down
Loading