diff --git a/.gitignore b/.gitignore index a7e7e4d09..ff85b9fa3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ .DS_Store +# The current toolchain is dumping files in the package root, rude +*.emit-module.* + # Xcode # # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore diff --git a/Documentation/Evolution/ProposalOverview.md b/Documentation/Evolution/ProposalOverview.md index 7656526a6..5f526f963 100644 --- a/Documentation/Evolution/ProposalOverview.md +++ b/Documentation/Evolution/ProposalOverview.md @@ -3,6 +3,7 @@ ## Regex Type and Overview +- [Second review](https://forums.swift.org/t/se-0350-second-review-regex-type-and-overview/56886) - [Proposal](https://github.com/apple/swift-evolution/blob/main/proposals/0350-regex-type-overview.md), [Thread](https://forums.swift.org/t/se-0350-regex-type-and-overview/56530) - [Pitch thread](https://forums.swift.org/t/pitch-regex-type-and-overview/56029) diff --git a/Package.swift b/Package.swift index f8162e762..f9eb95e8e 100644 --- a/Package.swift +++ b/Package.swift @@ -10,6 +10,13 @@ let availabilityDefinition = PackageDescription.SwiftSetting.unsafeFlags([ #"SwiftStdlib 5.7:macOS 9999, iOS 9999, watchOS 9999, tvOS 9999"#, ]) +let stdlibSettings: [PackageDescription.SwiftSetting] = [ + .unsafeFlags(["-enable-library-evolution"]), + .unsafeFlags(["-Xfrontend", "-disable-implicit-concurrency-module-import"]), + .unsafeFlags(["-Xfrontend", "-disable-implicit-string-processing-module-import"]), + availabilityDefinition +] + let package = Package( name: "swift-experimental-string-processing", products: [ @@ -36,10 +43,7 @@ let package = Package( .target( name: "_RegexParser", dependencies: [], - swiftSettings: [ - .unsafeFlags(["-enable-library-evolution"]), - availabilityDefinition - ]), + swiftSettings: stdlibSettings), .testTarget( name: "MatchingEngineTests", dependencies: [ @@ -51,29 +55,21 @@ let package = Package( .target( name: 
"_StringProcessing", dependencies: ["_RegexParser", "_CUnicode"], - swiftSettings: [ - .unsafeFlags(["-enable-library-evolution"]), - availabilityDefinition - ]), + swiftSettings: stdlibSettings), .target( name: "RegexBuilder", dependencies: ["_StringProcessing", "_RegexParser"], - swiftSettings: [ - .unsafeFlags(["-enable-library-evolution"]), - .unsafeFlags(["-Xfrontend", "-enable-experimental-pairwise-build-block"]), - availabilityDefinition - ]), + swiftSettings: stdlibSettings), .testTarget( name: "RegexTests", dependencies: ["_StringProcessing"], swiftSettings: [ - .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) + .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), ]), .testTarget( name: "RegexBuilderTests", dependencies: ["_StringProcessing", "RegexBuilder"], swiftSettings: [ - .unsafeFlags(["-Xfrontend", "-enable-experimental-pairwise-build-block"]), .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) ]), .testTarget( @@ -102,7 +98,6 @@ let package = Package( name: "Exercises", dependencies: ["_RegexParser", "_StringProcessing", "RegexBuilder"], swiftSettings: [ - .unsafeFlags(["-Xfrontend", "-enable-experimental-pairwise-build-block"]), .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) ]), .testTarget( diff --git a/README.md b/README.md index 42586ad2b..67c708a75 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ See [Declarative String Processing Overview][decl-string] ## Requirements -- [Swift Trunk Development Snapshot](https://www.swift.org/download/#snapshots) DEVELOPMENT-SNAPSHOT-2022-03-09 or later. +- [Swift Trunk Development Snapshot](https://www.swift.org/download/#snapshots) DEVELOPMENT-SNAPSHOT-2022-04-20 or later. 
## Trying it out diff --git a/Sources/PatternConverter/PatternConverter.swift b/Sources/PatternConverter/PatternConverter.swift index a10698526..497d54506 100644 --- a/Sources/PatternConverter/PatternConverter.swift +++ b/Sources/PatternConverter/PatternConverter.swift @@ -50,7 +50,7 @@ struct PatternConverter: ParsableCommand { print("Converting '\(delim)\(regex)\(delim)'") let ast = try _RegexParser.parse( - regex, + regex, .semantic, experimentalSyntax ? .experimental : .traditional) // Show rendered source ranges diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index e8cd4ac54..ae66310af 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -12,6 +12,12 @@ @_implementationOnly import _RegexParser @_spi(RegexBuilder) import _StringProcessing +/// A regex component that matches a specific condition at a particular position +/// in an input string. +/// +/// You can use anchors to guarantee that a match only occurs at certain points +/// in an input string, such as at the beginning of the string or at the end of +/// a line. @available(SwiftStdlib 5.7, *) public struct Anchor { internal enum Kind { @@ -53,14 +59,24 @@ extension Anchor: RegexComponent { @available(SwiftStdlib 5.7, *) extension Anchor { + /// An anchor that matches at the start of the input string. + /// + /// This anchor is equivalent to `\A` in regex syntax. public static var startOfSubject: Anchor { Anchor(kind: .startOfSubject) } - + + /// An anchor that matches at the end of the input string or at the end of + /// the line immediately before the end of the string. + /// + /// This anchor is equivalent to `\Z` in regex syntax. public static var endOfSubjectBeforeNewline: Anchor { Anchor(kind: .endOfSubjectBeforeNewline) } - + + /// An anchor that matches at the end of the input string. + /// + /// This anchor is equivalent to `\z` in regex syntax. 
public static var endOfSubject: Anchor { Anchor(kind: .endOfSubject) } @@ -70,26 +86,53 @@ extension Anchor { // Anchor(kind: resetStartOfMatch) // } + /// An anchor that matches at the first position of a match in the input + /// string. public static var firstMatchingPositionInSubject: Anchor { Anchor(kind: .firstMatchingPositionInSubject) } + /// An anchor that matches at a grapheme cluster boundary. + /// + /// This anchor is equivalent to `\y` in regex syntax. public static var textSegmentBoundary: Anchor { Anchor(kind: .textSegmentBoundary) } + /// An anchor that matches at the start of a line, including the start of + /// the input string. + /// + /// This anchor is equivalent to `^` in regex syntax when the `m` option + /// has been enabled or `anchorsMatchLineEndings(true)` has been called. public static var startOfLine: Anchor { Anchor(kind: .startOfLine) } + /// An anchor that matches at the end of a line, including at the end of + /// the input string. + /// + /// This anchor is equivalent to `$` in regex syntax when the `m` option + /// has been enabled or `anchorsMatchLineEndings(true)` has been called. public static var endOfLine: Anchor { Anchor(kind: .endOfLine) } + /// An anchor that matches at a word boundary. + /// + /// Word boundaries are identified using the Unicode default word boundary + /// algorithm by default. To specify a different word boundary algorithm, + /// see the `RegexComponent.wordBoundaryKind(_:)` method. + /// + /// This anchor is equivalent to `\b` in regex syntax. public static var wordBoundary: Anchor { Anchor(kind: .wordBoundary) } + /// The inverse of this anchor, which matches at every position that this + /// anchor does not. + /// + /// For the `wordBoundary` and `textSegmentBoundary` anchors, the inverted + /// version corresponds to `\B` and `\Y`, respectively. 
public var inverted: Anchor { var result = self result.isInverted.toggle() @@ -97,6 +140,13 @@ extension Anchor { } } +/// A regex component that allows a match to continue only if its contents +/// match at the given location. +/// +/// A lookahead is a zero-length assertion that its included regex matches at +/// a particular position. Lookaheads do not advance the overall matching +/// position in the input string — once a lookahead succeeds, matching continues +/// in the regex from the same position. @available(SwiftStdlib 5.7, *) public struct Lookahead: _BuiltinRegexComponent { public var regex: Regex @@ -105,19 +155,48 @@ public struct Lookahead: _BuiltinRegexComponent { self.regex = regex } + /// Creates a lookahead from the given regex component. public init( - _ component: R, - negative: Bool = false + _ component: R ) where R.RegexOutput == Output { - self.init(node: .nonCapturingGroup( - negative ? .negativeLookahead : .lookahead, component.regex.root)) + self.init(node: .nonCapturingGroup(.lookahead, component.regex.root)) } + + /// Creates a lookahead from the regex generated by the given builder closure. + public init( + @RegexComponentBuilder _ component: () -> R + ) where R.RegexOutput == Output { + self.init(node: .nonCapturingGroup(.lookahead, component().regex.root)) + } +} +/// A regex component that allows a match to continue only if its contents +/// do not match at the given location. +/// +/// A negative lookahead is a zero-length assertion that its included regex +/// does not match at a particular position. Lookaheads do not advance the +/// overall matching position in the input string — once a lookahead succeeds, +/// matching continues in the regex from the same position. +@available(SwiftStdlib 5.7, *) +public struct NegativeLookahead: _BuiltinRegexComponent { + public var regex: Regex + + init(_ regex: Regex) { + self.regex = regex + } + + /// Creates a negative lookahead from the given regex component. 
+ public init( + _ component: R + ) where R.RegexOutput == Output { + self.init(node: .nonCapturingGroup(.negativeLookahead, component.regex.root)) + } + + /// Creates a negative lookahead from the regex generated by the given builder + /// closure. public init( - negative: Bool = false, @RegexComponentBuilder _ component: () -> R ) where R.RegexOutput == Output { - self.init(node: .nonCapturingGroup( - negative ? .negativeLookahead : .lookahead, component().regex.root)) + self.init(node: .nonCapturingGroup(.negativeLookahead, component().regex.root)) } } diff --git a/Sources/_RegexParser/Regex/AST/AST.swift b/Sources/_RegexParser/Regex/AST/AST.swift index a7dcd2015..be1548b72 100644 --- a/Sources/_RegexParser/Regex/AST/AST.swift +++ b/Sources/_RegexParser/Regex/AST/AST.swift @@ -125,7 +125,9 @@ extension AST.Node { switch self { case .atom(let a): return a.isQuantifiable - case .group, .conditional, .customCharacterClass, .absentFunction: + case .group(let g): + return g.isQuantifiable + case .conditional, .customCharacterClass, .absentFunction: return true case .alternation, .concatenation, .quantification, .quote, .trivia, .empty: diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index e17ce68bb..19e2fb498 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -29,7 +29,13 @@ extension AST { /// A Unicode scalar value written as a literal /// /// \u{...}, \0dd, \x{...}, ... - case scalar(Unicode.Scalar) + case scalar(Scalar) + + /// A whitespace-separated sequence of Unicode scalar values which are + /// implicitly splatted out. + /// + /// `\u{A B C}` -> `\u{A}\u{B}\u{C}` + case scalarSequence(ScalarSequence) /// A Unicode property, category, or script, including those written using /// POSIX syntax. 
@@ -84,6 +90,7 @@ extension AST.Atom { switch kind { case .char(let v): return v case .scalar(let v): return v + case .scalarSequence(let v): return v case .property(let v): return v case .escaped(let v): return v case .keyboardControl(let v): return v @@ -106,6 +113,30 @@ extension AST.Atom { } } +extension AST.Atom { + public struct Scalar: Hashable { + public var value: UnicodeScalar + public var location: SourceLocation + + public init(_ value: UnicodeScalar, _ location: SourceLocation) { + self.value = value + self.location = location + } + } + + public struct ScalarSequence: Hashable { + public var scalars: [Scalar] + public var trivia: [AST.Trivia] + + public init(_ scalars: [Scalar], trivia: [AST.Trivia]) { + precondition(scalars.count > 1, "Expected multiple scalars") + self.scalars = scalars + self.trivia = trivia + } + public var scalarValues: [Unicode.Scalar] { scalars.map(\.value) } + } +} + extension AST.Atom { // TODO: We might scrap this and break out a few categories so @@ -396,6 +427,9 @@ extension AST.Atom.CharacterProperty { case script(Unicode.Script) case scriptExtension(Unicode.Script) + /// Character name in the form `\p{name=...}` + case named(String) + case posix(Unicode.POSIXProperty) /// Some special properties implemented by PCRE and Oniguruma. 
@@ -665,6 +699,23 @@ extension AST.Atom.EscapedBuiltin { return nil } } + + public var isQuantifiable: Bool { + switch self { + case .alarm, .escape, .formfeed, .newline, .carriageReturn, .tab, + .singleDataUnit, .decimalDigit, .notDecimalDigit, .horizontalWhitespace, + .notHorizontalWhitespace, .notNewline, .newlineSequence, .whitespace, + .notWhitespace, .verticalTab, .notVerticalTab, .wordCharacter, + .notWordCharacter, .backspace, .graphemeCluster, .trueAnychar: + return true + + case .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, + .firstMatchingPositionInSubject, .resetStartOfMatch, .textSegment, + .notTextSegment: + return false + } + } } extension AST.Atom { @@ -677,7 +728,7 @@ extension AST.Atom { case .char(let c): return c case .scalar(let s): - return Character(s) + return Character(s.value) case .escaped(let c): return c.scalarValue.map(Character.init) @@ -693,8 +744,9 @@ extension AST.Atom { // the AST? Or defer for the matching engine? return nil - case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, - .callout, .backtrackingDirective, .changeMatchingOptions: + case .scalarSequence, .property, .any, .startOfLine, .endOfLine, + .backreference, .subpattern, .callout, .backtrackingDirective, + .changeMatchingOptions: return nil } } @@ -716,13 +768,21 @@ extension AST.Atom { /// A string literal representation of the atom, if possible. /// /// Individual characters are returned as-is, and Unicode scalars are - /// presented using "\u{nnnn}" syntax. + /// presented using "\u{nn nn ...}" syntax. public var literalStringValue: String? 
{ + func scalarLiteral(_ u: [UnicodeScalar]) -> String { + let digits = u.map { String($0.value, radix: 16, uppercase: true) } + .joined(separator: " ") + return "\\u{\(digits)}" + } switch kind { case .char(let c): return String(c) case .scalar(let s): - return "\\u{\(String(s.value, radix: 16, uppercase: true))}" + return scalarLiteral([s.value]) + + case .scalarSequence(let s): + return scalarLiteral(s.scalarValues) case .keyboardControl(let x): return "\\C-\(x)" @@ -746,6 +806,10 @@ extension AST.Atom { case .changeMatchingOptions: return false // TODO: Are callouts quantifiable? + case .escaped(let esc): + return esc.isQuantifiable + case .startOfLine, .endOfLine: + return false default: return true } diff --git a/Sources/_RegexParser/Regex/AST/Group.swift b/Sources/_RegexParser/Regex/AST/Group.swift index 8ecaadeda..6fd46abe7 100644 --- a/Sources/_RegexParser/Regex/AST/Group.swift +++ b/Sources/_RegexParser/Regex/AST/Group.swift @@ -136,3 +136,18 @@ extension AST.Group { } } } + +extension AST.Group { + var isQuantifiable: Bool { + switch kind.value { + case .capture, .namedCapture, .balancedCapture, .nonCapture, + .nonCaptureReset, .atomicNonCapturing, .scriptRun, .atomicScriptRun, + .changeMatchingOptions: + return true + + case .lookahead, .negativeLookahead, .nonAtomicLookahead, + .lookbehind, .negativeLookbehind, .nonAtomicLookbehind: + return false + } + } +} diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift index e779c39fb..d3dbc1666 100644 --- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift +++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift @@ -17,7 +17,7 @@ extension AST { case caseInsensitive // i case allowDuplicateGroupNames // J case multiline // m - case noAutoCapture // n + case namedCapturesOnly // n case singleLine // s case reluctantByDefault // U case extended // x diff --git a/Sources/_RegexParser/Regex/Parse/CaptureList.swift 
b/Sources/_RegexParser/Regex/Parse/CaptureList.swift index d112b2010..0287e7337 100644 --- a/Sources/_RegexParser/Regex/Parse/CaptureList.swift +++ b/Sources/_RegexParser/Regex/Parse/CaptureList.swift @@ -26,15 +26,18 @@ extension CaptureList { public var name: String? public var type: Any.Type? public var optionalDepth: Int + public var location: SourceLocation public init( name: String? = nil, type: Any.Type? = nil, - optionalDepth: Int + optionalDepth: Int, + _ location: SourceLocation ) { self.name = name self.type = type self.optionalDepth = optionalDepth + self.location = location } } } @@ -61,13 +64,14 @@ extension AST.Node { case let .group(g): switch g.kind.value { case .capture: - list.append(.init(optionalDepth: nesting)) + list.append(.init(optionalDepth: nesting, g.location)) case .namedCapture(let name): - list.append(.init(name: name.value, optionalDepth: nesting)) + list.append(.init(name: name.value, optionalDepth: nesting, g.location)) case .balancedCapture(let b): - list.append(.init(name: b.name?.value, optionalDepth: nesting)) + list.append(.init(name: b.name?.value, optionalDepth: nesting, + g.location)) default: break } @@ -124,7 +128,8 @@ extension CaptureList.Capture: Equatable { public static func == (lhs: Self, rhs: Self) -> Bool { lhs.name == rhs.name && lhs.optionalDepth == rhs.optionalDepth && - lhs.type == rhs.type + lhs.type == rhs.type && + lhs.location == rhs.location } } extension CaptureList: Equatable {} diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index 911312121..c0ece78ff 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -18,7 +18,7 @@ extension Source { // This follows the rules provided by UAX44-LM3, including trying to drop an // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for // 
consistency with other engines and the Unicode.Scalar.Properties names. - let str = str.filter { !$0.isWhitespace && $0 != "_" && $0 != "-" } + let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" } .lowercased() if let m = match(str) { return m @@ -32,8 +32,8 @@ extension Source { static private func classifyGeneralCategory( _ str: String ) -> Unicode.ExtendedGeneralCategory? { - // This uses the aliases defined in - // https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt. + // This uses the aliases defined in https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt. + // Additionally, uses the `L& = Lc` alias defined by PCRE. withNormalizedForms(str) { str in switch str { case "c", "other": return .other @@ -43,7 +43,7 @@ extension Source { case "co", "privateuse": return .privateUse case "cs", "surrogate": return .surrogate case "l", "letter": return .letter - case "lc", "casedletter": return .casedLetter + case "lc", "l&", "casedletter": return .casedLetter case "ll", "lowercaseletter": return .lowercaseLetter case "lm", "modifierletter": return .modifierLetter case "lo", "otherletter": return .otherLetter @@ -428,6 +428,8 @@ extension Source { if let cat = classifyGeneralCategory(value) { return .generalCategory(cat) } + case "name", "na": + return .named(value) default: break } diff --git a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift index 0856361d8..4ae518dcd 100644 --- a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift +++ b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift @@ -96,7 +96,7 @@ public func swiftCompilerParseRegexLiteral( _ input: String, captureBufferOut: UnsafeMutableRawBufferPointer ) throws -> (regexToEmit: String, version: Int) { do { - let ast = try parseWithDelimiters(input) + let ast = try parseWithDelimiters(input, .semantic) // Serialize the capture structure for later type inference. 
assert(captureBufferOut.count >= input.utf8.count) ast.captureStructure.encode(to: captureBufferOut) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index c3d74c30b..d87fba918 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -15,6 +15,8 @@ enum ParseError: Error, Hashable { // TODO: I wonder if it makes sense to store the string. // This can make equality weird. + // MARK: Syntactic Errors + case numberOverflow(String) case expectedNumDigits(String, Int) case expectedNumber(String, kind: RadixKind) @@ -43,7 +45,6 @@ enum ParseError: Error, Hashable { case cannotReferToWholePattern - case notQuantifiable case quantifierRequiresOperand(String) case backtrackingDirectiveMustHaveName(String) @@ -55,7 +56,6 @@ enum ParseError: Error, Hashable { case cannotRemoveMatchingOptionsAfterCaret case expectedCustomCharacterClassMembers - case invalidCharacterClassRangeOperand case emptyProperty case unknownProperty(key: String?, value: String) @@ -73,6 +73,17 @@ enum ParseError: Error, Hashable { case cannotRemoveExtendedSyntaxInMultilineMode case expectedCalloutArgument + + // MARK: Semantic Errors + + case unsupported(String) + case deprecatedUnicode(String) + case invalidReference(Int) + case duplicateNamedCapture(String) + case invalidCharacterClassRangeOperand + case invalidQuantifierRange(Int, Int) + case invalidCharacterRange(from: Character, to: Character) + case notQuantifiable } extension IdentifierKind { @@ -88,18 +99,23 @@ extension IdentifierKind { extension ParseError: CustomStringConvertible { var description: String { switch self { + // MARK: Syntactic Errors case let .numberOverflow(s): return "number overflow: \(s)" case let .expectedNumDigits(s, i): return "expected \(i) digits in '\(s)'" case let .expectedNumber(s, kind: kind): - let radix: String - if kind == .decimal { - radix = "" - } else { - radix = " of radix 
\(kind.radix)" + let number: String + switch kind { + case .octal: + number = "octal number" + case .decimal: + number = "number" + case .hex: + number = "hexadecimal number" } - return "expected a numbers in '\(s)'\(radix)" + let suffix = s.isEmpty ? "" : " in '\(s)'" + return "expected \(number)\(suffix)" case let .expected(s): return "expected '\(s)'" case .unexpectedEndOfInput: @@ -114,8 +130,6 @@ extension ParseError: CustomStringConvertible { return "invalid escape sequence '\\\(c)'" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" - case .notQuantifiable: - return "expression is not quantifiable" case .quantifierRequiresOperand(let q): return "quantifier '\(q)' must appear after expression" case .backtrackingDirectiveMustHaveName(let b): @@ -167,6 +181,23 @@ extension ParseError: CustomStringConvertible { return "extended syntax may not be disabled in multi-line mode" case .expectedCalloutArgument: return "expected argument to callout" + + // MARK: Semantic Errors + + case let .unsupported(kind): + return "\(kind) is not currently supported" + case let .deprecatedUnicode(kind): + return "\(kind) is a deprecated Unicode property, and is not supported" + case let .invalidReference(i): + return "no capture numbered \(i)" + case let .duplicateNamedCapture(str): + return "group named '\(str)' already exists" + case let .invalidQuantifierRange(lhs, rhs): + return "range lower bound '\(lhs)' must be less than or equal to upper bound '\(rhs)'" + case let .invalidCharacterRange(from: lhs, to: rhs): + return "character '\(lhs)' must compare less than or equal to '\(rhs)'" + case .notQuantifiable: + return "expression is not quantifiable" } } } diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 9633b607e..e8783dc86 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -157,6 +157,19 @@ 
extension Source { return .init(start ..< currentPosition) } + /// Attempt to eat a given prefix that satisfies a given predicate, with the + /// source location recorded. + mutating func tryEatLocatedPrefix( + maxLength: Int? = nil, + _ f: (Char) -> Bool + ) -> Located? { + let result = recordLoc { src in + src.tryEatPrefix(maxLength: maxLength, f) + } + guard let result = result else { return nil } + return result.map(\.string) + } + /// Throws an expected ASCII character error if not matched mutating func expectASCII() throws -> Located { try recordLoc { src in @@ -217,13 +230,13 @@ extension Source { /// return the scalar value, or throw an error if the string is malformed or /// would overflow the scalar. private static func validateUnicodeScalar( - _ str: String, _ kind: RadixKind - ) throws -> Unicode.Scalar { - let num = try validateNumber(str, UInt32.self, kind) + _ str: Source.Located, _ kind: RadixKind + ) throws -> AST.Atom.Scalar { + let num = try validateNumber(str.value, UInt32.self, kind) guard let scalar = Unicode.Scalar(num) else { throw ParseError.misc("Invalid scalar value U+\(num.hexStr)") } - return scalar + return .init(scalar, str.location) } /// Try to eat a number of a particular type and radix off the front. @@ -266,20 +279,65 @@ extension Source { /// Eat a scalar value from hexadecimal notation off the front private mutating func expectUnicodeScalar( numDigits: Int - ) throws -> Located { - try recordLoc { src in + ) throws -> AST.Atom.Scalar { + let str = try recordLoc { src -> String in let str = src.eat(upToCount: numDigits).string guard str.count == numDigits else { throw ParseError.expectedNumDigits(str, numDigits) } - return try Source.validateUnicodeScalar(str, .hex) + return str } + return try Source.validateUnicodeScalar(str, .hex) + } + + /// Try to lex a sequence of hex digit unicode scalars. + /// + /// UniScalarSequence -> Whitespace? UniScalarSequenceElt+ + /// UniScalarSequenceElt -> HexDigit{1...} Whitespace? 
+ /// + mutating func expectUnicodeScalarSequence( + eating ending: Character + ) throws -> AST.Atom.Kind { + try recordLoc { src in + var scalars = [AST.Atom.Scalar]() + var trivia = [AST.Trivia]() + + // Eat up any leading whitespace. + if let t = src.lexWhitespace() { trivia.append(t) } + + while true { + let str = src.lexUntil { src in + // Hit the ending, stop lexing. + if src.isEmpty || src.peek() == ending { + return true + } + // Eat up trailing whitespace, and stop lexing to record the scalar. + if let t = src.lexWhitespace() { + trivia.append(t) + return true + } + // Not the ending or trivia, must be a digit of the scalar. + return false + } + guard !str.value.isEmpty else { break } + scalars.append(try Source.validateUnicodeScalar(str, .hex)) + } + guard !scalars.isEmpty else { + throw ParseError.expectedNumber("", kind: .hex) + } + try src.expect(ending) + + if scalars.count == 1 { + return .scalar(scalars[0]) + } + return .scalarSequence(.init(scalars, trivia: trivia)) + }.value } /// Eat a scalar off the front, starting from after the /// backslash and base character (e.g. `\u` or `\x`). /// - /// UniScalar -> 'u{' HexDigit{1...} '}' + /// UniScalar -> 'u{' UniScalarSequence '}' /// | 'u' HexDigit{4} /// | 'x{' HexDigit{1...} '}' /// | 'x' HexDigit{0...2} @@ -289,49 +347,60 @@ extension Source { /// mutating func expectUnicodeScalar( escapedCharacter base: Character - ) throws -> Located { + ) throws -> AST.Atom.Kind { try recordLoc { src in + + func nullScalar() -> AST.Atom.Kind { + let pos = src.currentPosition + return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos))) + } + // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. switch base { // Hex numbers. 
- case "u" where src.tryEat("{"), "x" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}").value - return try Source.validateUnicodeScalar(str, .hex) + case "u" where src.tryEat("{"): + return try src.expectUnicodeScalarSequence(eating: "}") + + case "x" where src.tryEat("{"): + let str = try src.lexUntil(eating: "}") + return .scalar(try Source.validateUnicodeScalar(str, .hex)) case "x": // \x expects *up to* 2 digits. - guard let digits = src.tryEatPrefix(maxLength: 2, \.isHexDigit) else { + guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit) + else { // In PCRE, \x without any valid hex digits is \u{0}. // TODO: This doesn't appear to be followed by ICU or Oniguruma, so // could be changed to throw an error if we had a parsing mode for // them. - return Unicode.Scalar(0) + return nullScalar() } - return try Source.validateUnicodeScalar(digits.string, .hex) + return .scalar(try Source.validateUnicodeScalar(digits, .hex)) case "u": - return try src.expectUnicodeScalar(numDigits: 4).value + return .scalar(try src.expectUnicodeScalar(numDigits: 4)) case "U": - return try src.expectUnicodeScalar(numDigits: 8).value + return .scalar(try src.expectUnicodeScalar(numDigits: 8)) // Octal numbers. case "o" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}").value - return try Source.validateUnicodeScalar(str, .octal) + let str = try src.lexUntil(eating: "}") + return .scalar(try Source.validateUnicodeScalar(str, .octal)) case "0": // We can read *up to* 3 more octal digits. // FIXME: PCRE can only read up to 2 octal digits, if we get a strict // PCRE mode, we should limit it here. 
- guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else { - return Unicode.Scalar(0) + guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit) + else { + return nullScalar() } - return try Source.validateUnicodeScalar(digits.string, .octal) + return .scalar(try Source.validateUnicodeScalar(digits, .octal)) default: fatalError("Unexpected scalar start") } - } + }.value } /// Try to consume a quantifier @@ -434,13 +503,22 @@ extension Source { private mutating func lexUntil( _ predicate: (inout Source) throws -> Bool ) rethrows -> Located { + // We track locations outside of recordLoc, as the predicate may advance the + // input when we hit the end, and we don't want that to affect the location + // of what was lexed in the `result`. We still want the recordLoc call to + // attach locations to any thrown errors though. + // TODO: We should find a better way of doing this, `lexUntil` seems full + // of footguns. + let start = currentPosition + var end = currentPosition + var result = "" try recordLoc { src in - var result = "" while try !predicate(&src) { result.append(src.eat()) + end = src.currentPosition } - return result } + return .init(result, start ..< end) } private mutating func lexUntil(eating end: String) throws -> Located { @@ -576,6 +654,16 @@ extension Source { // inside a custom character class (and only treats whitespace as // non-semantic there for the extra-extended `(?xx)` mode). If we get a // strict-PCRE mode, we'll need to add a case for that. + return lexWhitespace() + } + + /// Try to consume whitespace as trivia + /// + /// Whitespace -> WhitespaceChar+ + /// + /// Unlike `lexNonSemanticWhitespace`, this will always attempt to lex + /// whitespace. + mutating func lexWhitespace() -> AST.Trivia? { let trivia: Located? 
= recordLoc { src in src.tryEatPrefix(\.isPatternWhitespace)?.string } @@ -616,7 +704,7 @@ extension Source { case "i": return advanceAndReturn(.caseInsensitive) case "J": return advanceAndReturn(.allowDuplicateGroupNames) case "m": return advanceAndReturn(.multiline) - case "n": return advanceAndReturn(.noAutoCapture) + case "n": return advanceAndReturn(.namedCapturesOnly) case "s": return advanceAndReturn(.singleLine) case "U": return advanceAndReturn(.reluctantByDefault) case "x": @@ -914,6 +1002,10 @@ extension Source { } // TODO: (name:) + // If (?n) is set, a bare (...) group is non-capturing. + if context.syntax.contains(.namedCapturesOnly) { + return .nonCapture + } return .capture } } @@ -1149,7 +1241,7 @@ extension Source { // We should either have a unicode scalar. if src.tryEat(sequence: "U+") { - let str = try src.lexUntil(eating: "}").value + let str = try src.lexUntil(eating: "}") return .scalar(try Source.validateUnicodeScalar(str, .hex)) } @@ -1577,8 +1669,7 @@ extension Source { switch char { // Hexadecimal and octal unicode scalars. 
case "u", "x", "U", "o", "0": - return try .scalar( - src.expectUnicodeScalar(escapedCharacter: char).value) + return try src.expectUnicodeScalar(escapedCharacter: char) default: break } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index ec6e1c26c..112f32358 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -227,9 +227,6 @@ extension Parser { if let (amt, kind, trivia) = try source.lexQuantifier(context: context) { let location = loc(_start) - guard operand.isQuantifiable else { - throw Source.LocatedError(ParseError.notQuantifiable, location) - } result.append(.quantification( .init(amt, kind, operand, location, trivia: trivia))) } else { @@ -287,23 +284,34 @@ extension Parser { private mutating func applySyntaxOptions( of opts: AST.MatchingOptionSequence ) { - // We skip this for multi-line, as extended syntax is always enabled there. - if context.syntax.contains(.multilineExtendedSyntax) { return } + func mapOption(_ option: SyntaxOptions, + _ pred: (AST.MatchingOption) -> Bool) { + if opts.resetsCurrentOptions { + context.syntax.remove(option) + } + if opts.adding.contains(where: pred) { + context.syntax.insert(option) + } + if opts.removing.contains(where: pred) { + context.syntax.remove(option) + } + } + func mapOption(_ option: SyntaxOptions, _ kind: AST.MatchingOption.Kind) { + mapOption(option, { $0.kind == kind }) + } - // Check if we're introducing or removing extended syntax. + // (?n) + mapOption(.namedCapturesOnly, .namedCapturesOnly) + + // (?x), (?xx) + // We skip this for multi-line, as extended syntax is always enabled there. // TODO: PCRE differentiates between (?x) and (?xx) where only the latter // handles non-semantic whitespace in a custom character class. Other // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, // treat (?x) and (?xx) as the same option here. 
If we ever get a strict // PCRE mode, we will need to change this to handle that. - if opts.resetsCurrentOptions { - context.syntax.remove(.extendedSyntax) - } - if opts.adding.contains(where: \.isAnyExtended) { - context.syntax.insert(.extendedSyntax) - } - if opts.removing.contains(where: \.isAnyExtended) { - context.syntax.remove(.extendedSyntax) + if !context.syntax.contains(.multilineExtendedSyntax) { + mapOption(.extendedSyntax, \.isAnyExtended) } } @@ -532,11 +540,6 @@ extension Parser { // Range between atoms. if let (dashLoc, rhs) = try source.lexCustomCharClassRangeEnd(context: context) { - guard atom.isValidCharacterClassRangeBound && - rhs.isValidCharacterClassRangeBound else { - throw ParseError.invalidCharacterClassRangeOperand - } - // TODO: Validate lower <= upper? members.append(.range(.init(atom, dashLoc, rhs))) continue } @@ -547,13 +550,31 @@ extension Parser { } } +public enum ASTStage { + /// The regex is parsed, and a syntactically valid AST is returned. Otherwise + /// an error is thrown. This is useful for e.g syntax coloring. + case syntactic + + /// The regex is parsed, and a syntactically and semantically valid AST is + /// returned. Otherwise an error is thrown. A semantically valid AST has been + /// checked for e.g unsupported constructs and invalid backreferences. + case semantic +} + public func parse( - _ regex: S, _ syntax: SyntaxOptions + _ regex: S, _ stage: ASTStage, _ syntax: SyntaxOptions ) throws -> AST where S.SubSequence == Substring { let source = Source(String(regex)) var parser = Parser(source, syntax: syntax) - return try parser.parse() + let ast = try parser.parse() + switch stage { + case .syntactic: + break + case .semantic: + try validate(ast) + } + return ast } /// Retrieve the default set of syntax options that a delimiter and literal @@ -580,11 +601,12 @@ fileprivate func defaultSyntaxOptions( /// Parses a given regex string with delimiters, inferring the syntax options /// from the delimiters used. 
public func parseWithDelimiters( - _ regex: S + _ regex: S, _ stage: ASTStage ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) do { - return try parse(contents, defaultSyntaxOptions(delim, contents: contents)) + let syntax = defaultSyntaxOptions(delim, contents: contents) + return try parse(contents, stage, syntax) } catch let error as LocatedErrorProtocol { // Convert the range in 'contents' to the range in 'regex'. let delimCount = delim.opening.count diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift new file mode 100644 index 000000000..9d5ae4576 --- /dev/null +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -0,0 +1,407 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +/// Validate a regex AST for semantic validity. Once bytecode is emitted at +/// compile time, this could potentially be subsumed by the bytecode generator. +fileprivate struct RegexValidator { + let ast: AST + let captures: CaptureList + + init(_ ast: AST) { + self.ast = ast + self.captures = ast.captureList + } + + func error(_ kind: ParseError, at loc: SourceLocation) -> Error { + Source.LocatedError(kind, loc) + } +} + +extension String { + fileprivate var quoted: String { "'\(self)'" } +} + +extension RegexValidator { + func validate() throws { + for opt in ast.globalOptions?.options ?? 
[] { + try validateGlobalMatchingOption(opt) + } + try validateCaptures() + try validateNode(ast.root) + } + + func validateGlobalMatchingOption(_ opt: AST.GlobalMatchingOption) throws { + switch opt.kind { + case .limitDepth, .limitHeap, .limitMatch, .notEmpty, .notEmptyAtStart, + .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, + .unicodeProperties: + // These are PCRE specific, and not something we're likely to ever + // support. + throw error(.unsupported("global matching option"), at: opt.location) + + case .newlineMatching: + // We have implemented the correct behavior for multi-line literals, but + // these should also affect '.' and '\N' matching, which we haven't + // implemented. + throw error(.unsupported("newline matching mode"), at: opt.location) + + case .newlineSequenceMatching: + // We haven't yet implemented the '\R' matching specifics of these. + throw error( + .unsupported("newline sequence matching mode"), at: opt.location) + } + } + + func validateCaptures() throws { + // TODO: Should this be validated when creating the capture list? + var usedNames = Set() + for capture in captures.captures { + guard let name = capture.name else { continue } + guard usedNames.insert(name).inserted else { + throw error(.duplicateNamedCapture(name), at: capture.location) + } + } + } + + func validateReference(_ ref: AST.Reference) throws { + switch ref.kind { + case .absolute(let i): + guard i <= captures.captures.count else { + throw error(.invalidReference(i), at: ref.innerLoc) + } + case .relative: + throw error(.unsupported("relative capture reference"), at: ref.innerLoc) + case .named: + // TODO: This could be implemented by querying the capture list for an + // index. 
+ throw error(.unsupported("named capture reference"), at: ref.innerLoc) + } + if let recLevel = ref.recursionLevel { + throw error(.unsupported("recursion level"), at: recLevel.location) + } + } + + func validateMatchingOption(_ opt: AST.MatchingOption) throws { + let loc = opt.location + switch opt.kind { + case .allowDuplicateGroupNames: + // Not currently supported as we need to figure out what to do with + // the capture type. + throw error(.unsupported("duplicate group naming"), at: loc) + + case .unicodeWordBoundaries: + throw error(.unsupported("unicode word boundary mode"), at: loc) + + case .textSegmentWordMode, .textSegmentGraphemeMode: + throw error(.unsupported("text segment mode"), at: loc) + + case .byteSemantics: + throw error(.unsupported("byte semantic mode"), at: loc) + + case .caseInsensitive, .possessiveByDefault, .reluctantByDefault, + .unicodeScalarSemantics, .graphemeClusterSemantics, + .singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended, + .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps: + break + } + } + + func validateMatchingOptions(_ opts: AST.MatchingOptionSequence) throws { + for opt in opts.adding { + try validateMatchingOption(opt) + } + for opt in opts.removing { + try validateMatchingOption(opt) + } + } + + func validateBinaryProperty( + _ prop: Unicode.BinaryProperty, at loc: SourceLocation + ) throws { + switch prop { + case .asciiHexDigit, .alphabetic, .bidiMirrored, .cased, .caseIgnorable, + .changesWhenCasefolded, .changesWhenCasemapped, + .changesWhenNFKCCasefolded, .changesWhenLowercased, + .changesWhenTitlecased, .changesWhenUppercased, .dash, .deprecated, + .defaultIgnorableCodePoint, .diacratic, .extender, + .fullCompositionExclusion, .graphemeBase, .graphemeExtended, .hexDigit, + .idContinue, .ideographic, .idStart, .idsBinaryOperator, + .idsTrinaryOperator, .joinControl, .logicalOrderException, .lowercase, + .math, .noncharacterCodePoint, .patternSyntax, .patternWhitespace, + 
.quotationMark, .radical, .regionalIndicator, .softDotted, + .sentenceTerminal, .terminalPunctuation, .unifiedIdiograph, .uppercase, + .variationSelector, .whitespace, .xidContinue, .xidStart: + break + + case .emojiModifierBase, .emojiModifier, .emoji, .emojiPresentation: + // These are available on macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1. + // TODO: We should ideally check deployment target for such conditionally + // available properties. + break + + case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: + throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) + + case .bidiControl, .compositionExclusion, .emojiComponent, + .extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic, + .otherDefaultIgnorableCodePoint, .otherGraphemeExtended, + .otherIDContinue, .otherIDStart, .otherLowercase, .otherMath, + .otherUppercase, .prependedConcatenationMark: + throw error(.unsupported(prop.rawValue.quoted), at: loc) + } + } + + func validateCharacterProperty( + _ prop: AST.Atom.CharacterProperty, at loc: SourceLocation + ) throws { + // TODO: We could re-add the .other case to diagnose unknown properties + // here instead of in the parser. + // TODO: Should we store an 'inner location' for the contents of `\p{...}`? + switch prop.kind { + case .binary(let b, _): + try validateBinaryProperty(b, at: loc) + case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script, + .scriptExtension: + break + case .pcreSpecial: + throw error(.unsupported("PCRE property"), at: loc) + case .onigurumaSpecial: + throw error(.unsupported("Unicode block property"), at: loc) + } + } + + func validateEscaped( + _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation + ) throws { + switch esc { + case .resetStartOfMatch, .singleDataUnit, + // '\N' needs to be emitted using 'emitAny'. + .notNewline: + throw error(.unsupported("'\\\(esc.character)'"), at: loc) + + // Character classes. 
+ case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, + .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar, + .horizontalWhitespace, .notHorizontalWhitespace, + .verticalTab, .notVerticalTab: + break + + case .newlineSequence: + break + + // Assertions. + case .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, .textSegment, + .notTextSegment, .firstMatchingPositionInSubject: + break + + // Literal escapes. + case .alarm, .backspace, .escape, .formfeed, .newline, .carriageReturn, + .tab: + break + } + } + + func validateAtom(_ atom: AST.Atom, inCustomCharacterClass: Bool) throws { + switch atom.kind { + case .escaped(let esc): + try validateEscaped(esc, at: atom.location) + + case .keyboardControl, .keyboardMeta, .keyboardMetaControl: + // We need to implement the scalar computations for these. + throw error(.unsupported("control sequence"), at: atom.location) + + case .property(let p): + try validateCharacterProperty(p, at: atom.location) + + case .backreference(let r): + try validateReference(r) + + case .subpattern: + throw error(.unsupported("subpattern"), at: atom.location) + + case .callout: + // These are PCRE and Oniguruma specific, supporting them is future work. + throw error(.unsupported("callout"), at: atom.location) + + case .backtrackingDirective: + // These are PCRE-specific, and are unlikely to be fully supported. + throw error(.unsupported("backtracking directive"), at: atom.location) + + case .changeMatchingOptions(let opts): + try validateMatchingOptions(opts) + + case .namedCharacter: + // TODO: We should error on unknown Unicode scalar names. + break + + case .scalarSequence: + // Not currently supported in a custom character class. 
+ if inCustomCharacterClass { + throw error(.unsupported("scalar sequence in custom character class"), + at: atom.location) + } + + case .char, .scalar, .startOfLine, .endOfLine, .any: + break + } + } + + func validateCustomCharacterClass(_ c: AST.CustomCharacterClass) throws { + for member in c.members { + try validateCharacterClassMember(member) + } + } + + func validateCharacterClassRange( + _ range: AST.CustomCharacterClass.Range + ) throws { + let lhs = range.lhs + let rhs = range.rhs + + try validateAtom(lhs, inCustomCharacterClass: true) + try validateAtom(rhs, inCustomCharacterClass: true) + + guard lhs.isValidCharacterClassRangeBound else { + throw error(.invalidCharacterClassRangeOperand, at: lhs.location) + } + guard rhs.isValidCharacterClassRangeBound else { + throw error(.invalidCharacterClassRangeOperand, at: rhs.location) + } + + guard let lhsChar = lhs.literalCharacterValue else { + throw error( + .unsupported("character class range operand"), at: lhs.location) + } + + guard let rhsChar = rhs.literalCharacterValue else { + throw error( + .unsupported("character class range operand"), at: rhs.location) + } + + guard lhsChar <= rhsChar else { + throw error( + .invalidCharacterRange(from: lhsChar, to: rhsChar), at: range.dashLoc) + } + } + + func validateCharacterClassMember( + _ member: AST.CustomCharacterClass.Member + ) throws { + switch member { + case .custom(let c): + try validateCustomCharacterClass(c) + + case .range(let r): + try validateCharacterClassRange(r) + + case .atom(let a): + try validateAtom(a, inCustomCharacterClass: true) + + case .setOperation(let lhs, _, let rhs): + for lh in lhs { try validateCharacterClassMember(lh) } + for rh in rhs { try validateCharacterClassMember(rh) } + + case .quote, .trivia: + break + } + } + + func validateGroup(_ group: AST.Group) throws { + let kind = group.kind + switch kind.value { + case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead: + break + + case .balancedCapture: + // 
These are .NET specific, and kinda niche. + throw error(.unsupported("balanced capture"), at: kind.location) + + case .nonCaptureReset: + // We need to figure out how these interact with typed captures. + throw error(.unsupported("branch reset group"), at: kind.location) + + case .atomicNonCapturing: + throw error(.unsupported("atomic group"), at: kind.location) + + case .nonAtomicLookahead: + throw error(.unsupported("non-atomic lookahead"), at: kind.location) + + case .lookbehind, .negativeLookbehind, .nonAtomicLookbehind: + throw error(.unsupported("lookbehind"), at: kind.location) + + case .scriptRun, .atomicScriptRun: + throw error(.unsupported("script run"), at: kind.location) + + case .changeMatchingOptions(let opts): + try validateMatchingOptions(opts) + } + try validateNode(group.child) + } + + func validateQuantification(_ quant: AST.Quantification) throws { + try validateNode(quant.child) + guard quant.child.isQuantifiable else { + throw error(.notQuantifiable, at: quant.child.location) + } + switch quant.amount.value { + case .range(let lhs, let rhs): + guard lhs.value <= rhs.value else { + throw error( + .invalidQuantifierRange(lhs.value, rhs.value), at: quant.location) + } + case .zeroOrMore, .oneOrMore, .zeroOrOne, .exactly, .nOrMore, .upToN: + break + } + } + + func validateNode(_ node: AST.Node) throws { + switch node { + case .alternation(let a): + for branch in a.children { + try validateNode(branch) + } + case .concatenation(let c): + for child in c.children { + try validateNode(child) + } + + case .group(let g): + try validateGroup(g) + + case .conditional(let c): + // Note even once we get runtime support for this, we need to change the + // parsing to incorporate what is specified in the syntax proposal. 
+ throw error(.unsupported("conditional"), at: c.location) + + case .quantification(let q): + try validateQuantification(q) + + case .atom(let a): + try validateAtom(a, inCustomCharacterClass: false) + + case .customCharacterClass(let c): + try validateCustomCharacterClass(c) + + case .absentFunction(let a): + // These are Oniguruma specific. + throw error(.unsupported("absent function"), at: a.location) + + case .quote, .trivia, .empty: + break + } + } +} + +/// Check a regex AST for semantic validity. +public func validate(_ ast: AST) throws { + try RegexValidator(ast).validate() +} diff --git a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift index 0a6270f1b..dbfe5f2d6 100644 --- a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift +++ b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift @@ -63,6 +63,9 @@ public struct SyntaxOptions: OptionSet { return [Self(1 << 6), .extendedSyntax] } + /// `(?n)` + public static var namedCapturesOnly: Self { Self(1 << 7) } + /* /// `*` == `[[:digit:]]*` == `\d*` diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index a9cf6b424..b8937d518 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -138,6 +138,9 @@ extension AST.Atom { switch kind { case .escaped(let c): return "\\\(c.character)" + case .scalarSequence(let s): + return s.scalars.map(\.value.halfWidthCornerQuoted).joined() + case .namedCharacter(let charName): return "\\N{\(charName)}" diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 21fcfa703..d30cab209 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -80,10 +80,16 @@ extension Compiler.ByteCodeGen { } case .endOfSubjectBeforeNewline: - builder.buildAssert { (input, pos, bounds) in + builder.buildAssert { 
[semanticLevel = options.semanticLevel] (input, pos, bounds) in if pos == input.endIndex { return true } - return input.index(after: pos) == input.endIndex - && input[pos].isNewline + switch semanticLevel { + case .graphemeCluster: + return input.index(after: pos) == input.endIndex + && input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars.index(after: pos) == input.endIndex + && input.unicodeScalars[pos].isNewline + } } case .endOfSubject: @@ -115,8 +121,14 @@ extension Compiler.ByteCodeGen { case .startOfLine: if options.anchorsMatchNewlines { - builder.buildAssert { (input, pos, bounds) in - pos == input.startIndex || input[input.index(before: pos)].isNewline + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in + if pos == input.startIndex { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: pos)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline + } } } else { builder.buildAssert { (input, pos, bounds) in @@ -126,8 +138,14 @@ extension Compiler.ByteCodeGen { case .endOfLine: if options.anchorsMatchNewlines { - builder.buildAssert { (input, pos, bounds) in - pos == input.endIndex || input[pos].isNewline + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in + if pos == input.endIndex { return true } + switch semanticLevel { + case .graphemeCluster: + return input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars[pos].isNewline + } } } else { builder.buildAssert { (input, pos, bounds) in @@ -168,7 +186,15 @@ extension Compiler.ByteCodeGen { } mutating func emitCharacter(_ c: Character) throws { - // FIXME: Does semantic level matter? 
+ // Unicode scalar matches the specific scalars that comprise a character + if options.semanticLevel == .unicodeScalar { + print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars") + for scalar in c.unicodeScalars { + try emitScalar(scalar) + } + return + } + if options.isCaseInsensitive && c.isCased { // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) builder.buildConsume { input, bounds in @@ -625,22 +651,44 @@ extension Compiler.ByteCodeGen { try emitAtom(a) case let .quotedLiteral(s): - // TODO: Should this incorporate options? - if options.isCaseInsensitive { - // TODO: buildCaseInsensitiveMatchSequence(c) or alternative - builder.buildConsume { input, bounds in - var iterator = s.makeIterator() + if options.semanticLevel == .graphemeCluster { + if options.isCaseInsensitive { + // TODO: buildCaseInsensitiveMatchSequence(c) or alternative + builder.buildConsume { input, bounds in + var iterator = s.makeIterator() + var currentIndex = bounds.lowerBound + while let ch = iterator.next() { + guard currentIndex < bounds.upperBound, + ch.lowercased() == input[currentIndex].lowercased() + else { return nil } + input.formIndex(after: ¤tIndex) + } + return currentIndex + } + } else { + builder.buildMatchSequence(s) + } + } else { + builder.buildConsume { + [caseInsensitive = options.isCaseInsensitive] input, bounds in + // TODO: Case folding + var iterator = s.unicodeScalars.makeIterator() var currentIndex = bounds.lowerBound - while let ch = iterator.next() { - guard currentIndex < bounds.upperBound, - ch.lowercased() == input[currentIndex].lowercased() - else { return nil } - input.formIndex(after: ¤tIndex) + while let scalar = iterator.next() { + guard currentIndex < bounds.upperBound else { return nil } + if caseInsensitive { + if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping { + return nil + } + } else { + if scalar != input.unicodeScalars[currentIndex] { + return 
nil + } + } + input.unicodeScalars.formIndex(after: ¤tIndex) } return currentIndex } - } else { - builder.buildMatchSequence(s) } case let .regexLiteral(l): diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 47faa23ed..1c20761c8 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -38,7 +38,7 @@ class Compiler { func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { - let ast = try parse(regex, syntax) + let ast = try parse(regex, .semantic, syntax) let program = try Compiler(ast: ast).emit() return Executor(program: program) } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 356b7cc4b..48f353e52 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -111,6 +111,51 @@ extension DSLTree.Atom { } } +extension String { + /// Compares this string to `other` using the loose matching rule UAX44-LM2, + /// which ignores case, whitespace, underscores, and nearly all medial + /// hyphens. 
+ /// + /// FIXME: Only ignore medial hyphens + /// FIXME: Special case for U+1180 HANGUL JUNGSEONG O-E + /// See https://www.unicode.org/reports/tr44/#Matching_Rules + fileprivate func isEqualByUAX44LM2(to other: String) -> Bool { + var index = startIndex + var otherIndex = other.startIndex + + while index < endIndex && otherIndex < other.endIndex { + if self[index].isWhitespace || self[index] == "-" || self[index] == "_" { + formIndex(after: &index) + continue + } + if other[otherIndex].isWhitespace || other[otherIndex] == "-" || other[otherIndex] == "_" { + other.formIndex(after: &otherIndex) + continue + } + + if self[index] != other[otherIndex] && self[index].lowercased() != other[otherIndex].lowercased() { + return false + } + + formIndex(after: &index) + other.formIndex(after: &otherIndex) + } + return index == endIndex && otherIndex == other.endIndex + } +} + +func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram.ConsumeFunction { + let consume = opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithSingleScalar + : consumeScalar + + return consume(propertyScalarPredicate { + // FIXME: name aliases not covered by $0.nameAlias are missed + // e.g. U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases + $0.name?.isEqualByUAX44LM2(to: name) == true + || $0.nameAlias?.isEqualByUAX44LM2(to: name) == true + }) +} // TODO: This is basically an AST interpreter, which would // be good or interesting to build regardless, and serves @@ -131,6 +176,13 @@ extension AST.Atom { } } + var singleScalar: UnicodeScalar? { + switch kind { + case .scalar(let s): return s.value + default: return nil + } + } + func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? 
{ @@ -148,7 +200,7 @@ extension AST.Atom { case let .scalar(s): assertionFailure( "Should have been handled by tree conversion") - return consumeScalar { $0 == s } + return consumeScalar { $0 == s.value } case let .char(c): assertionFailure( @@ -167,10 +219,7 @@ extension AST.Atom { return try p.generateConsumer(opts) case let .namedCharacter(name): - return consumeScalarProp { - // TODO: alias? casing? - $0.name == name || $0.nameAlias == name - } + return consumeName(name, opts: opts) case .any: assertionFailure( @@ -181,9 +230,9 @@ extension AST.Atom { // handled in emitAssertion return nil - case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, - .backreference, .subpattern, .callout, .backtrackingDirective, - .changeMatchingOptions: + case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta, + .keyboardMetaControl, .backreference, .subpattern, .callout, + .backtrackingDirective, .changeMatchingOptions: // FIXME: implement return nil } @@ -312,8 +361,9 @@ extension DSLTree.CustomCharacterClass { } } if isInverted { - // FIXME: semantic level - return input.index(after: bounds.lowerBound) + return opts.semanticLevel == .graphemeCluster + ? 
input.index(after: bounds.lowerBound) + : input.unicodeScalars.index(after: bounds.lowerBound) } return nil } @@ -321,38 +371,26 @@ extension DSLTree.CustomCharacterClass { } // NOTE: Conveniences, though not most performant -private func consumeScalarScript( - _ s: Unicode.Script -) -> MEProgram.ConsumeFunction { - consumeScalar { - Unicode.Script($0) == s - } +typealias ScalarPredicate = (UnicodeScalar) -> Bool + +private func scriptScalarPredicate(_ s: Unicode.Script) -> ScalarPredicate { + { Unicode.Script($0) == s } } -private func consumeScalarScriptExtension( - _ s: Unicode.Script -) -> MEProgram.ConsumeFunction { - consumeScalar { - let extensions = Unicode.Script.extensions(for: $0) - return extensions.contains(s) - } +private func scriptExtensionScalarPredicate(_ s: Unicode.Script) -> ScalarPredicate { + { Unicode.Script.extensions(for: $0).contains(s) } } -private func consumeScalarGC( - _ gc: Unicode.GeneralCategory -) -> MEProgram.ConsumeFunction { - consumeScalar { gc == $0.properties.generalCategory } +private func categoryScalarPredicate(_ gc: Unicode.GeneralCategory) -> ScalarPredicate { + { gc == $0.properties.generalCategory } } -private func consumeScalarGCs( - _ gcs: [Unicode.GeneralCategory] -) -> MEProgram.ConsumeFunction { - consumeScalar { gcs.contains($0.properties.generalCategory) } +private func categoriesScalarPredicate(_ gcs: [Unicode.GeneralCategory]) -> ScalarPredicate { + { gcs.contains($0.properties.generalCategory) } } -private func consumeScalarProp( - _ p: @escaping (Unicode.Scalar.Properties) -> Bool -) -> MEProgram.ConsumeFunction { - consumeScalar { p($0.properties) } +private func propertyScalarPredicate(_ p: @escaping (Unicode.Scalar.Properties) -> Bool) -> ScalarPredicate { + { p($0.properties) } } + func consumeScalar( - _ p: @escaping (Unicode.Scalar) -> Bool + _ p: @escaping ScalarPredicate ) -> MEProgram.ConsumeFunction { { input, bounds in // TODO: bounds check? 
@@ -364,6 +402,37 @@ func consumeScalar( return nil } } +func consumeCharacterWithLeadingScalar( + _ p: @escaping ScalarPredicate +) -> MEProgram.ConsumeFunction { + { input, bounds in + let curIdx = bounds.lowerBound + if p(input[curIdx].unicodeScalars.first!) { + return input.index(after: curIdx) + } + return nil + } +} +func consumeCharacterWithSingleScalar( + _ p: @escaping ScalarPredicate +) -> MEProgram.ConsumeFunction { + { input, bounds in + let curIdx = bounds.lowerBound + + if input[curIdx].hasExactlyOneScalar && p(input[curIdx].unicodeScalars.first!) { + return input.index(after: curIdx) + } + return nil + } +} + +func consumeFunction( + for opts: MatchingOptions +) -> (@escaping ScalarPredicate) -> MEProgram.ConsumeFunction { + opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithLeadingScalar + : consumeScalar +} extension AST.Atom.CharacterProperty { func generateConsumer( @@ -375,16 +444,15 @@ extension AST.Atom.CharacterProperty { ) -> MEProgram.ConsumeFunction { return { input, bounds in if p(input, bounds) != nil { return nil } - // TODO: semantic level + // TODO: bounds check - return input.unicodeScalars.index( - after: bounds.lowerBound) + return opts.semanticLevel == .graphemeCluster + ? input.index(after: bounds.lowerBound) + : input.unicodeScalars.index(after: bounds.lowerBound) } } - // FIXME: Below is largely scalar based, for convenience, - // but we want a comprehensive treatment to semantic mode - // switching. + let consume = consumeFunction(for: opts) let preInversion: MEProgram.ConsumeFunction = try { switch kind { @@ -395,11 +463,16 @@ extension AST.Atom.CharacterProperty { return input.index(after: bounds.lowerBound) } case .assigned: - return consumeScalar { + return consume { $0.properties.generalCategory != .unassigned } case .ascii: - return consumeScalar(\.isASCII) + // Note: ASCII must look at the whole character, not just the first + // scalar. 
That is, "e\u{301}" is not an ASCII character, even though + // the first scalar is. + return opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithSingleScalar(\.isASCII) + : consumeScalar(\.isASCII) case .generalCategory(let p): return try p.generateConsumer(opts) @@ -410,10 +483,13 @@ extension AST.Atom.CharacterProperty { return value ? cons : invert(cons) case .script(let s): - return consumeScalarScript(s) + return consume(scriptScalarPredicate(s)) case .scriptExtension(let s): - return consumeScalarScriptExtension(s) + return consume(scriptExtensionScalarPredicate(s)) + + case .named(let n): + return consumeName(n, opts: opts) case .posix(let p): return p.generateConsumer(opts) @@ -436,49 +512,51 @@ extension Unicode.BinaryProperty { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { - switch self { + let consume = consumeFunction(for: opts) + // Note if you implement support for any of the below, you need to adjust + // the switch in Sema.swift to not have it be diagnosed as unsupported + // (potentially guarded on deployment version). 
+ switch self { case .asciiHexDigit: - return consumeScalarProp { + return consume(propertyScalarPredicate { $0.isHexDigit && $0.isASCIIHexDigit - } + }) case .alphabetic: - return consumeScalarProp(\.isAlphabetic) + return consume(propertyScalarPredicate(\.isAlphabetic)) case .bidiControl: break - - - case .bidiMirrored: - return consumeScalarProp(\.isBidiMirrored) + case .bidiMirrored: + return consume(propertyScalarPredicate(\.isBidiMirrored)) case .cased: - return consumeScalarProp(\.isCased) + return consume(propertyScalarPredicate(\.isCased)) case .compositionExclusion: break case .caseIgnorable: - return consumeScalarProp(\.isCaseIgnorable) + return consume(propertyScalarPredicate(\.isCaseIgnorable)) case .changesWhenCasefolded: - return consumeScalarProp(\.changesWhenCaseFolded) + return consume(propertyScalarPredicate(\.changesWhenCaseFolded)) case .changesWhenCasemapped: - return consumeScalarProp(\.changesWhenCaseMapped) + return consume(propertyScalarPredicate(\.changesWhenCaseMapped)) case .changesWhenNFKCCasefolded: - return consumeScalarProp(\.changesWhenNFKCCaseFolded) + return consume(propertyScalarPredicate(\.changesWhenNFKCCaseFolded)) case .changesWhenLowercased: - return consumeScalarProp(\.changesWhenLowercased) + return consume(propertyScalarPredicate(\.changesWhenLowercased)) case .changesWhenTitlecased: - return consumeScalarProp(\.changesWhenTitlecased) + return consume(propertyScalarPredicate(\.changesWhenTitlecased)) case .changesWhenUppercased: - return consumeScalarProp(\.changesWhenUppercased) + return consume(propertyScalarPredicate(\.changesWhenUppercased)) case .dash: - return consumeScalarProp(\.isDash) + return consume(propertyScalarPredicate(\.isDash)) case .deprecated: - return consumeScalarProp(\.isDeprecated) + return consume(propertyScalarPredicate(\.isDeprecated)) case .defaultIgnorableCodePoint: - return consumeScalarProp(\.isDefaultIgnorableCodePoint) + return 
consume(propertyScalarPredicate(\.isDefaultIgnorableCodePoint)) case .diacratic: // spelling? - return consumeScalarProp(\.isDiacritic) + return consume(propertyScalarPredicate(\.isDiacritic)) case .emojiModifierBase: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiModifierBase) + return consume(propertyScalarPredicate(\.isEmojiModifierBase)) } else { throw Unsupported( "isEmojiModifierBase on old OSes") @@ -487,59 +565,59 @@ extension Unicode.BinaryProperty { break case .emojiModifier: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiModifier) + return consume(propertyScalarPredicate(\.isEmojiModifier)) } else { throw Unsupported("isEmojiModifier on old OSes") } case .emoji: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmoji) + return consume(propertyScalarPredicate(\.isEmoji)) } else { throw Unsupported("isEmoji on old OSes") } case .emojiPresentation: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiPresentation) + return consume(propertyScalarPredicate(\.isEmojiPresentation)) } else { throw Unsupported( "isEmojiPresentation on old OSes") } case .extender: - return consumeScalarProp(\.isExtender) + return consume(propertyScalarPredicate(\.isExtender)) case .extendedPictographic: break // NOTE: Stdlib has this data internally case .fullCompositionExclusion: - return consumeScalarProp(\.isFullCompositionExclusion) + return consume(propertyScalarPredicate(\.isFullCompositionExclusion)) case .graphemeBase: - return consumeScalarProp(\.isGraphemeBase) + return consume(propertyScalarPredicate(\.isGraphemeBase)) case .graphemeExtended: - return consumeScalarProp(\.isGraphemeExtend) + return consume(propertyScalarPredicate(\.isGraphemeExtend)) case .graphemeLink: break case .hexDigit: - return consumeScalarProp(\.isHexDigit) + return 
consume(propertyScalarPredicate(\.isHexDigit)) case .hyphen: break case .idContinue: - return consumeScalarProp(\.isIDContinue) + return consume(propertyScalarPredicate(\.isIDContinue)) case .ideographic: - return consumeScalarProp(\.isIdeographic) + return consume(propertyScalarPredicate(\.isIdeographic)) case .idStart: - return consumeScalarProp(\.isIDStart) + return consume(propertyScalarPredicate(\.isIDStart)) case .idsBinaryOperator: - return consumeScalarProp(\.isIDSBinaryOperator) + return consume(propertyScalarPredicate(\.isIDSBinaryOperator)) case .idsTrinaryOperator: - return consumeScalarProp(\.isIDSTrinaryOperator) + return consume(propertyScalarPredicate(\.isIDSTrinaryOperator)) case .joinControl: - return consumeScalarProp(\.isJoinControl) + return consume(propertyScalarPredicate(\.isJoinControl)) case .logicalOrderException: - return consumeScalarProp(\.isLogicalOrderException) + return consume(propertyScalarPredicate(\.isLogicalOrderException)) case .lowercase: - return consumeScalarProp(\.isLowercase) + return consume(propertyScalarPredicate(\.isLowercase)) case .math: - return consumeScalarProp(\.isMath) + return consume(propertyScalarPredicate(\.isMath)) case .noncharacterCodePoint: - return consumeScalarProp(\.isNoncharacterCodePoint) + return consume(propertyScalarPredicate(\.isNoncharacterCodePoint)) case .otherAlphabetic: break case .otherDefaultIgnorableCodePoint: @@ -557,37 +635,37 @@ extension Unicode.BinaryProperty { case .otherUppercase: break case .patternSyntax: - return consumeScalarProp(\.isPatternSyntax) + return consume(propertyScalarPredicate(\.isPatternSyntax)) case .patternWhitespace: - return consumeScalarProp(\.isPatternWhitespace) + return consume(propertyScalarPredicate(\.isPatternWhitespace)) case .prependedConcatenationMark: break case .quotationMark: - return consumeScalarProp(\.isQuotationMark) + return consume(propertyScalarPredicate(\.isQuotationMark)) case .radical: - return consumeScalarProp(\.isRadical) + return 
consume(propertyScalarPredicate(\.isRadical)) case .regionalIndicator: - return consumeScalar { s in + return consume { s in (0x1F1E6...0x1F1FF).contains(s.value) } case .softDotted: - return consumeScalarProp(\.isSoftDotted) + return consume(propertyScalarPredicate(\.isSoftDotted)) case .sentenceTerminal: - return consumeScalarProp(\.isSentenceTerminal) + return consume(propertyScalarPredicate(\.isSentenceTerminal)) case .terminalPunctuation: - return consumeScalarProp(\.isTerminalPunctuation) + return consume(propertyScalarPredicate(\.isTerminalPunctuation)) case .unifiedIdiograph: // spelling? - return consumeScalarProp(\.isUnifiedIdeograph) + return consume(propertyScalarPredicate(\.isUnifiedIdeograph)) case .uppercase: - return consumeScalarProp(\.isUppercase) + return consume(propertyScalarPredicate(\.isUppercase)) case .variationSelector: - return consumeScalarProp(\.isVariationSelector) + return consume(propertyScalarPredicate(\.isVariationSelector)) case .whitespace: - return consumeScalarProp(\.isWhitespace) + return consume(propertyScalarPredicate(\.isWhitespace)) case .xidContinue: - return consumeScalarProp(\.isXIDContinue) + return consume(propertyScalarPredicate(\.isXIDContinue)) case .xidStart: - return consumeScalarProp(\.isXIDStart) + return consume(propertyScalarPredicate(\.isXIDStart)) case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: throw Unsupported("Unicode-deprecated: \(self)") @@ -602,42 +680,44 @@ extension Unicode.POSIXProperty { func generateConsumer( _ opts: MatchingOptions ) -> MEProgram.ConsumeFunction { - // FIXME: semantic levels, modes, etc + let consume = consumeFunction(for: opts) + + // FIXME: modes, etc switch self { case .alnum: - return consumeScalarProp { + return consume(propertyScalarPredicate { $0.isAlphabetic || $0.numericType != nil - } + }) case .blank: - return consumeScalar { s in + return consume { s in s.properties.generalCategory == .spaceSeparator || s == "\t" } case .graph: - return 
consumeScalarProp { p in + return consume(propertyScalarPredicate { p in !( p.isWhitespace || p.generalCategory == .control || p.generalCategory == .surrogate || p.generalCategory == .unassigned ) - } + }) case .print: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in // FIXME: better def p.generalCategory != .control - } + }) case .word: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in // FIXME: better def p.isAlphabetic || p.numericType != nil || p.isJoinControl || p.isDash// marks and connectors... - } + }) case .xdigit: - return consumeScalarProp(\.isHexDigit) // or number + return consume(propertyScalarPredicate(\.isHexDigit)) // or number } } @@ -648,112 +728,115 @@ extension Unicode.ExtendedGeneralCategory { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { + let consume = consumeFunction(for: opts) + switch self { case .letter: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .uppercaseLetter, .lowercaseLetter, .titlecaseLetter, .modifierLetter, .otherLetter - ]) + ])) case .mark: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .nonspacingMark, .spacingMark, .enclosingMark - ]) + ])) case .number: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .decimalNumber, .letterNumber, .otherNumber - ]) + ])) case .symbol: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .mathSymbol, .currencySymbol, .modifierSymbol, .otherSymbol - ]) + ])) case .punctuation: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .connectorPunctuation, .dashPunctuation, .openPunctuation, .closePunctuation, .initialPunctuation, .finalPunctuation, .otherPunctuation - ]) + ])) case .separator: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .spaceSeparator, .lineSeparator, .paragraphSeparator - ]) + ])) case .other: - return consumeScalarGCs([ + 
return consume(categoriesScalarPredicate([ .control, .format, .surrogate, .privateUse, .unassigned - ]) + ])) case .casedLetter: - throw Unsupported( - "TODO: cased letter? not the property?") + return consume(categoriesScalarPredicate([ + .uppercaseLetter, .lowercaseLetter, .titlecaseLetter + ])) case .control: - return consumeScalarGC(.control) + return consume(categoryScalarPredicate(.control)) case .format: - return consumeScalarGC(.format) + return consume(categoryScalarPredicate(.format)) case .unassigned: - return consumeScalarGC(.unassigned) + return consume(categoryScalarPredicate(.unassigned)) case .privateUse: - return consumeScalarGC(.privateUse) + return consume(categoryScalarPredicate(.privateUse)) case .surrogate: - return consumeScalarGC(.surrogate) + return consume(categoryScalarPredicate(.surrogate)) case .lowercaseLetter: - return consumeScalarGC(.lowercaseLetter) + return consume(categoryScalarPredicate(.lowercaseLetter)) case .modifierLetter: - return consumeScalarGC(.modifierLetter) + return consume(categoryScalarPredicate(.modifierLetter)) case .otherLetter: - return consumeScalarGC(.otherLetter) + return consume(categoryScalarPredicate(.otherLetter)) case .titlecaseLetter: - return consumeScalarGC(.titlecaseLetter) + return consume(categoryScalarPredicate(.titlecaseLetter)) case .uppercaseLetter: - return consumeScalarGC(.uppercaseLetter) + return consume(categoryScalarPredicate(.uppercaseLetter)) case .spacingMark: - return consumeScalarGC(.spacingMark) + return consume(categoryScalarPredicate(.spacingMark)) case .enclosingMark: - return consumeScalarGC(.enclosingMark) + return consume(categoryScalarPredicate(.enclosingMark)) case .nonspacingMark: - return consumeScalarGC(.nonspacingMark) + return consume(categoryScalarPredicate(.nonspacingMark)) case .decimalNumber: - return consumeScalarGC(.decimalNumber) + return consume(categoryScalarPredicate(.decimalNumber)) case .letterNumber: - return consumeScalarGC(.letterNumber) + return 
consume(categoryScalarPredicate(.letterNumber)) case .otherNumber: - return consumeScalarGC(.otherNumber) + return consume(categoryScalarPredicate(.otherNumber)) case .connectorPunctuation: - return consumeScalarGC(.connectorPunctuation) + return consume(categoryScalarPredicate(.connectorPunctuation)) case .dashPunctuation: - return consumeScalarGC(.dashPunctuation) + return consume(categoryScalarPredicate(.dashPunctuation)) case .closePunctuation: - return consumeScalarGC(.closePunctuation) + return consume(categoryScalarPredicate(.closePunctuation)) case .finalPunctuation: - return consumeScalarGC(.finalPunctuation) + return consume(categoryScalarPredicate(.finalPunctuation)) case .initialPunctuation: - return consumeScalarGC(.initialPunctuation) + return consume(categoryScalarPredicate(.initialPunctuation)) case .otherPunctuation: - return consumeScalarGC(.otherPunctuation) + return consume(categoryScalarPredicate(.otherPunctuation)) case .openPunctuation: - return consumeScalarGC(.openPunctuation) + return consume(categoryScalarPredicate(.openPunctuation)) case .currencySymbol: - return consumeScalarGC(.currencySymbol) + return consume(categoryScalarPredicate(.currencySymbol)) case .modifierSymbol: - return consumeScalarGC(.modifierSymbol) + return consume(categoryScalarPredicate(.modifierSymbol)) case .mathSymbol: - return consumeScalarGC(.mathSymbol) + return consume(categoryScalarPredicate(.mathSymbol)) case .otherSymbol: - return consumeScalarGC(.otherSymbol) + return consume(categoryScalarPredicate(.otherSymbol)) case .lineSeparator: - return consumeScalarGC(.lineSeparator) + return consume(categoryScalarPredicate(.lineSeparator)) case .paragraphSeparator: - return consumeScalarGC(.paragraphSeparator) + return consume(categoryScalarPredicate(.paragraphSeparator)) case .spaceSeparator: - return consumeScalarGC(.spaceSeparator) + return consume(categoryScalarPredicate(.spaceSeparator)) } } } diff --git a/Sources/_StringProcessing/MatchingOptions.swift 
b/Sources/_StringProcessing/MatchingOptions.swift index 665715a60..f5c554bdc 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -117,7 +117,6 @@ extension MatchingOptions { // Deprecated CharacterClass.MatchLevel API extension MatchingOptions { - @available(*, deprecated) var matchLevel: _CharacterClassModel.MatchLevel { switch semanticLevel { case .graphemeCluster: @@ -135,7 +134,7 @@ extension MatchingOptions { case caseInsensitive case allowDuplicateGroupNames case multiline - case noAutoCapture + case namedCapturesOnly case singleLine case reluctantByDefault @@ -174,8 +173,8 @@ extension MatchingOptions { self = .allowDuplicateGroupNames case .multiline: self = .multiline - case .noAutoCapture: - self = .noAutoCapture + case .namedCapturesOnly: + self = .namedCapturesOnly case .singleLine: self = .singleLine case .reluctantByDefault: diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 1b5c2a4c5..601447968 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -671,13 +671,19 @@ extension AST.Atom { } var _dslBase: String { + func scalarLiteral(_ s: UnicodeScalar) -> String { + let hex = String(s.value, radix: 16, uppercase: true) + return "\\u{\(hex)}" + } switch kind { case let .char(c): return String(c) case let .scalar(s): - let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" + return scalarLiteral(s.value) + + case let .scalarSequence(seq): + return seq.scalarValues.map(scalarLiteral).joined() case let .property(p): return p._dslBase @@ -769,13 +775,9 @@ extension AST.Atom { var _regexBase: String { switch kind { - case let .char(c): - return String(c) - - case let .scalar(s): - let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" - + case .char, .scalar, .scalarSequence: + return literalStringValue! 
+ case let .property(p): return p._regexBase diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index ef98a7b8f..79a515033 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -60,18 +60,24 @@ extension AST.Node { var result = "" var idx = idx while idx < astChildren.endIndex { - let atom: AST.Atom? = astChildren[idx].as() + guard let atom: AST.Atom = astChildren[idx].as() else { break } // TODO: For printing, nice to coalesce // scalars literals too. We likely need a different // approach even before we have a better IR. - guard let char = atom?.singleCharacter else { + if let char = atom.singleCharacter { + result.append(char) + } else if let scalar = atom.singleScalar { + result.append(Character(scalar)) + } else if case .scalarSequence(let seq) = atom.kind { + result += seq.scalarValues.map(Character.init) + } else { break } - result.append(char) + astChildren.formIndex(after: &idx) } - return result.count <= 1 ? nil : (idx, result) + return result.isEmpty ? nil : (idx, result) } // No need to nest single children concatenations @@ -96,7 +102,7 @@ extension AST.Node { curIdx = nextIdx } else { children.append(astChildren[curIdx].dslTreeNode) - children.formIndex(after: &curIdx) + astChildren.formIndex(after: &curIdx) } } return .concatenation(children) @@ -132,7 +138,15 @@ extension AST.Node { return .trivia(v.contents) case let .atom(v): - return .atom(v.dslTreeAtom) + switch v.kind { + case .scalarSequence(let seq): + // Scalar sequences are splatted into concatenated scalars, which + // becomes a quoted literal. Sequences nested in concatenations have + // already been coalesced, this just handles the lone atom case. 
+ return .quotedLiteral(String(seq.scalarValues.map(Character.init))) + default: + return .atom(v.dslTreeAtom) + } case let .customCharacterClass(ccc): return .customCharacterClass(ccc.dslTreeClass) @@ -207,7 +221,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .scalar(s) + case let .scalar(s): return .char(Character(s.value)) case .any: return .any case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 23222da00..6dd8e17b6 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -17,7 +17,7 @@ extension Regex where Output == AnyRegexOutput { /// /// - Parameter pattern: The regular expression. public init(_ pattern: String) throws { - self.init(ast: try parse(pattern, .traditional)) + self.init(ast: try parse(pattern, .semantic, .traditional)) } } @@ -31,7 +31,7 @@ extension Regex { _ pattern: String, as: Output.Type = Output.self ) throws { - self.init(ast: try parse(pattern, .traditional)) + self.init(ast: try parse(pattern, .semantic, .traditional)) } } @@ -62,6 +62,7 @@ public struct AnyRegexOutput { /// The depth of `Optioals`s wrapping the underlying value. For example, /// `Substring` has optional depth `0`, and `Int??` has optional depth `2`. let optionalDepth: Int + /// The bounds of the output element. let bounds: Range? } @@ -90,7 +91,7 @@ extension AnyRegexOutput { /// - Parameter type: The expected output type. /// - Returns: The output, if the underlying value can be converted to the /// output type; otherwise `nil`. - public func `as`(_ type: Output.Type) -> Output? { + public func `as`(_ type: Output.Type = Output.self) -> Output? 
{ let elements = _elements.map { StructuredCapture( optionalCount: $0.optionalDepth, @@ -206,23 +207,30 @@ extension Regex.Match where Output == AnyRegexOutput { /// - Parameter type: The expected output type. /// - Returns: A match generic over the output type, if the underlying values /// can be converted to the output type; otherwise, `nil`. - public func `as`(_ type: Output.Type) -> Regex.Match? { + public func `as`( + _ type: Output.Type = Output.self + ) -> Regex.Match? { fatalError("FIXME: Not implemented") } } @available(SwiftStdlib 5.7, *) -extension Regex where Output == AnyRegexOutput { +extension Regex { /// Returns whether a named-capture with `name` exists public func contains(captureNamed name: String) -> Bool { - fatalError("FIXME: not implemented") + program.tree.root._captureList.captures.contains(where: { + $0.name == name + }) } +} +@available(SwiftStdlib 5.7, *) +extension Regex where Output == AnyRegexOutput { /// Creates a type-erased regex from an existing regex. /// /// Use this initializer to fit a regex with strongly typed captures into the /// use site of a dynamic regex, i.e. one that was created from a string. - public init(_ match: Regex) { + public init(_ regex: Regex) { fatalError("FIXME: Not implemented") } @@ -231,7 +239,9 @@ extension Regex where Output == AnyRegexOutput { /// - Parameter type: The expected output type. /// - Returns: A regex generic over the output type if the underlying types can be converted. /// Returns `nil` otherwise. - public func `as`(_ type: Output.Type) -> Regex? { + public func `as`( + _ type: Output.Type = Output.self + ) -> Regex? { fatalError("FIXME: Not implemented") } } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 1f9a35dad..29d2267b2 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -44,7 +44,7 @@ public struct Regex: RegexComponent { // Compiler interface. 
Do not change independently. @usableFromInline init(_regexString pattern: String) { - self.init(ast: try! parse(pattern, .traditional)) + self.init(ast: try! parse(pattern, .semantic, .traditional)) } // Compiler interface. Do not change independently. @@ -53,7 +53,7 @@ public struct Regex: RegexComponent { assert(version == currentRegexLiteralFormatVersion) // The version argument is passed by the compiler using the value defined // in libswiftParseRegexLiteral. - self.init(ast: try! parseWithDelimiters(pattern)) + self.init(ast: try! parseWithDelimiters(pattern, .semantic)) } public var regex: Regex { diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index b279c08e4..ff057f2ee 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -472,7 +472,7 @@ extension DSLTree.Node { list.append(.init( name: name, type: child.valueCaptureType?.base, - optionalDepth: nesting)) + optionalDepth: nesting, .fake)) child._addCaptures(to: &list, optionalNesting: nesting) case let .nonCapturingGroup(kind, child): diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift index cfa68c425..80f6819a6 100644 --- a/Sources/_StringProcessing/Unicode/CharacterProps.swift +++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift @@ -12,3 +12,9 @@ // TODO +extension Character { + /// Whether this character is made up of exactly one Unicode scalar value. 
+ var hasExactlyOneScalar: Bool { + unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex + } +} diff --git a/Sources/_StringProcessing/Unicode/ScalarProps.swift b/Sources/_StringProcessing/Unicode/ScalarProps.swift index 52a870357..0894fa572 100644 --- a/Sources/_StringProcessing/Unicode/ScalarProps.swift +++ b/Sources/_StringProcessing/Unicode/ScalarProps.swift @@ -46,3 +46,19 @@ extension Unicode.Script { return result } } + +extension UnicodeScalar { + var isHorizontalWhitespace: Bool { + value == 0x09 || properties.generalCategory == .spaceSeparator + } + + var isNewline: Bool { + switch value { + case 0x000A...0x000D /* LF ... CR */: return true + case 0x0085 /* NEXT LINE (NEL) */: return true + case 0x2028 /* LINE SEPARATOR */: return true + case 0x2029 /* PARAGRAPH SEPARATOR */: return true + default: return false + } + } +} diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 51d4f8bfc..78477e2b5 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -338,10 +338,26 @@ func escaped( atom(.escaped(e)) } func scalar(_ s: Unicode.Scalar) -> AST.Node { - atom(.scalar(s)) + .atom(scalar_a(s)) +} +func scalar_a(_ s: Unicode.Scalar) -> AST.Atom { + atom_a(.scalar(.init(s, .fake))) } func scalar_m(_ s: Unicode.Scalar) -> AST.CustomCharacterClass.Member { - atom_m(.scalar(s)) + .atom(scalar_a(s)) +} + +func scalarSeq(_ s: Unicode.Scalar...) -> AST.Node { + .atom(scalarSeq_a(s)) +} +func scalarSeq_a(_ s: Unicode.Scalar...) -> AST.Atom { + scalarSeq_a(s) +} +func scalarSeq_a(_ s: [Unicode.Scalar]) -> AST.Atom { + atom_a(.scalarSequence(.init(s.map { .init($0, .fake) }, trivia: []))) +} +func scalarSeq_m(_ s: Unicode.Scalar...) -> AST.CustomCharacterClass.Member { + .atom(scalarSeq_a(s)) } func backreference(_ r: AST.Reference.Kind, recursionLevel: Int? 
= nil) -> AST.Node { diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 4d0c12c1f..85dd1ca37 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -178,15 +178,18 @@ public struct _CharacterClassModel: Hashable { matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: fatalError("Not implemented") - case .newlineSequence: - matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) - case .verticalWhitespace: fatalError("Not implemented") + case .horizontalWhitespace: + matched = c.unicodeScalars.first?.isHorizontalWhitespace == true + && (c.isASCII || !options.usesASCIISpaces) + case .newlineSequence, .verticalWhitespace: + matched = c.unicodeScalars.first?.isNewline == true + && (c.isASCII || !options.usesASCIISpaces) case .whitespace: matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): matched = set.any { $0.matches(c, with: options) } + case .custom(let set): + matched = set.any { $0.matches(c, with: options) } } if isInverted { matched.toggle() @@ -194,28 +197,38 @@ public struct _CharacterClassModel: Hashable { return matched ? 
next : nil case .unicodeScalar: let c = str.unicodeScalars[i] + var nextIndex = str.unicodeScalars.index(after: i) var matched: Bool switch cc { case .any: matched = true case .anyScalar: matched = true - case .anyGrapheme: fatalError("Not matched in this mode") + case .anyGrapheme: + matched = true + nextIndex = str.index(after: i) case .digit: matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: fatalError("Not implemented") - case .newlineSequence: fatalError("Not implemented") - case .verticalWhitespace: fatalError("Not implemented") + case .horizontalWhitespace: + matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces) + case .verticalWhitespace: + matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) + case .newlineSequence: + matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) + if c == "\r" && nextIndex != str.endIndex && str.unicodeScalars[nextIndex] == "\n" { + str.unicodeScalars.formIndex(after: &nextIndex) + } case .whitespace: matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) - case .custom: fatalError("Not supported") + case .custom(let set): + matched = set.any { $0.matches(Character(c), with: options) } } if isInverted { matched.toggle() } - return matched ? str.unicodeScalars.index(after: i) : nil + return matched ? nextIndex : nil } } } @@ -451,9 +464,13 @@ extension AST.Atom.EscapedBuiltin { case .notHorizontalWhitespace: return .horizontalWhitespace.inverted - case .notNewline: return .newlineSequence.inverted case .newlineSequence: return .newlineSequence + // FIXME: This is more like '.' than inverted '\R', as it is affected + // by e.g (*CR). We should therefore really be emitting it through + // emitAny(). 
For now we treat it as semantically invalid. + case .notNewline: return .newlineSequence.inverted + case .whitespace: return .whitespace case .notWhitespace: return .whitespace.inverted diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 4e08ea103..c0c6491ac 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -115,7 +115,7 @@ class RegexDSLTests: XCTestCase { { let disallowedChars = CharacterClass.hexDigit .symmetricDifference("a"..."z") - Lookahead(disallowedChars, negative: true) // No: 0-9 + g-z + NegativeLookahead(disallowedChars) // No: 0-9 + g-z OneOrMore(("b"..."g").union("d"..."n")) // b-n @@ -487,7 +487,7 @@ class RegexDSLTests: XCTestCase { { OneOrMore("a") Lookahead(CharacterClass.digit) - Lookahead("2", negative: true) + NegativeLookahead { "2" } CharacterClass.word } } @@ -742,43 +742,6 @@ class RegexDSLTests: XCTestCase { } } - func testDynamicCaptures() throws { - do { - let regex = try Regex("aabcc.") - let line = "aabccd" - let match = try XCTUnwrap(line.wholeMatch(of: regex)) - XCTAssertEqual(match.0, line[...]) - let output = match.output - XCTAssertEqual(output[0].substring, line[...]) - } - do { - let regex = try Regex( - #""" - (?[0-9A-F]+)(?:\.\.(?[0-9A-F]+))?\s+;\s+(?\w+).* - """#) - let line = """ - A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM \ - COMBINING MARK TUKWENTIS - """ - let match = try XCTUnwrap(line.wholeMatch(of: regex)) - XCTAssertEqual(match.0, line[...]) - let output = match.output - XCTAssertEqual(output[0].substring, line[...]) - XCTAssertTrue(output[1].substring == "A6F0") - XCTAssertTrue(output["lower"]?.substring == "A6F0") - XCTAssertTrue(output[2].substring == "A6F1") - XCTAssertTrue(output["upper"]?.substring == "A6F1") - XCTAssertTrue(output[3].substring == "Extend") - XCTAssertTrue(output["desc"]?.substring == "Extend") - let typedOutput = try XCTUnwrap(output.as( - (Substring, 
lower: Substring, upper: Substring?, Substring).self)) - XCTAssertEqual(typedOutput.0, line[...]) - XCTAssertTrue(typedOutput.lower == "A6F0") - XCTAssertTrue(typedOutput.upper == "A6F1") - XCTAssertTrue(typedOutput.3 == "Extend") - } - } - func testBackreference() throws { try _testDSLCaptures( ("abc#41#42abcabcabc", ("abc#41#42abcabcabc", "abc", 42, "abc", nil)), @@ -889,6 +852,54 @@ class RegexDSLTests: XCTestCase { } } } + + // Post-hoc captured reference w/ attempted match before capture + // #"(?:\w\1|(\w):)+"# + // + // This tests that the reference `a` simply fails to match instead of + // erroring when encountered before a match is captured into `a`. The + // matching process here goes like this: + // - the first time through, the first alternation is taken + // - `.word` matches on "a" + // - the `a` backreference fails on ":", because `a` hasn't matched yet + // - backtrack to the beginning of the input + // - now the second alternation is taken + // - `.word` matches on "a" and is captured as `a` + // - the literal ":" matches + // - proceeding from the position of the first "b" in the first alternation + // - `.word` matches on "b" + // - the `a` backreference now contains "a", and matches on "a" + // - proceeding from the position of the first "c" in the first alternation + // - `.word` matches on "c" + // - the `a` backreference still contains "a", and matches on "a" + // - proceeding from the position of the first "o" in the first alternation + // - `.word` matches on "o" + // - the `a` backreference still contains "a", so it fails on ":" + // - now the second alternation is taken + // - `.word` matches on "o" and is captured as `a` + // - the literal ":" matches + // - continuing as above from the second "b"... + try _testDSLCaptures( + ("a:bacao:boco", ("a:bacao:boco", "o")), + matchType: (Substring, Substring?).self, + == + ) { + // NOTE: "expression too complex to type check" when inferring the generic + // parameter. 
+ OneOrMore { + let a = Reference(Substring.self) + ChoiceOf<(Substring, Substring?)> { + Regex { + .word + a + } + Regex { + Capture(.word, as: a) + ":" + } + } + } + } } func testSemanticVersionExample() { diff --git a/Tests/RegexTests/AnyRegexOutputTests.swift b/Tests/RegexTests/AnyRegexOutputTests.swift new file mode 100644 index 000000000..8d91c0ec8 --- /dev/null +++ b/Tests/RegexTests/AnyRegexOutputTests.swift @@ -0,0 +1,157 @@ + +import _StringProcessing +import XCTest + +// Test that our existential capture and concrete captures are +// the same +private func checkSame( + _ aro: AnyRegexOutput, + _ concrete: (Substring, fieldA: Substring, fieldB: Substring) +) { + XCTAssertEqual(aro[0].substring, concrete.0) + + XCTAssertEqual(aro["fieldA"]!.substring, concrete.1) + XCTAssertEqual(aro["fieldA"]!.substring, concrete.fieldA) + + XCTAssertEqual(aro[1].substring, concrete.1) + + XCTAssertEqual(aro["fieldB"]!.substring, concrete.2) + XCTAssertEqual(aro["fieldB"]!.substring, concrete.fieldB) + + XCTAssertEqual(aro[2].substring, concrete.2) + +} +private func checkSame( + _ aro: Regex.Match, + _ concrete: Regex<(Substring, fieldA: Substring, fieldB: Substring)>.Match +) { + checkSame(aro.output, concrete.output) + + XCTAssertEqual(aro.0, concrete.0) + XCTAssertEqual(aro[0].substring, concrete.0) + + XCTAssertEqual(aro["fieldA"]!.substring, concrete.1) + XCTAssertEqual(aro["fieldA"]!.substring, concrete.fieldA) + XCTAssertEqual(aro[1].substring, concrete.1) + + XCTAssertEqual(aro["fieldB"]!.substring, concrete.2) + XCTAssertEqual(aro["fieldB"]!.substring, concrete.fieldB) + XCTAssertEqual(aro[2].substring, concrete.2) +} +private func checkSame( + _ aro: Regex, + _ concrete: Regex<(Substring, fieldA: Substring, fieldB: Substring)> +) { + XCTAssertEqual( + aro.contains(captureNamed: "fieldA"), + concrete.contains(captureNamed: "fieldA")) + XCTAssertEqual( + aro.contains(captureNamed: "fieldB"), + concrete.contains(captureNamed: "fieldB")) + XCTAssertEqual( + 
aro.contains(captureNamed: "notAField"), + concrete.contains(captureNamed: "notAField")) +} + +extension RegexTests { + func testAnyRegexOutput() { + let regex = try! Regex(#""" + (?x) + (? [^,]*) + , + (? [^,]*) + """#) + + let match = "abc,def".wholeMatch(of: regex)! + XCTAssertEqual(match.0, "abc,def") + XCTAssertEqual(match[0].substring, "abc,def") + + XCTAssertEqual(match["fieldA"]!.substring, "abc") + XCTAssertEqual(match.output["fieldA"]!.substring, "abc") + XCTAssertEqual(match[1].substring, "abc") + + XCTAssertEqual(match["fieldB"]!.substring, "def") + XCTAssertEqual(match.output["fieldB"]!.substring, "def") + XCTAssertEqual(match[2].substring, "def") + + XCTAssertNil(match["notACapture"]) + XCTAssertNil(match.output["notACapture"]) + XCTAssertEqual(match.count, 3) + + XCTAssert(regex.contains(captureNamed: "fieldA")) + XCTAssert(regex.contains(captureNamed: "fieldB")) + XCTAssertFalse(regex.contains(captureNamed: "notAField")) + + // MARK: Check equivalence with concrete + + let regexConcrete: + Regex<(Substring, fieldA: Substring, fieldB: Substring)> + = try! Regex(#""" + (?x) + (? [^,]*) + , + (? [^,]*) + """#) + checkSame(regex, regexConcrete) + + let matchConcrete = "abc,def".wholeMatch(of: regexConcrete)! + checkSame(match, matchConcrete) + + let output = match.output + let concreteOutput = matchConcrete.output + checkSame(output, concreteOutput) + + // TODO: ARO init from concrete match tuple + + let concreteOutputCasted = output.as( + (Substring, fieldA: Substring, fieldB: Substring).self + )! + checkSame(output, concreteOutputCasted) + + var concreteOutputCopy = concreteOutput + concreteOutputCopy = output.as()! 
+ checkSame(output, concreteOutputCopy) + + // TODO: Regex.Match: init from tuple match and as to tuple match + + // TODO: Regex: init from tuple regex and as cast to tuple regex + + } + + func testDynamicCaptures() throws { + do { + let regex = try Regex("aabcc.") + let line = "aabccd" + let match = try XCTUnwrap(line.wholeMatch(of: regex)) + XCTAssertEqual(match.0, line[...]) + let output = match.output + XCTAssertEqual(output[0].substring, line[...]) + } + do { + let regex = try Regex( + #""" + (?[0-9A-F]+)(?:\.\.(?[0-9A-F]+))?\s+;\s+(?\w+).* + """#) + let line = """ + A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM \ + COMBINING MARK TUKWENTIS + """ + let match = try XCTUnwrap(line.wholeMatch(of: regex)) + XCTAssertEqual(match.0, line[...]) + let output = match.output + XCTAssertEqual(output[0].substring, line[...]) + XCTAssertTrue(output[1].substring == "A6F0") + XCTAssertTrue(output["lower"]?.substring == "A6F0") + XCTAssertTrue(output[2].substring == "A6F1") + XCTAssertTrue(output["upper"]?.substring == "A6F1") + XCTAssertTrue(output[3].substring == "Extend") + XCTAssertTrue(output["desc"]?.substring == "Extend") + let typedOutput = try XCTUnwrap(output.as( + (Substring, lower: Substring, upper: Substring?, Substring).self)) + XCTAssertEqual(typedOutput.0, line[...]) + XCTAssertTrue(typedOutput.lower == "A6F0") + XCTAssertTrue(typedOutput.upper == "A6F1") + XCTAssertTrue(typedOutput.3 == "Extend") + } + } +} diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index b48e1f0a5..9efbf2f76 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -16,36 +16,44 @@ import XCTest extension CaptureList.Capture { static var cap: Self { - return Self(optionalDepth: 0) + return Self(optionalDepth: 0, .fake) } static var opt: Self { - return Self(optionalDepth: 1) + return Self(optionalDepth: 1, .fake) } static var opt_opt: Self { - return Self(optionalDepth: 2) + return 
Self(optionalDepth: 2, .fake) } static var opt_opt_opt: Self { - return Self(optionalDepth: 3) + return Self(optionalDepth: 3, .fake) } static var opt_opt_opt_opt: Self { - return Self(optionalDepth: 4) + return Self(optionalDepth: 4, .fake) } static var opt_opt_opt_opt_opt: Self { - return Self(optionalDepth: 5) + return Self(optionalDepth: 5, .fake) } static var opt_opt_opt_opt_opt_opt: Self { - return Self(optionalDepth: 6) + return Self(optionalDepth: 6, .fake) } - static func named(_ name: String) -> Self { - return Self(name: name, optionalDepth: 0) + static func named(_ name: String, opt: Int = 0) -> Self { + return Self(name: name, optionalDepth: opt, .fake) } } extension CaptureList { static func caps(count: Int) -> Self { Self(Array(repeating: .cap, count: count)) } + + var withoutLocs: Self { + var copy = self + for idx in copy.captures.indices { + copy.captures[idx].location = .fake + } + return copy + } } extension StructuredCapture { @@ -150,8 +158,8 @@ func captureTest( file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(regex, .traditional) - let capList = ast.root._captureList + let ast = try! parse(regex, .semantic, .traditional) + let capList = ast.root._captureList.withoutLocs guard capList == expected else { XCTFail(""" Expected: diff --git a/Tests/RegexTests/DiagnosticTests.swift b/Tests/RegexTests/DiagnosticTests.swift index 428020b80..0100a3a86 100644 --- a/Tests/RegexTests/DiagnosticTests.swift +++ b/Tests/RegexTests/DiagnosticTests.swift @@ -20,7 +20,7 @@ extension RegexTests { XCTAssert(SourceLocation.fake.isFake) XCTAssert(group(.capture, "a").location.isFake) - let ast = try! parse("(a)", .traditional).root + let ast = try! parse("(a)", .semantic, .traditional).root XCTAssert(ast.location.isReal) } @@ -31,7 +31,7 @@ extension RegexTests { // // Input should be a concatenation or alternation func flatTest(_ str: String, _ expected: [String]) { - guard let ast = try? 
parse(str, .traditional).root else { + guard let ast = try? parse(str, .semantic, .traditional).root else { XCTFail("Fail to parse: \(str)") return } @@ -54,7 +54,7 @@ extension RegexTests { func renderTest(_ str: String, _ expected: [String]) { let lines = try! parse( - str, .traditional + str, .semantic, .traditional )._render(in: str) func fail() { XCTFail(""" diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 345e80e22..36056e85a 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -169,6 +169,8 @@ func firstMatchTest( XCTAssertEqual(found, match, file: file, line: line) } } catch { + // FIXME: This allows non-matches to succeed even when xfail'd + // When xfail == true, this should report failure for match == nil if !xfail && match != nil { XCTFail("\(error)", file: file, line: line) } @@ -182,7 +184,9 @@ func firstMatchTests( syntax: SyntaxOptions = .traditional, enableTracing: Bool = false, dumpAST: Bool = false, - xfail: Bool = false + xfail: Bool = false, + file: StaticString = #filePath, + line: UInt = #line ) { for (input, match) in tests { firstMatchTest( @@ -192,7 +196,9 @@ func firstMatchTests( syntax: syntax, enableTracing: enableTracing, dumpAST: dumpAST, - xfail: xfail) + xfail: xfail, + file: file, + line: line) } } @@ -279,7 +285,20 @@ extension RegexTests { firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}") // code point sequence - firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true) + firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"3\u{ 61 62 63 }"#, input: "123abcxyz", match: "3abc") + firstMatchTest(#"\u{61 62}\u{63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"\u{61}\u{62 63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"9|\u{61 62 63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"(?:\u{61 62 63})"#, input: "123abcxyz", match: "abc") + 
firstMatchTest(#"23\u{61 62 63}xy"#, input: "123abcxyz", match: "23abcxy") + + // o + horn + dot_below + firstMatchTest( + #"\u{006f 031b 0323}"#, + input: "\u{006f}\u{031b}\u{0323}", + match: "\u{006f}\u{031b}\u{0323}" + ) // Escape sequences that represent scalar values. firstMatchTest(#"\a[\b]\e\f\n\r\t"#, @@ -400,7 +419,8 @@ extension RegexTests { "a++a", ("babc", nil), ("baaabc", nil), - ("bb", nil)) + ("bb", nil), + xfail: true) firstMatchTests( "a+?a", ("babc", nil), @@ -462,15 +482,11 @@ extension RegexTests { "a{2,4}+a", ("babc", nil), ("baabc", nil), - ("baaabc", nil), ("baaaaabc", "aaaaa"), ("baaaaaaaabc", "aaaaa"), ("bb", nil)) firstMatchTests( "a{,4}+a", - ("babc", nil), - ("baabc", nil), - ("baaabc", nil), ("baaaaabc", "aaaaa"), ("baaaaaaaabc", "aaaaa"), ("bb", nil)) @@ -478,11 +494,44 @@ extension RegexTests { "a{2,}+a", ("babc", nil), ("baabc", nil), + ("bb", nil)) + + // XFAIL'd versions of the above + firstMatchTests( + "a{2,4}+a", + ("baaabc", nil), + xfail: true) + firstMatchTests( + "a{,4}+a", + ("babc", nil), + ("baabc", nil), + ("baaabc", nil), + xfail: true) + firstMatchTests( + "a{2,}+a", ("baaabc", nil), ("baaaaabc", nil), ("baaaaaaaabc", nil), - ("bb", nil)) + xfail: true) + // XFAIL'd possessive tests + firstMatchTests( + "a?+a", + ("a", nil), + xfail: true) + firstMatchTests( + "(a|a)?+a", + ("a", nil), + xfail: true) + firstMatchTests( + "(a|a){2,4}+a", + ("a", nil), + ("aa", nil)) + firstMatchTests( + "(a|a){2,4}+a", + ("aaa", nil), + ("aaaa", nil), + xfail: true) firstMatchTests( "(?:a{2,4}?b)+", @@ -681,7 +730,7 @@ extension RegexTests { firstMatchTest( #"\N{ASTERISK}+"#, input: "123***xyz", match: "***") firstMatchTest( - #"\N {2}"#, input: "123 xyz", match: "3 ") + #"\N {2}"#, input: "123 xyz", match: "3 ", xfail: true) firstMatchTest(#"\N{U+2C}"#, input: "123,xyz", match: ",") firstMatchTest(#"\N{U+1F4BF}"#, input: "123💿xyz", match: "💿") @@ -693,6 +742,14 @@ extension RegexTests { firstMatchTest(#"\p{gc=L}"#, input: "123abcXYZ", 
match: "a") firstMatchTest(#"\p{Lu}"#, input: "123abcXYZ", match: "X") + // U+0374 GREEK NUMERAL SIGN (Lm) + // U+00AA FEMININE ORDINAL INDICATOR (Lo) + firstMatchTest(#"\p{L}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "\u{0374}") + firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "a") + firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123XYZ", match: "X") + firstMatchTest(#"\p{L&}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "a") + firstMatchTest(#"\p{L&}"#, input: "\u{0374}\u{00AA}123XYZ", match: "X") + firstMatchTest( #"\P{Cc}"#, input: "\n\n\nXYZ", match: "X") firstMatchTest( @@ -938,15 +995,19 @@ extension RegexTests { // TODO: Oniguruma \y and \Y firstMatchTests( - #"\u{65}"#, // Scalar 'e' is present in both: - ("Cafe\u{301}", "e"), // composed and - ("Sol Cafe", "e")) // standalone + #"\u{65}"#, // Scalar 'e' is present in both + ("Cafe\u{301}", nil), // but scalar mode requires boundary at end of match + xfail: true) + firstMatchTests( + #"\u{65}"#, // Scalar 'e' is present in both + ("Sol Cafe", "e")) // standalone is okay + firstMatchTests( #"\u{65}\y"#, // Grapheme boundary assertion ("Cafe\u{301}", nil), ("Sol Cafe", "e")) firstMatchTests( - #"\u{65}\Y"#, // Grapheme non-boundary assertion + #"(?u)\u{65}\Y"#, // Grapheme non-boundary assertion ("Cafe\u{301}", "e"), ("Sol Cafe", nil)) } @@ -966,7 +1027,7 @@ extension RegexTests { firstMatchTest( #"a(?:b)c"#, input: "123abcxyz", match: "abc") firstMatchTest( - "(?|(a)|(b)|(c))", input: "123abcxyz", match: "a") + "(?|(a)|(b)|(c))", input: "123abcxyz", match: "a", xfail: true) firstMatchTest( #"(?:a|.b)c"#, input: "123abcacxyz", match: "abc") @@ -1082,6 +1143,8 @@ extension RegexTests { firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true) firstMatchTest(#"(?.)(.)\k"#, input: "abac", match: "aba", xfail: true) firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true) + + firstMatchTest(#"\1(.)"#, input: "112", match: nil) } func 
testMatchExamples() { @@ -1353,11 +1416,14 @@ extension RegexTests { // as a character. firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed) - // FIXME: Decomposed character in regex literal doesn't match an equivalent character - firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed, - xfail: true) + firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed) - firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e") + firstMatchTest(#"\u{65 301}$"#, input: eDecomposed, match: eDecomposed) + firstMatchTest(#"\u{65 301}$"#, input: eComposed, match: eComposed) + + // FIXME: Implicit \y at end of match + firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil, + xfail: true) firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil) // FIXME: \y is unsupported firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil, @@ -1381,12 +1447,10 @@ extension RegexTests { (eComposed, true), (eDecomposed, true)) - // FIXME: Decomposed character in regex literal doesn't match an equivalent character matchTest( #"e\u{301}$"#, (eComposed, true), - (eDecomposed, true), - xfail: true) + (eDecomposed, true)) matchTest( #"e$"#, @@ -1407,9 +1471,7 @@ extension RegexTests { (eDecomposed, true)) // \p{Letter} firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed) - // FIXME: \p{Letter} doesn't match a decomposed character - firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed, - xfail: true) + firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed) // \d firstMatchTest(#"\d"#, input: "5", match: "5") @@ -1470,9 +1532,11 @@ extension RegexTests { firstMatchTest(#"🇰🇷"#, input: flag, match: flag) firstMatchTest(#"[🇰🇷]"#, input: flag, match: flag) firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag) - + firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag) + // First Unicode scalar followed by CCC of regional indicators - 
firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag) + firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, + xfail: true) // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character // A CCC of regional indicators x 2 @@ -1513,8 +1577,7 @@ extension RegexTests { // FIXME: \O is unsupported firstMatchTest(#"(?u)\O\u{301}"#, input: eDecomposed, match: eDecomposed) - firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed, - xfail: true) + firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed) firstMatchTest(#"\O"#, input: eComposed, match: eComposed) firstMatchTest(#"\O"#, input: eDecomposed, match: nil, xfail: true) diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index aeefe6477..ed930b0fe 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -33,30 +33,56 @@ extension AST.CustomCharacterClass.Member: ExpressibleByExtendedGraphemeClusterL } } +enum SemanticErrorKind { + case unsupported, invalid +} class RegexTests: XCTestCase {} func parseTest( _ input: String, _ expectedAST: AST.Node, + throwsError errorKind: SemanticErrorKind? = nil, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { parseTest( - input, .init(expectedAST, globalOptions: nil), syntax: syntax, - captures: expectedCaptures, file: file, line: line + input, .init(expectedAST, globalOptions: nil), throwsError: errorKind, + syntax: syntax, captures: expectedCaptures, file: file, line: line ) } func parseTest( _ input: String, _ expectedAST: AST, + throwsError errorKind: SemanticErrorKind? = nil, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(input, syntax) + let ast: AST + do { + ast = try parse(input, errorKind != nil ? 
.syntactic : .semantic, syntax) + } catch { + XCTFail("unexpected error: \(error)", file: file, line: line) + return + } + if let errorKind = errorKind { + do { + _ = try parse(input, .semantic, syntax) + XCTFail("expected semantically invalid AST", file: file, line: line) + } catch let e as Source.LocatedError { + switch e.error { + case .unsupported: + XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) + default: + XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) + } + } catch { + XCTFail("Error without source location: \(error)", file: file, line: line) + } + } guard ast == expectedAST || ast._dump() == expectedAST._dump() // EQ workaround else { @@ -68,7 +94,7 @@ func parseTest( file: file, line: line) return } - let captures = ast.captureList + let captures = ast.captureList.withoutLocs guard captures == expectedCaptures else { XCTFail(""" @@ -143,15 +169,37 @@ func delimiterLexingTest( /// true, there may be additional characters that follow the literal that are /// not considered part of it. func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false, - file: StaticString = #file, line: UInt = #line + _ input: String, _ expecting: AST.Node, + throwsError errorKind: SemanticErrorKind? = nil, + ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line ) { // First try lexing. let literal = delimiterLexingTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) - let orig = try! parseWithDelimiters(literal) - let ast = orig.root + let ast: AST.Node + do { + ast = try parseWithDelimiters( + literal, errorKind != nil ? 
.syntactic : .semantic).root + } catch { + XCTFail("unexpected error: \(error)", file: file, line: line) + return + } + if let errorKind = errorKind { + do { + _ = try parseWithDelimiters(input, .semantic) + XCTFail("expected semantically invalid AST", file: file, line: line) + } catch let e as Source.LocatedError { + switch e.error { + case .unsupported: + XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) + default: + XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) + } + } catch { + XCTFail("Error without source location: \(error)", file: file, line: line) + } + } guard ast == expecting || ast._dump() == expecting._dump() // EQ workaround else { @@ -170,8 +218,8 @@ func parseNotEqualTest( syntax: SyntaxOptions = .traditional, file: StaticString = #file, line: UInt = #line ) { - let lhsAST = try! parse(lhs, syntax) - let rhsAST = try! parse(rhs, syntax) + let lhsAST = try! parse(lhs, .syntactic, syntax) + let rhsAST = try! parse(rhs, .syntactic, syntax) if lhsAST == rhsAST || lhsAST._dump() == rhsAST._dump() { XCTFail(""" AST: \(lhsAST._dump()) @@ -187,7 +235,7 @@ func rangeTest( at locFn: (AST.Node) -> SourceLocation = \.location, file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(input, syntax).root + let ast = try! 
parse(input, .syntactic, syntax).root let range = input.offsets(of: locFn(ast).range) let expected = expectedRange(input) @@ -207,7 +255,7 @@ func diagnosticTest( file: StaticString = #file, line: UInt = #line ) { do { - let ast = try parse(input, syntax) + let ast = try parse(input, .semantic, syntax) XCTFail(""" Passed \(ast) @@ -236,7 +284,7 @@ func diagnosticWithDelimitersTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) do { - let orig = try parseWithDelimiters(literal) + let orig = try parseWithDelimiters(literal, .semantic) let ast = orig.root XCTFail(""" @@ -433,10 +481,32 @@ extension RegexTests { parseTest(#"\x5X"#, concat(scalar("\u{5}"), "X")) parseTest(#"\x12ab"#, concat(scalar("\u{12}"), "a", "b")) + parseTest(#"\u{ a }"#, scalar("\u{A}")) + parseTest(#"\u{ a }\u{ B }"#, concat(scalar("\u{A}"), scalar("\u{B}"))) + + // MARK: Scalar sequences + + parseTest(#"\u{A bC}"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{ A bC }"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{A bC }"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{ A bC}"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{ A b C }"#, scalarSeq("\u{A}", "\u{B}", "\u{C}")) + + parseTest( + #"\u{3b1 3b3 3b5 3b9}"#, + scalarSeq("\u{3b1}", "\u{3b3}", "\u{3b5}", "\u{3b9}") + ) + // MARK: Character classes parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) + // FIXME: '\N' should be emitted through 'emitAny', not through the + // _CharacterClassModel model. 
+ parseTest(#"\N"#, escaped(.notNewline), throwsError: .unsupported) + + parseTest(#"\R"#, escaped(.newlineSequence)) + parseTest( "[-|$^:?+*())(*-+-]", charClass( @@ -449,6 +519,8 @@ extension RegexTests { parseTest("[-a-]", charClass("-", "a", "-")) parseTest("[a-z]", charClass(range_m("a", "z"))) + parseTest("[a-a]", charClass(range_m("a", "a"))) + parseTest("[B-a]", charClass(range_m("B", "a"))) // FIXME: AST builder helpers for custom char class types parseTest("[a-d--a-c]", charClass( @@ -595,10 +667,34 @@ extension RegexTests { range_m(.keyboardControl("A"), .keyboardControl("B")), range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")), range_m(.keyboardMeta("A"), .keyboardMeta("B")) - )) + ), throwsError: .unsupported) + + parseTest( + #"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( + range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE"))), + throwsError: .unsupported) + + parseTest( + #"[\u{AA}-\u{BB}]"#, + charClass(range_m(scalar_a("\u{AA}"), scalar_a("\u{BB}"))) + ) - parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( - range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE")))) + // Not currently supported, we need to figure out what their semantics are. + parseTest( + #"[\u{AA BB}-\u{CC}]"#, + charClass(range_m(scalarSeq_a("\u{AA}", "\u{BB}"), scalar_a("\u{CC}"))), + throwsError: .unsupported + ) + parseTest( + #"[\u{CC}-\u{AA BB}]"#, + charClass(range_m(scalar_a("\u{CC}"), scalarSeq_a("\u{AA}", "\u{BB}"))), + throwsError: .unsupported + ) + parseTest( + #"[\u{a b c}]"#, + charClass(scalarSeq_m("\u{A}", "\u{B}", "\u{C}")), + throwsError: .unsupported + ) // MARK: Operators @@ -691,13 +787,13 @@ extension RegexTests { parseTest(#"\\#u{3000}"#, "\u{3000}") // Control and meta controls. 
- parseTest(#"\c "#, atom(.keyboardControl(" "))) - parseTest(#"\c!"#, atom(.keyboardControl("!"))) - parseTest(#"\c~"#, atom(.keyboardControl("~"))) - parseTest(#"\C--"#, atom(.keyboardControl("-"))) - parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a"))) - parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-"))) - parseTest(#"\M-a"#, atom(.keyboardMeta("a"))) + parseTest(#"\c "#, atom(.keyboardControl(" ")), throwsError: .unsupported) + parseTest(#"\c!"#, atom(.keyboardControl("!")), throwsError: .unsupported) + parseTest(#"\c~"#, atom(.keyboardControl("~")), throwsError: .unsupported) + parseTest(#"\C--"#, atom(.keyboardControl("-")), throwsError: .unsupported) + parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")), throwsError: .unsupported) + parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")), throwsError: .unsupported) + parseTest(#"\M-a"#, atom(.keyboardMeta("a")), throwsError: .unsupported) // MARK: Comments @@ -734,6 +830,9 @@ extension RegexTests { parseTest( #"a{0,0}"#, quantRange(0...0, of: "a")) + parseTest( + #"a{1,1}"#, + quantRange(1...1, of: "a")) // Make sure ranges get treated as literal if invalid. parseTest("{", "{") @@ -786,11 +885,42 @@ extension RegexTests { // Balanced captures parseTest(#"(?)"#, balancedCapture(name: "a", priorName: "c", empty()), - captures: [.named("a")]) + throwsError: .unsupported, captures: [.named("a")]) parseTest(#"(?<-c>)"#, balancedCapture(name: nil, priorName: "c", empty()), - captures: [.cap]) + throwsError: .unsupported, captures: [.cap]) parseTest(#"(?'a-b'c)"#, balancedCapture(name: "a", priorName: "b", "c"), - captures: [.named("a")]) + throwsError: .unsupported, captures: [.named("a")]) + + // Capture resets. + // FIXME: The captures in each branch should be unified. For now, we don't + // treat any capture reset as semantically valid. 
+ parseTest( + "(?|(a)|(b))", + nonCaptureReset(alt(capture("a"), capture("b"))), + throwsError: .unsupported, captures: [.opt, .opt] + ) + parseTest( + "(?|(?a)|(b))", + nonCaptureReset(alt(namedCapture("x", "a"), capture("b"))), + throwsError: .unsupported, captures: [.named("x", opt: 1), .opt] + ) + parseTest( + "(?|(a)|(?b))", + nonCaptureReset(alt(capture("a"), namedCapture("x", "b"))), + throwsError: .unsupported, captures: [.opt, .named("x", opt: 1)] + ) + parseTest( + "(?|(?a)|(?b))", + nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("x", "b"))), + throwsError: .invalid, captures: [.named("x", opt: 1), .named("x", opt: 1)] + ) + + // TODO: Reject mismatched names? + parseTest( + "(?|(?a)|(?b))", + nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("y", "b"))), + throwsError: .unsupported, captures: [.named("x", opt: 1), .named("y", opt: 1)] + ) // Other groups parseTest( @@ -798,13 +928,13 @@ extension RegexTests { concat("a", nonCapture("b"), "c")) parseTest( #"a(?|b)c"#, - concat("a", nonCaptureReset("b"), "c")) + concat("a", nonCaptureReset("b"), "c"), throwsError: .unsupported) parseTest( #"a(?>b)c"#, - concat("a", atomicNonCapturing("b"), "c")) + concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) parseTest( "a(*atomic:b)c", - concat("a", atomicNonCapturing("b"), "c")) + concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) parseTest("a(?=b)c", concat("a", lookahead("b"), "c")) parseTest("a(*pla:b)c", concat("a", lookahead("b"), "c")) @@ -815,31 +945,42 @@ extension RegexTests { parseTest("a(*negative_lookahead:b)c", concat("a", negativeLookahead("b"), "c")) - parseTest("a(?<=b)c", concat("a", lookbehind("b"), "c")) - parseTest("a(*plb:b)c", concat("a", lookbehind("b"), "c")) - parseTest("a(*positive_lookbehind:b)c", concat("a", lookbehind("b"), "c")) - - parseTest("a(?)()", concat( + changeMatchingOptions(matchingOptions(adding: .namedCapturesOnly)), + changeMatchingOptions(unsetMatchingOptions(), 
capture(empty())), + namedCapture("x", empty()), + nonCapture(empty()) + ), captures: [.cap, .named("x")]) + // MARK: References // \1 ... \9 are always backreferences. for i in 1 ... 9 { - parseTest("\\\(i)", backreference(.absolute(i))) + parseTest("\\\(i)", backreference(.absolute(i)), throwsError: .invalid) parseTest( "()()()()()()()()()\\\(i)", concat(Array(repeating: capture(empty()), count: 9) @@ -986,10 +1134,10 @@ extension RegexTests { ) } - parseTest(#"\10"#, backreference(.absolute(10))) - parseTest(#"\18"#, backreference(.absolute(18))) - parseTest(#"\7777"#, backreference(.absolute(7777))) - parseTest(#"\91"#, backreference(.absolute(91))) + parseTest(#"\10"#, backreference(.absolute(10)), throwsError: .invalid) + parseTest(#"\18"#, backreference(.absolute(18)), throwsError: .invalid) + parseTest(#"\7777"#, backreference(.absolute(7777)), throwsError: .invalid) + parseTest(#"\91"#, backreference(.absolute(91)), throwsError: .invalid) parseTest( #"()()()()()()()()()()\10"#, @@ -1005,7 +1153,7 @@ extension RegexTests { ) parseTest(#"()()\10"#, concat( capture(empty()), capture(empty()), backreference(.absolute(10))), - captures: [.cap, .cap] + throwsError: .invalid, captures: [.cap, .cap] ) // A capture of three empty captures. @@ -1016,7 +1164,7 @@ extension RegexTests { // There are 9 capture groups in total here. #"((()()())(()()()))\10"#, concat(capture(concat( fourCaptures, fourCaptures)), backreference(.absolute(10))), - captures: .caps(count: 9) + throwsError: .invalid, captures: .caps(count: 9) ) parseTest( // There are 10 capture groups in total here. 
@@ -1040,7 +1188,7 @@ extension RegexTests { concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]), captures: .caps(count: 40) ) - parseTest(#"\40"#, backreference(.absolute(40))) + parseTest(#"\40"#, backreference(.absolute(40)), throwsError: .invalid) parseTest( String(repeating: "()", count: 40) + #"\40"#, concat(Array(repeating: capture(empty()), count: 40) @@ -1048,14 +1196,14 @@ extension RegexTests { captures: .caps(count: 40) ) - parseTest(#"\7"#, backreference(.absolute(7))) + parseTest(#"\7"#, backreference(.absolute(7)), throwsError: .invalid) - parseTest(#"\11"#, backreference(.absolute(11))) + parseTest(#"\11"#, backreference(.absolute(11)), throwsError: .invalid) parseTest( - String(repeating: "()", count: 11) + #"\11"#, - concat(Array(repeating: capture(empty()), count: 11) + String(repeating: "()", count: 12) + #"\11"#, + concat(Array(repeating: capture(empty()), count: 12) + [backreference(.absolute(11))]), - captures: .caps(count: 11) + captures: .caps(count: 12) ) parseTest(#"\011"#, scalar("\u{9}")) parseTest( @@ -1065,64 +1213,78 @@ extension RegexTests { ) parseTest(#"\0113"#, scalar("\u{4B}")) - parseTest(#"\113"#, backreference(.absolute(113))) - parseTest(#"\377"#, backreference(.absolute(377))) - parseTest(#"\81"#, backreference(.absolute(81))) - - parseTest(#"\g1"#, backreference(.absolute(1))) - parseTest(#"\g001"#, backreference(.absolute(1))) - parseTest(#"\g52"#, backreference(.absolute(52))) - parseTest(#"\g-01"#, backreference(.relative(-1))) - parseTest(#"\g+30"#, backreference(.relative(30))) - - parseTest(#"\g{1}"#, backreference(.absolute(1))) - parseTest(#"\g{001}"#, backreference(.absolute(1))) - parseTest(#"\g{52}"#, backreference(.absolute(52))) - parseTest(#"\g{-01}"#, backreference(.relative(-1))) - parseTest(#"\g{+30}"#, backreference(.relative(30))) - parseTest(#"\k<+4>"#, backreference(.relative(4))) - parseTest(#"\k<2>"#, backreference(.absolute(2))) - parseTest(#"\k'-3'"#, backreference(.relative(-3))) 
- parseTest(#"\k'1'"#, backreference(.absolute(1))) - - parseTest(#"\k{a0}"#, backreference(.named("a0"))) - parseTest(#"\k"#, backreference(.named("bc"))) - parseTest(#"\g{abc}"#, backreference(.named("abc"))) - parseTest(#"(?P=abc)"#, backreference(.named("abc"))) + parseTest(#"\113"#, backreference(.absolute(113)), throwsError: .invalid) + parseTest(#"\377"#, backreference(.absolute(377)), throwsError: .invalid) + parseTest(#"\81"#, backreference(.absolute(81)), throwsError: .invalid) + + parseTest(#"\g1"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g001"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g52"#, backreference(.absolute(52)), throwsError: .invalid) + parseTest(#"\g-01"#, backreference(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g+30"#, backreference(.relative(30)), throwsError: .unsupported) + + parseTest(#"\g{1}"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g{001}"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g{52}"#, backreference(.absolute(52)), throwsError: .invalid) + parseTest(#"\g{-01}"#, backreference(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g{+30}"#, backreference(.relative(30)), throwsError: .unsupported) + parseTest(#"\k<+4>"#, backreference(.relative(4)), throwsError: .unsupported) + parseTest(#"\k<2>"#, backreference(.absolute(2)), throwsError: .invalid) + parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported) + parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid) + + parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .unsupported) + parseTest(#"\k"#, backreference(.named("bc")), throwsError: .unsupported) + parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .unsupported) + parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .unsupported) // Oniguruma recursion levels. 
- parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0)) - parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0)) - parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1)) - parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8)) - parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8)) - parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8)) - parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8)) - parseTest(#"\k'+3+8'"#, backreference(.relative(3), recursionLevel: 8)) - - parseTest(#"(?R)"#, subpattern(.recurseWholePattern)) - parseTest(#"(?0)"#, subpattern(.recurseWholePattern)) - parseTest(#"(?1)"#, subpattern(.absolute(1))) - parseTest(#"(?+12)"#, subpattern(.relative(12))) - parseTest(#"(?-2)"#, subpattern(.relative(-2))) - parseTest(#"(?&hello)"#, subpattern(.named("hello"))) - parseTest(#"(?P>P)"#, subpattern(.named("P"))) + parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported) + parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported) + parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .invalid) + parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .invalid) + parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'+3+8'"#, backreference(.relative(3), recursionLevel: 8), throwsError: .unsupported) + + parseTest(#"(?R)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) + parseTest(#"(?0)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) + parseTest(#"(?1)"#, subpattern(.absolute(1)), throwsError: .unsupported) + 
parseTest(#"(?+12)"#, subpattern(.relative(12)), throwsError: .unsupported) + parseTest(#"(?-2)"#, subpattern(.relative(-2)), throwsError: .unsupported) + parseTest(#"(?&hello)"#, subpattern(.named("hello")), throwsError: .unsupported) + parseTest(#"(?P>P)"#, subpattern(.named("P")), throwsError: .unsupported) parseTest(#"[(?R)]"#, charClass("(", "?", "R", ")")) parseTest(#"[(?&a)]"#, charClass("(", "?", "&", "a", ")")) parseTest(#"[(?1)]"#, charClass("(", "?", "1", ")")) - parseTest(#"\g<1>"#, subpattern(.absolute(1))) - parseTest(#"\g<001>"#, subpattern(.absolute(1))) - parseTest(#"\g'52'"#, subpattern(.absolute(52))) - parseTest(#"\g'-01'"#, subpattern(.relative(-1))) - parseTest(#"\g'+30'"#, subpattern(.relative(30))) - parseTest(#"\g'abc'"#, subpattern(.named("abc"))) + parseTest(#"\g<1>"#, subpattern(.absolute(1)), throwsError: .unsupported) + parseTest(#"\g<001>"#, subpattern(.absolute(1)), throwsError: .unsupported) + parseTest(#"\g'52'"#, subpattern(.absolute(52)), throwsError: .unsupported) + parseTest(#"\g'-01'"#, subpattern(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g'+30'"#, subpattern(.relative(30)), throwsError: .unsupported) + parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported) // Backreferences are not valid in custom character classes. parseTest(#"[\8]"#, charClass("8")) parseTest(#"[\9]"#, charClass("9")) + // These are valid references. + parseTest(#"()\1"#, concat( + capture(empty()), backreference(.absolute(1)) + ), captures: [.cap]) + parseTest(#"\1()"#, concat( + backreference(.absolute(1)), capture(empty()) + ), captures: [.cap]) + parseTest(#"()()\2"#, concat( + capture(empty()), capture(empty()), backreference(.absolute(2)) + ), captures: [.cap, .cap]) + parseTest(#"()\2()"#, concat( + capture(empty()), backreference(.absolute(2)), capture(empty()) + ), captures: [.cap, .cap]) + // MARK: Character names. 
parseTest(#"\N{abc}"#, atom(.namedCharacter("abc"))) @@ -1130,7 +1292,7 @@ extension RegexTests { parseTest(#"\N{abc}+"#, oneOrMore(of: atom(.namedCharacter("abc")))) parseTest( #"\N {2}"#, - concat(atom(.escaped(.notNewline)), exactly(2, of: " ")) + concat(atom(.escaped(.notNewline)), exactly(2, of: " ")), throwsError: .unsupported ) parseTest(#"\N{AA}"#, atom(.namedCharacter("AA"))) @@ -1156,6 +1318,9 @@ extension RegexTests { #"\p{C}+"#, oneOrMore(of: prop(.generalCategory(.other)))) + // L& defined by PCRE. + parseTest(#"\p{L&}"#, prop(.generalCategory(.casedLetter))) + // UAX44-LM3 means all of the below are equivalent. let lowercaseLetter = prop(.generalCategory(.lowercaseLetter)) parseTest(#"\p{ll}"#, lowercaseLetter) @@ -1193,13 +1358,13 @@ extension RegexTests { parseTest(#"\p{isAlphabetic}"#, prop(.binary(.alphabetic))) parseTest(#"\p{isAlpha=isFalse}"#, prop(.binary(.alphabetic, value: false))) - parseTest(#"\p{In_Runic}"#, prop(.onigurumaSpecial(.inRunic))) + parseTest(#"\p{In_Runic}"#, prop(.onigurumaSpecial(.inRunic)), throwsError: .unsupported) - parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric))) - parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace))) - parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace))) - parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed))) - parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord))) + parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric)), throwsError: .unsupported) + parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace)), throwsError: .unsupported) + parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace)), throwsError: .unsupported) + parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed)), throwsError: .unsupported) + parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord)), throwsError: .unsupported) parseTest(#"\p{alnum}"#, prop(.posix(.alnum))) parseTest(#"\p{is_alnum}"#, prop(.posix(.alnum))) @@ -1209,48 +1374,55 @@ extension RegexTests { parseTest(#"\p{word}"#, prop(.posix(.word))) 
parseTest(#"\p{xdigit}"#, prop(.posix(.xdigit))) + parseTest(#"\p{name=A}"#, prop(.named("A"))) + parseTest(#"\p{Name=B}"#, prop(.named("B"))) + parseTest(#"\p{isName=C}"#, prop(.named("C"))) + parseTest(#"\p{na=D}"#, prop(.named("D"))) + parseTest(#"\p{NA=E}"#, prop(.named("E"))) + parseTest(#"\p{na=isI}"#, prop(.named("isI"))) + // MARK: Conditionals parseTest(#"(?(1))"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)|)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)a)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)a|)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b")) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(1)a|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(1)(a|b|c)|d)"#, conditional( .groupMatched(ref(1)), trueBranch: capture(alt("a", "b", "c")), falseBranch: "d" - ), captures: [.opt]) + ), throwsError: .unsupported, captures: [.opt]) parseTest(#"(?(+3))"#, conditional( - .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(-21))"#, conditional( - .groupMatched(ref(minus: 
21)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) // Oniguruma recursion levels. parseTest(#"(?(1+1))"#, conditional( .groupMatched(ref(1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(-1+1))"#, conditional( .groupMatched(ref(minus: 1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(1-3))"#, conditional( .groupMatched(ref(1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(+1-3))"#, conditional( .groupMatched(ref(plus: 1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest( #"(?)(?(a+5))"#, @@ -1258,7 +1430,7 @@ extension RegexTests { .groupMatched(ref("a", recursionLevel: 5)), trueBranch: empty(), falseBranch: empty() )), - captures: [.named("a")] + throwsError: .unsupported, captures: [.named("a")] ) parseTest( #"(?)(?(a1-5))"#, @@ -1266,50 +1438,50 @@ extension RegexTests { .groupMatched(ref("a1", recursionLevel: -5)), trueBranch: empty(), falseBranch: empty() )), - captures: [.named("a1")] + throwsError: .unsupported, captures: [.named("a1")] ) parseTest(#"(?(1))?"#, zeroOrOne(of: conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()))) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())), throwsError: .unsupported) parseTest(#"(?(R)a|b)"#, conditional( - .recursionCheck, trueBranch: "a", falseBranch: "b")) + .recursionCheck, trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(R1))"#, conditional( - .groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty())) + 
.groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(R&abc)a|b)"#, conditional( - .groupRecursionCheck(ref("abc")), trueBranch: "a", falseBranch: "b")) + .groupRecursionCheck(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?()a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?('abc')a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(abc)a|b)"#, conditional( groupCondition(.capture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), captures: [.cap]) + ), throwsError: .unsupported, captures: [.cap]) parseTest(#"(?(?:abc)a|b)"#, conditional( groupCondition(.nonCapture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?=abc)a|b)"#, conditional( groupCondition(.lookahead, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?!abc)a|b)"#, conditional( groupCondition(.negativeLookahead, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?<=abc)a|b)"#, conditional( groupCondition(.lookbehind, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?y)(?(xxx)a|b)"#, concat( namedCapture("xxx", "y"), conditional(.groupMatched(ref("xxx")), trueBranch: "a", falseBranch: "b") - ), captures: [.named("xxx")]) + ), throwsError: .unsupported, captures: [.named("xxx")]) parseTest(#"(?(1)(?(2)(?(3)))|a)"#, conditional( .groupMatched(ref(1)), @@ -1339,115 +1511,119 @@ extension RegexTests { trueBranch: empty(), falseBranch: empty()), falseBranch: empty()), - 
falseBranch: "a")) + falseBranch: "a"), throwsError: .unsupported) parseTest(#"(?(DEFINE))"#, conditional( - .defineGroup, trueBranch: empty(), falseBranch: empty())) + .defineGroup, trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(VERSION>=3.1))"#, conditional( pcreVersionCheck(.greaterThanOrEqual, 3, 1), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(VERSION=0.1))"#, conditional( pcreVersionCheck(.equal, 0, 1), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) // MARK: Callouts // PCRE callouts - parseTest(#"(?C)"#, pcreCallout(.number(0))) - parseTest(#"(?C0)"#, pcreCallout(.number(0))) - parseTest(#"(?C20)"#, pcreCallout(.number(20))) - parseTest("(?C{abc})", pcreCallout(.string("abc"))) + parseTest(#"(?C)"#, pcreCallout(.number(0)), throwsError: .unsupported) + parseTest(#"(?C0)"#, pcreCallout(.number(0)), throwsError: .unsupported) + parseTest(#"(?C20)"#, pcreCallout(.number(20)), throwsError: .unsupported) + parseTest("(?C{abc})", pcreCallout(.string("abc")), throwsError: .unsupported) for delim in ["`", "'", "\"", "^", "%", "#", "$"] { - parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello"))) + parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello")), + throwsError: .unsupported) } // Oniguruma named callouts - parseTest("(*X)", onigurumaNamedCallout("X")) - parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t")) - parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b")) - parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b")) - parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c")) - parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c")) - parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>")) - parseTest("(*foo{a, 
b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c")) + parseTest("(*X)", onigurumaNamedCallout("X"), throwsError: .unsupported) + parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t"), throwsError: .unsupported) + parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b"), throwsError: .unsupported) + parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b"), throwsError: .unsupported) + parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c"), throwsError: .unsupported) + parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c"), throwsError: .unsupported) + parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>"), throwsError: .unsupported) + parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c"), throwsError: .unsupported) // Oniguruma 'of contents' callouts - parseTest("(?{x})", onigurumaCalloutOfContents("x")) - parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y")) - parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x")) - parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag")) - parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction)) - parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both)) - parseTest("(?{x}>)", onigurumaCalloutOfContents("x")) - parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x")) - parseTest("(?{\\})", onigurumaCalloutOfContents("\\")) + parseTest("(?{x})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) + parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y"), throwsError: .unsupported) + parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) + parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag"), throwsError: .unsupported) + parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction), throwsError: .unsupported) 
+ parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both), throwsError: .unsupported) + parseTest("(?{x}>)", onigurumaCalloutOfContents("x"), throwsError: .unsupported) + parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x"), throwsError: .unsupported) + parseTest("(?{\\})", onigurumaCalloutOfContents("\\"), throwsError: .unsupported) // MARK: Backtracking directives - parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept))) + parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept)), throwsError: .unsupported) parseTest( "(*ACCEPT:a)??", - zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")) + zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")), + throwsError: .unsupported ) - parseTest("(*:a)", backtrackingDirective(.mark, name: "a")) - parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a")) - parseTest("(*F)", backtrackingDirective(.fail)) - parseTest("(*COMMIT)", backtrackingDirective(.commit)) - parseTest("(*SKIP)", backtrackingDirective(.skip)) - parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP")) - parseTest("(*PRUNE)", backtrackingDirective(.prune)) - parseTest("(*THEN)", backtrackingDirective(.then)) + parseTest("(*:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) + parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) + parseTest("(*F)", backtrackingDirective(.fail), throwsError: .unsupported) + parseTest("(*COMMIT)", backtrackingDirective(.commit), throwsError: .unsupported) + parseTest("(*SKIP)", backtrackingDirective(.skip), throwsError: .unsupported) + parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP"), throwsError: .unsupported) + parseTest("(*PRUNE)", backtrackingDirective(.prune), throwsError: .unsupported) + parseTest("(*THEN)", backtrackingDirective(.then), throwsError: .unsupported) // MARK: Oniguruma absent functions - parseTest("(?~)", absentRepeater(empty())) - 
parseTest("(?~abc)", absentRepeater(concat("a", "b", "c"))) - parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a"))) - parseTest("(?~~)", absentRepeater("~")) - parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c"))) - parseTest("(?~(a))", absentRepeater(capture("a")), captures: []) - parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty()))) - - parseTest("(?~|abc)", absentStopper(concat("a", "b", "c"))) - parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a"))) - parseTest("(?~|~)", absentStopper("~")) - parseTest("(?~|(a))", absentStopper(capture("a")), captures: []) - parseTest("(?~|a){2}", exactly(2, of: absentStopper("a"))) - - parseTest("(?~|a|b)", absentExpression("a", "b")) - parseTest("(?~|~|~)", absentExpression("~", "~")) + parseTest("(?~)", absentRepeater(empty()), throwsError: .unsupported) + parseTest("(?~abc)", absentRepeater(concat("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a")), throwsError: .unsupported) + parseTest("(?~~)", absentRepeater("~"), throwsError: .unsupported) + parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~(a))", absentRepeater(capture("a")), throwsError: .unsupported, captures: []) + parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty())), throwsError: .unsupported) + + parseTest("(?~|abc)", absentStopper(concat("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a")), throwsError: .unsupported) + parseTest("(?~|~)", absentStopper("~"), throwsError: .unsupported) + parseTest("(?~|(a))", absentStopper(capture("a")), throwsError: .unsupported, captures: []) + parseTest("(?~|a){2}", exactly(2, of: absentStopper("a")), throwsError: .unsupported) + + parseTest("(?~|a|b)", absentExpression("a", "b"), throwsError: .unsupported) + parseTest("(?~|~|~)", absentExpression("~", "~"), throwsError: .unsupported) parseTest("(?~|(a)|(?:b))", absentExpression(capture("a"), nonCapture("b")), - 
captures: []) + throwsError: .unsupported, captures: []) parseTest("(?~|(a)|(?:(b)|c))", absentExpression( capture("a"), nonCapture(alt(capture("b"), "c")) - ), captures: [.opt]) - parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b"))) + ), throwsError: .unsupported, captures: [.opt]) + parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b")), throwsError: .unsupported) - parseTest("(?~|)", absentRangeClear()) + parseTest("(?~|)", absentRangeClear(), throwsError: .unsupported) // TODO: It's not really clear what this means, but Oniguruma parses it... // Maybe we should diagnose it? - parseTest("(?~|)+", oneOrMore(of: absentRangeClear())) + parseTest("(?~|)+", oneOrMore(of: absentRangeClear()), throwsError: .unsupported) // MARK: Global matching options parseTest("(*CR)(*UTF)(*LIMIT_DEPTH=3)", ast( empty(), opts: .newlineMatching(.carriageReturnOnly), .utfMode, .limitDepth(.init(faking: 3)) - )) + ), throwsError: .unsupported) parseTest( - "(*BSR_UNICODE)3", ast("3", opts: .newlineSequenceMatching(.anyUnicode))) + "(*BSR_UNICODE)3", ast("3", opts: .newlineSequenceMatching(.anyUnicode)), + throwsError: .unsupported) parseTest( "(*BSR_ANYCRLF)", ast( - empty(), opts: .newlineSequenceMatching(.anyCarriageReturnOrLinefeed))) + empty(), opts: .newlineSequenceMatching(.anyCarriageReturnOrLinefeed)), + throwsError: .unsupported) // TODO: Diagnose on multiple line matching modes? 
parseTest( @@ -1455,7 +1631,7 @@ extension RegexTests { ast(empty(), opts: [ .carriageReturnOnly, .linefeedOnly, .carriageAndLinefeedOnly, .anyCarriageReturnOrLinefeed, .anyUnicode, .nulCharacter - ].map { .newlineMatching($0) })) + ].map { .newlineMatching($0) }), throwsError: .unsupported) parseTest( """ @@ -1468,7 +1644,7 @@ extension RegexTests { .limitMatch(.init(faking: 2)), .notEmpty, .notEmptyAtStart, .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, .unicodeProperties - ) + ), throwsError: .unsupported ) parseTest("[(*CR)]", charClass("(", "*", "C", "R", ")")) @@ -1682,7 +1858,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageReturnOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1693,7 +1869,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1704,7 +1880,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1715,7 +1891,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1726,7 +1902,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1737,7 +1913,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1748,7 +1924,7 @@ extension RegexTests { # h """, 
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1759,7 +1935,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1770,7 +1946,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1791,7 +1967,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1802,7 +1978,7 @@ extension RegexTests { # h """, ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1813,7 +1989,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1827,7 +2003,7 @@ extension RegexTests { opts: .newlineMatching(.carriageReturnOnly), .newlineMatching(.nulCharacter) ), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) // MARK: Parse with delimiters @@ -1923,6 +2099,26 @@ extension RegexTests { """, changeMatchingOptions(matchingOptions(adding: .extended)) ) + parseWithDelimitersTest(#""" + #/ + \p{ + gc + = + digit + } + /# + """#, prop(.generalCategory(.decimalNumber))) + + parseWithDelimitersTest(#""" + #/ + \u{ + aB + B + c + } + /# + """#, scalarSeq("\u{AB}", "\u{B}", "\u{C}")) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex 
syntax. @@ -1930,30 +2126,37 @@ extension RegexTests { #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'")) parseWithDelimitersTest( #"re'(?'a_bcA0-c1A'x*)'"#, - balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")), + throwsError: .unsupported) parseWithDelimitersTest( #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) parseWithDelimitersTest( #"re'(?('a_bcA0')x|y)'"#, conditional( - .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) + .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"), + throwsError: .unsupported + ) parseWithDelimitersTest( #"re'(?('+20')\')'"#, conditional( - .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty())) - + .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()), + throwsError: .unsupported + ) parseWithDelimitersTest( - #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A")))) + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .unsupported) parseWithDelimitersTest( - #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1)) + #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1), + throwsError: .unsupported + ) parseWithDelimitersTest( - #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A")))) + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), throwsError: .unsupported) parseWithDelimitersTest( - #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'")) + #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"), throwsError: .unsupported) parseWithDelimitersTest( - #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#))) + #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#)), + throwsError: .unsupported) // Fine, because we don't end up skipping. 
delimiterLexingTest(#"re'(?'"#) @@ -1990,6 +2193,12 @@ extension RegexTests { parseNotEqualTest(#"[\p{Any}]"#, #"[[:Any:]]"#) + parseNotEqualTest(#"\u{A}"#, #"\u{B}"#) + parseNotEqualTest(#"\u{A B}"#, #"\u{B A}"#) + parseNotEqualTest(#"\u{AB}"#, #"\u{A B}"#) + parseNotEqualTest(#"[\u{AA BB}-\u{CC}]"#, #"[\u{AA DD}-\u{CC}]"#) + parseNotEqualTest(#"[\u{AA BB}-\u{DD}]"#, #"[\u{AA BB}-\u{CC}]"#) + parseNotEqualTest(#"[abc[:space:]\d]+"#, #"[abc[:upper:]\d]+"#) @@ -2117,6 +2326,20 @@ extension RegexTests { $0.as(CustomCC.self)!.members[0].as(CustomCC.Range.self)!.dashLoc }) + // MARK: Unicode scalars + + rangeTest(#"\u{65}"#, range(3 ..< 5), at: { + $0.as(AST.Atom.self)!.as(AST.Atom.Scalar.self)!.location + }) + + rangeTest(#"\u{ 65 58 }"#, range(5 ..< 7), at: { + $0.as(AST.Atom.self)!.as(AST.Atom.ScalarSequence.self)!.scalars[0].location + }) + + rangeTest(#"\u{ 65 58 }"#, range(8 ..< 10), at: { + $0.as(AST.Atom.self)!.as(AST.Atom.ScalarSequence.self)!.scalars[1].location + }) + // MARK: References rangeTest(#"\k"#, range(3 ..< 6), at: { @@ -2297,6 +2520,13 @@ extension RegexTests { diagnosticTest("[[::]]", .emptyProperty) diagnosticTest("[[:=:]]", .emptyProperty) + diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand) + + diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) + diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) + diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) + diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) @@ -2323,6 +2553,7 @@ extension RegexTests { diagnosticTest(#"\e\#u{301}"#, .invalidEscape("e\u{301}")) diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é")) diagnosticTest(#"\˂"#, .invalidEscape("˂")) + diagnosticTest(#"\d\#u{301}"#, .invalidEscape("d\u{301}")) // MARK: Character properties @@ -2334,6 +2565,10 @@ extension RegexTests { diagnosticTest(#"\p{aaa\p{b}}"#, 
.unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) + // We only filter pattern whitespace, which doesn't include things like + // non-breaking spaces. + diagnosticTest(#"\p{L\#u{A0}l}"#, .unknownProperty(key: nil, value: "L\u{A0}l")) + // MARK: Matching options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) @@ -2394,6 +2629,12 @@ extension RegexTests { diagnosticTest("(?x)(? : )", .unknownGroupKind("? ")) + diagnosticTest("(?)(?)", .duplicateNamedCapture("x")) + diagnosticTest("(?)|(?)", .duplicateNamedCapture("x")) + diagnosticTest("((?))(?)", .duplicateNamedCapture("x")) + diagnosticTest("(|(?))(?)", .duplicateNamedCapture("x")) + diagnosticTest("(?)(?)(?)", .duplicateNamedCapture("x")) + // MARK: Quantifiers diagnosticTest("*", .quantifierRequiresOperand("*")) @@ -2402,11 +2643,43 @@ extension RegexTests { diagnosticTest("*?", .quantifierRequiresOperand("*?")) diagnosticTest("{5}", .quantifierRequiresOperand("{5}")) diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}")) + diagnosticTest("a{3,2}", .invalidQuantifierRange(3, 2)) + + // These are not quantifiable. 
+ diagnosticTest(#"\b?"#, .notQuantifiable) + diagnosticTest(#"\B*"#, .notQuantifiable) + diagnosticTest(#"\A+"#, .notQuantifiable) + diagnosticTest(#"\Z??"#, .notQuantifiable) + diagnosticTest(#"\G*?"#, .notQuantifiable) + diagnosticTest(#"\z+?"#, .notQuantifiable) + diagnosticTest(#"^*"#, .notQuantifiable) + diagnosticTest(#"$?"#, .notQuantifiable) + diagnosticTest(#"(?=a)+"#, .notQuantifiable) + diagnosticTest(#"(?i)*"#, .notQuantifiable) + diagnosticTest(#"\K{1}"#, .unsupported(#"'\K'"#)) + diagnosticTest(#"\y{2,5}"#, .notQuantifiable) + diagnosticTest(#"\Y{3,}"#, .notQuantifiable) // MARK: Unicode scalars diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ "#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{}"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ }"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ }"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ G}"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{G }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ G }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ GH }"#, .expectedNumber("GH", kind: .hex)) + diagnosticTest(#"\u{ G H }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ ABC G }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ FFFFFFFFF A }"#, .numberOverflow("FFFFFFFFF")) + + diagnosticTest(#"[\d--\u{a b}]"#, .unsupported("scalar sequence in custom character class")) + diagnosticTest(#"[\d--[\u{a b}]]"#, .unsupported("scalar sequence in custom character class")) + // MARK: Matching options diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret) @@ -2441,6 +2714,16 @@ extension RegexTests { diagnosticTest(#"\k"#, .expectedNumber("", kind: .decimal)) diagnosticTest(#"\k<1+>"#, .expectedNumber("", kind: .decimal)) + diagnosticTest(#"()\k<1+1>"#, .unsupported("recursion level")) + diagnosticTest(#"()\k<1-1>"#, 
.unsupported("recursion level")) + + diagnosticTest(#"\k<0>"#, .cannotReferToWholePattern) + diagnosticTest(#"\1"#, .invalidReference(1)) + diagnosticTest(#"(?:)\1"#, .invalidReference(1)) + diagnosticTest(#"()\2"#, .invalidReference(2)) + diagnosticTest(#"\2()"#, .invalidReference(2)) + diagnosticTest(#"(?:)()\2"#, .invalidReference(2)) + diagnosticTest(#"(?:)(?:)\2"#, .invalidReference(2)) // MARK: Conditionals @@ -2479,13 +2762,13 @@ extension RegexTests { diagnosticTest("(*MARK)", .backtrackingDirectiveMustHaveName("MARK")) diagnosticTest("(*:)", .expectedNonEmptyContents) - diagnosticTest("(*MARK:a)?", .notQuantifiable) - diagnosticTest("(*FAIL)+", .notQuantifiable) - diagnosticTest("(*COMMIT:b)*", .notQuantifiable) - diagnosticTest("(*PRUNE:a)??", .notQuantifiable) - diagnosticTest("(*SKIP:a)*?", .notQuantifiable) - diagnosticTest("(*F)+?", .notQuantifiable) - diagnosticTest("(*:a){2}", .notQuantifiable) + diagnosticTest("(*MARK:a)?", .unsupported("backtracking directive")) + diagnosticTest("(*FAIL)+", .unsupported("backtracking directive")) + diagnosticTest("(*COMMIT:b)*", .unsupported("backtracking directive")) + diagnosticTest("(*PRUNE:a)??", .unsupported("backtracking directive")) + diagnosticTest("(*SKIP:a)*?", .unsupported("backtracking directive")) + diagnosticTest("(*F)+?", .unsupported("backtracking directive")) + diagnosticTest("(*:a){2}", .unsupported("backtracking directive")) // MARK: Oniguruma absent functions @@ -2543,5 +2826,9 @@ extension RegexTests { func testCompilerInterfaceDiagnostics() { compilerInterfaceDiagnosticMessageTest( "#/[x*/#", "cannot parse regular expression: expected ']'") + compilerInterfaceDiagnosticMessageTest( + "/a{3,2}/", "cannot parse regular expression: range lower bound '3' must be less than or equal to upper bound '2'") + compilerInterfaceDiagnosticMessageTest( + #"#/\u{}/#"#, "cannot parse regular expression: expected hexadecimal number") } } diff --git a/Tests/RegexTests/UTS18Tests.swift 
b/Tests/RegexTests/UTS18Tests.swift new file mode 100644 index 000000000..d13b47b8d --- /dev/null +++ b/Tests/RegexTests/UTS18Tests.swift @@ -0,0 +1,618 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +// This test suite includes tests that verify the behavior of `Regex` as it +// relates to Unicode Technical Standard #18: Unicode Regular Expressions. +// +// Please note: Quotations of UTS18 in this file mostly use 'Character' to mean +// Unicode code point, and 'String' to mean 'sequence of code points' — they +// are not the Swift meanings of those terms. +// +// See https://unicode.org/reports/tr18/ for more. + +import XCTest +@testable // for internal `matches(of:)` +import _StringProcessing + +extension UnicodeScalar { + var value4Digits: String { + let valueString = String(value, radix: 16, uppercase: true) + if valueString.count >= 4 { return valueString } + return String(repeating: "0", count: 4 - valueString.count) + valueString + } +} + +class UTS18Tests: XCTestCase { + var input: String { + "ABCdefghîøu\u{308}\u{FFF0} -–—[]123" + // 01234567890 1 234567890 + // 0 10 20 + } +} + +fileprivate func regex(_ pattern: String) -> Regex { + try! 
Regex(pattern, as: Substring.self) +} + +fileprivate extension String { + subscript(pos bounds: R) -> Substring + where R.Bound == Int + { + let bounds = bounds.relative(to: 0..( + _ input: String, + _ r: Regex, + _ output: Output, + file: StaticString = #file, + line: UInt = #line) +{ + XCTAssertEqual(input.firstMatch(of: r)?.output, output, file: file, line: line) +} + +#if os(Linux) +func XCTExpectFailure(_ message: String? = nil, body: () -> Void) {} +#endif + +// MARK: - Basic Unicode Support: Level 1 + +// C1. An implementation claiming conformance to Level 1 of this specification +// shall meet the requirements described in the following sections: +extension UTS18Tests { + // RL1.1 Hex Notation + // + // To meet this requirement, an implementation shall supply a mechanism for + // specifying any Unicode code point (from U+0000 to U+10FFFF), using the + // hexadecimal code point representation. + func testHexNotation() { + expectFirstMatch("ab", regex(#"\u{61}\u{62}"#), "ab") + expectFirstMatch("𝄞", regex(#"\u{1D11E}"#), "𝄞") + } + + // 1.1.1 Hex Notation and Normalization + // + // TODO: Does this section make a recommendation? + + // RL1.2 Properties + // To meet this requirement, an implementation shall provide at least a + // minimal list of properties, consisting of the following: + // - General_Category + // - Script and Script_Extensions + // - Alphabetic + // - Uppercase + // - Lowercase + // - White_Space + // - Noncharacter_Code_Point + // - Default_Ignorable_Code_Point + // - ANY, ASCII, ASSIGNED + // The values for these properties must follow the Unicode definitions, and + // include the property and property value aliases from the UCD. Matching of + // Binary, Enumerated, Catalog, and Name values must follow the Matching + // Rules from [UAX44] with one exception: implementations are not required + // to ignore an initial prefix string of "is" in property values. 
+ func testProperties() { + // General_Category + expectFirstMatch(input, regex(#"\p{Lu}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{lu}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{uppercase letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase Letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase_Letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{uppercaseletter}+"#), input[pos: ..<3]) + + expectFirstMatch(input, regex(#"\p{P}+"#), "-–—[]") + expectFirstMatch(input, regex(#"\p{Pd}+"#), "-–—") + + expectFirstMatch(input, regex(#"\p{Any}+"#), input[...]) + expectFirstMatch(input, regex(#"\p{Assigned}+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"\p{ASCII}+"#), input[pos: ..<8]) + + // Script and Script_Extensions + // U+3042 あ HIRAGANA LETTER A Hira {Hira} + XCTAssertTrue("\u{3042}".contains(regex(#"\p{Hira}"#))) + XCTAssertTrue("\u{3042}".contains(regex(#"\p{sc=Hira}"#))) + XCTAssertTrue("\u{3042}".contains(regex(#"\p{scx=Hira}"#))) + // U+30FC ー KATAKANA-HIRAGANA PROLONGED SOUND MARK Zyyy = Common {Hira, Kana} + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{Hira}"#))) // Implicit = Script_Extensions + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{Kana}"#))) + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{sc=Zyyy}"#))) // Explicit = Script + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{scx=Hira}"#))) + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{scx=Kana}"#))) + XCTAssertFalse("\u{30FC}".contains(regex(#"\p{sc=Hira}"#))) + XCTAssertFalse("\u{30FC}".contains(regex(#"\p{sc=Kana}"#))) + + // Uppercase, etc + expectFirstMatch(input, regex(#"\p{Uppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{isUppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase=true}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{is Uppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{is uppercase = true}+"#), 
input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{lowercase}+"#), input[pos: 3..<11]) + expectFirstMatch(input, regex(#"\p{whitespace}+"#), input[pos: 12..<13]) + + // Block vs Writing System + let greekScalar = "Θ" // U+0398 + let greekExtendedScalar = "ἀ" // U+1F00 + XCTAssertTrue(greekScalar.contains(regex(#"\p{Greek}"#))) + XCTAssertTrue(greekExtendedScalar.contains(regex(#"\p{Greek}"#))) + } + + func testProperties_XFail() { + XCTExpectFailure("Need to support 'age' and 'block' properties") { + // XCTAssertFalse("z".contains(#/\p{age=3.1}/#)) + XCTFail(#"\(#/\p{age=3.1}/#)"#) + // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#)) + XCTFail(#"\(#/\p{Block=Greek}/#)"#) + } + } + + // RL1.2a Compatibility Properties + // To meet this requirement, an implementation shall provide the properties + // listed in Annex C: Compatibility Properties, with the property values as + // listed there. Such an implementation shall document whether it is using + // the Standard Recommendation or POSIX-compatible properties. 
+ func testCompatibilityProperties() throws { + // FIXME: These tests seem insufficient + expectFirstMatch(input, regex(#"[[:alpha:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:upper:]]+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"[[:lower:]]+"#), input[pos: 3..<11]) + expectFirstMatch(input, regex(#"[[:punct:]]+"#), input[pos: 13..<18]) + expectFirstMatch(input, regex(#"[[:digit:]]+"#), input[pos: 18..<21]) + expectFirstMatch(input, regex(#"[[:xdigit:]]+"#), input[pos: ..<6]) + expectFirstMatch(input, regex(#"[[:alnum:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:space:]]+"#), input[pos: 12..<13]) + // TODO: blank + // TODO: cntrl + expectFirstMatch(input, regex(#"[[:graph:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:print:]]+"#), input[...]) + expectFirstMatch(input, regex(#"[[:word:]]+"#), input[pos: ..<11]) + } + + //RL1.3 Subtraction and Intersection + // + // To meet this requirement, an implementation shall supply mechanisms for + // union, intersection and set-difference of sets of characters within + // regular expression character class expressions. 
+ func testSubtractionAndIntersection() throws { + // Non-ASCII letters + expectFirstMatch(input, regex(#"[\p{Letter}--\p{ASCII}]+"#), input[pos: 8..<11]) + // Digits that aren't 1 or 2 + expectFirstMatch(input, regex(#"[\p{digit}--[12]]+"#), input[pos: 20..<21]) + + // ASCII-only letters + expectFirstMatch(input, regex(#"[\p{Letter}&&\p{ASCII}]+"#), input[pos: ..<8]) + // Digits that are 2 or 3 + expectFirstMatch(input, regex(#"[\p{digit}&&[23]]+"#), input[pos: 19..<21]) + + // Non-ASCII lowercase + non-lowercase ASCII + expectFirstMatch(input, regex(#"[\p{lowercase}~~\p{ascii}]+"#), input[pos: ..<3]) + XCTAssertTrue("123%&^ABC".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#))) + } + + func testSubtractionAndIntersectionPrecedence() { + expectFirstMatch("ABC123-", regex(#"[[:alnum:]]*-"#), "ABC123-") + expectFirstMatch("ABC123-", regex(#"[[:alnum:]--\p{Uppercase}]*-"#), "123-") + // Union binds more closely than difference + expectFirstMatch("ABC123-", regex(#"[[:alnum:]--\p{Uppercase}[:digit:]]*-"#), "-") + // TODO: Test for intersection precedence + } + + // RL1.4 Simple Word Boundaries + // To meet this requirement, an implementation shall extend the word boundary + // mechanism so that: + // - The class of includes all the Alphabetic values from the + // Unicode character database, from UnicodeData.txt, plus the decimals + // (General_Category=Decimal_Number, or equivalently Numeric_Type=Decimal), + // and the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER + // (Join_Control=True). See also Annex C: Compatibility Properties. + // - Nonspacing marks are never divided from their base characters, and + // otherwise ignored in locating boundaries. 
+ func testSimpleWordBoundaries() { + let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.unicodeLevel1) + expectFirstMatch(input, simpleWordRegex, input[pos: ..<11]) + expectFirstMatch("don't", simpleWordRegex, "don") + expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café") + } + + // RL1.5 Simple Loose Matches + // + // To meet this requirement, if an implementation provides for case- + // insensitive matching, then it shall provide at least the simple, default + // Unicode case-insensitive matching, and specify which properties are closed + // and which are not. + // + // To meet this requirement, if an implementation provides for case + // conversions, then it shall provide at least the simple, default Unicode + // case folding. + func testSimpleLooseMatches() { + expectFirstMatch("Dåb", regex(#"Dåb"#).ignoresCase(), "Dåb") + expectFirstMatch("dÅB", regex(#"Dåb"#).ignoresCase(), "dÅB") + expectFirstMatch("D\u{212B}B", regex(#"Dåb"#).ignoresCase(), "D\u{212B}B") + } + + func testSimpleLooseMatches_XFail() { + XCTExpectFailure("Need case folding support") { + let sigmas = "σΣς" + expectFirstMatch(sigmas, regex(#"σ+"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"Σ+"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"ς+"#).ignoresCase(), sigmas[...]) + + // TODO: Test German sharp S + // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]] + } + } + + // RL1.6 Line Boundaries + // + // To meet this requirement, if an implementation provides for line-boundary + // testing, it shall recognize not only CRLF, LF, CR, but also NEL (U+0085), + // PARAGRAPH SEPARATOR (U+2029) and LINE SEPARATOR (U+2028). 
+ func testLineBoundaries() { + let lineInput = """ + 01 + 02\r\ + 03\n\ + 04\u{a}\ + 05\u{b}\ + 06\u{c}\ + 07\u{d}\ + 08\u{d}\u{a}\ + 09\u{85}\ + 10\u{2028}\ + 11\u{2029}\ + 12 + """ + // Check the input counts + var lines = lineInput.matches(of: regex(#"\d{2}"#)) + XCTAssertEqual(lines.count, 12) + // Test \R - newline sequence + lines = lineInput.matches(of: regex(#"\d{2}\R^"#).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Test \v - vertical space + lines = lineInput.matches(of: regex(#"\d{2}\v^"#).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Test anchors as line boundaries + lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 12) + // Test that dot does not match line endings + lines = lineInput.matches(of: regex(#".+"#)) + XCTAssertEqual(lines.count, 12) + + // Unicode scalar semantics - \R still matches all, including \r\n sequence + lines = lineInput.matches( + of: regex(#"\d{2}\R(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Unicode scalar semantics - \v matches all except for \r\n sequence + lines = lineInput.matches( + of: regex(#"\d{2}\v(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 10) + + // Does not contain an empty line + XCTAssertFalse(lineInput.contains(regex(#"^$"#))) + // Does contain an empty line (between \n and \r, which are reversed here) + let empty = "\n\r" + XCTAssertTrue(empty.contains(regex(#"^$"#).anchorsMatchLineEndings())) + } + + // RL1.7 Supplementary Code Points + // + // To meet this requirement, an implementation shall handle the full range of + // Unicode code points, including values from U+FFFF to U+10FFFF. In + // particular, where UTF-16 is used, a sequence consisting of a leading + // surrogate followed by a trailing surrogate shall be handled as a single + // code point in matching. 
+ func testSupplementaryCodePoints() { + XCTAssertTrue("👍".contains(regex(#"\u{1F44D}"#))) + XCTAssertTrue("👍".contains(regex(#"[\u{1F440}-\u{1F44F}]"#))) + XCTAssertTrue("👍👎".contains(regex(#"^[\u{1F440}-\u{1F44F}]+$"#))) + } +} + +// MARK: - Extended Unicode Support: Level 2 + +// C2. An implementation claiming conformance to Level 2 of this specification +// shall satisfy C1, and meet the requirements described in the following +// sections: +extension UTS18Tests { + // RL2.1 Canonical Equivalents + // + // Specific recommendation? + func testCanonicalEquivalents() { + let equivalents = [ + "\u{006f}\u{031b}\u{0323}", // o + horn + dot_below + "\u{006f}\u{0323}\u{031b}", // o + dot_below + horn + "\u{01a1}\u{0323}", // o-horn + dot_below + "\u{1ecd}\u{031b}", // o-dot_below + horn + "\u{1ee3}", // o-horn-dot_below + ] + + let regexes = [ + regex(#"\u{006f}\u{031b}\u{0323}"#), // o + horn + dot_below + regex(#"\u{006f}\u{0323}\u{031b}"#), // o + dot_below + horn + regex(#"\u{01a1}\u{0323}"#), // o-horn + dot_below + regex(#"\u{1ecd}\u{031b}"#), // o-dot_below + horn + regex(#"\u{1ee3}"#), // o-horn-dot_below + ] + + // Default: Grapheme cluster semantics + for (regexNum, regex) in regexes.enumerated() { + for (equivNum, equiv) in equivalents.enumerated() { + XCTAssertTrue( + equiv.contains(regex), + "Grapheme cluster semantics: Regex \(regexNum) didn't match with string \(equivNum)") + } + } + + // Unicode scalar semantics + for (regexNum, regex) in regexes.enumerated() { + for (equivNum, equiv) in equivalents.enumerated() { + let regex = regex.matchingSemantics(.unicodeScalar) + if regexNum == equivNum { + XCTAssertTrue( + equiv.contains(regex), + "Unicode scalar semantics: Regex \(regexNum) didn't match with string \(equivNum)") + } else { + XCTAssertFalse( + equiv.contains(regex), + "Unicode scalar semantics: Regex \(regexNum) incorrectly matched with string \(equivNum)") + } + } + } + } + + // RL2.2 Extended Grapheme Clusters and Character Classes with 
Strings + // + // To meet this requirement, an implementation shall provide a mechanism for + // matching against an arbitrary extended grapheme cluster, Character Classes + // with Strings, and extended grapheme cluster boundaries. + func testExtendedGraphemeClusters() { + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.$"#))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#).matchingSemantics(.unicodeScalar))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.+\y"#).matchingSemantics(.unicodeScalar))) + } + + func testCharacterClassesWithStrings() { + let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#) + XCTAssertTrue("🧐".contains(regex)) + XCTAssertTrue("🇧🇫".contains(regex)) + } + + // RL2.3 Default Word Boundaries + // + // To meet this requirement, an implementation shall provide a mechanism for + // matching Unicode default word boundaries. + func testDefaultWordBoundaries() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.4 Default Case Conversion + // + // To meet this requirement, if an implementation provides for case + // conversions, then it shall provide at least the full, default Unicode case + // folding. + func testDefaultCaseConversion() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.5 Name Properties + // + // To meet this requirement, an implementation shall support individually + // named characters. 
+ func testNameProperty() throws { + // Name property + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK SPACE}"#))) + // Name property and Matching Rules + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=zerowidthno breakspace}"#))) + + // Computed name + XCTAssertTrue("강".contains(regex(#"\p{name=HANGUL SYLLABLE GANG}"#))) + + // Graphic symbol + XCTAssertTrue("\u{1F514}".contains(regex(#"\p{name=BELL}"#))) + + // Name match failures + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BRAKE SPACE}"#))) + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK SPACE ZZZZ}"#))) + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK}"#))) + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=z}"#))) + } + + func testNameProperty_XFail() throws { + XCTExpectFailure("Need more expansive name alias matching") { + // Name_Alias property + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=BYTE ORDER MARK}"#))) + // Name_Alias property (again) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=BOM}"#))) + + // Control character + XCTAssertTrue("\u{7}".contains(regex(#"\p{name=BEL}"#))) + } + } + + func testIndividuallyNamedCharacters() { + XCTAssertTrue("\u{263A}".contains(regex(#"\N{WHITE SMILING FACE}"#))) + XCTAssertTrue("\u{3B1}".contains(regex(#"\N{GREEK SMALL LETTER ALPHA}"#))) + XCTAssertTrue("\u{10450}".contains(regex(#"\N{SHAVIAN LETTER PEEP}"#))) + + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{ZERO WIDTH NO-BREAK SPACE}"#))) + XCTAssertTrue("강".contains(regex(#"\N{HANGUL SYLLABLE GANG}"#))) + XCTAssertTrue("\u{1F514}".contains(regex(#"\N{BELL}"#))) + XCTAssertTrue("🐯".contains(regex(#"\N{TIGER FACE}"#))) + XCTAssertFalse("🐯".contains(regex(#"\N{TIEGR FACE}"#))) + + // Loose matching + XCTAssertTrue("\u{263A}".contains(regex(#"\N{whitesmilingface}"#))) + XCTAssertTrue("\u{263A}".contains(regex(#"\N{wHiTe_sMiLiNg_fAcE}"#))) + XCTAssertTrue("\u{263A}".contains(regex(#"\N{White 
Smiling-Face}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{zerowidthno breakspace}"#))) + + // Matching semantic level + XCTAssertFalse("👩‍👩‍👧‍👦".contains(regex(#".\N{ZERO WIDTH JOINER}"#))) + XCTAssertTrue("👩‍👩‍👧‍👦".contains(regex(#"(?u).\N{ZERO WIDTH JOINER}"#))) + } + + func testIndividuallyNamedCharacters_XFail() { + XCTExpectFailure("Need to support named chars in custom character classes") { + XCTFail(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#) + // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) + } + + XCTExpectFailure("Other named char failures -- investigate") { + XCTAssertTrue("\u{C}".contains(regex(#"\N{FORM FEED}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BYTE ORDER MARK}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BOM}"#))) + XCTAssertTrue("\u{7}".contains(regex(#"\N{BEL}"#))) + } + + XCTExpectFailure("Need to recognize invalid names at compile time") { + XCTFail("This should be a compilation error, not a match failure:") + XCTAssertFalse("abc".contains(regex(#"\N{NOT AN ACTUAL CHARACTER NAME}"#))) + } + } + + // RL2.6 Wildcards in Property Values + // + // To meet this requirement, an implementation shall support wildcards in + // Unicode property values. + func testWildcardsInPropertyValues() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.7 Full Properties + // + // To meet this requirement, an implementation shall support all of the + // properties listed below that are in the supported version of the Unicode + // Standard (or Unicode Technical Standard, respectively), with values that + // match the Unicode definitions for that version. 
+ func testFullProperties() { + // MARK: General + // Name (Name_Alias) + // Block + // Age + // General_Category + // Script (Script_Extensions) + // White_Space + // Alphabetic + // Hangul_Syllable_Type + // Noncharacter_Code_Point + // Default_Ignorable_Code_Point + // Deprecated + // Logical_Order_Exception + // Variation_Selector + + // MARK: Numeric + // Numeric_Value + // Numeric_Type + // Hex_Digit + // ASCII_Hex_Digit + + // MARK: Identifiers + // ID_Continue + // ID_Start + // XID_Continue + // XID_Start + // Pattern_Syntax + // Pattern_White_Space + // Identifier_Status + // Identifier_Type + + // MARK: CJK + // Ideographic + // Unified_Ideograph + // Radical + // IDS_Binary_Operator + // IDS_Trinary_Operator + // Equivalent_Unified_Ideograph + XCTExpectFailure { + XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#) + // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)) + } + + // MARK: Case + // Uppercase + // Lowercase + // Simple_Lowercase_Mapping + // Simple_Titlecase_Mapping + // Simple_Uppercase_Mapping + // Simple_Case_Folding + // Soft_Dotted + // Cased + // Case_Ignorable + // Changes_When_Lowercased + // Changes_When_Uppercased + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased}"#))) + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased=true}"#))) + XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Uppercased}"#))) + // Changes_When_Titlecased + // Changes_When_Casefolded + // Changes_When_Casemapped + + // MARK: Normalization + // Canonical_Combining_Class + // Decomposition_Type + // NFC_Quick_Check + // NFKC_Quick_Check + // NFD_Quick_Check + // NFKD_Quick_Check + // NFKC_Casefold + // Changes_When_NFKC_Casefolded + + // MARK: Emoji + // Emoji + // Emoji_Presentation + // Emoji_Modifier + // Emoji_Modifier_Base + // Emoji_Component + // Extended_Pictographic + // Basic_Emoji* + // Emoji_Keycap_Sequence* + // RGI_Emoji_Modifier_Sequence* + // RGI_Emoji_Flag_Sequence* + // 
RGI_Emoji_Tag_Sequence* + // RGI_Emoji_ZWJ_Sequence* + // RGI_Emoji* + + // MARK: Shaping and Rendering + // Join_Control + // Joining_Group + // Joining_Type + // Vertical_Orientation + // Line_Break + // Grapheme_Cluster_Break + // Sentence_Break + // Word_Break + // East_Asian_Width + // Prepended_Concatenation_Mark + + // MARK: Bidirectional + // Bidi_Class + // Bidi_Control + // Bidi_Mirrored + // Bidi_Mirroring_Glyph + // Bidi_Paired_Bracket + // Bidi_Paired_Bracket_Type + + // MARK: Miscellaneous + // Math + // Quotation_Mark + // Dash + // Sentence_Terminal + // Terminal_Punctuation + // Diacritic + // Extender + // Grapheme_Base + // Grapheme_Extend + // Regional_Indicator + } +}