From 7d2faa8dc47a0ecd3c48597c4c3e296cb0f8502d Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Fri, 9 May 2025 15:49:34 +0900 Subject: [PATCH 1/7] Import implementation for String Encoding Names from other repo. - source: https://github.com/YOCKOW/SF-StringEncodingNameImpl --- .../String/String+Encoding+Names.swift | 551 ++++++++++++++++++ 1 file changed, 551 insertions(+) create mode 100644 Sources/FoundationEssentials/String/String+Encoding+Names.swift diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift new file mode 100644 index 000000000..07ca26c21 --- /dev/null +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -0,0 +1,551 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// + + +// MARK: - Private extensions for parsing encoding names + +private extension Unicode.Scalar { + var _isASCIINumeric: Bool { + return ("0"..."9").contains(self) + } + + var _asciiNumericValue: Int { + assert(_isASCIINumeric) + return Int(self.value - 0x30) + } + + /// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace". 
+ /// + /// Reference: https://infra.spec.whatwg.org/#ascii-whitespace + var _isASCIIWhitespace: Bool { + switch self.value { + case 0x09, 0x0A, 0x0C, 0x0D, 0x20: true + default: false + } + } +} + +private extension String { + var _trimmed: Substring.UnicodeScalarView { + let scalars = self.unicodeScalars + let isNonWhitespace: (Unicode.Scalar) -> Bool = { !$0._isASCIIWhitespace } + guard let firstIndexOfNonWhitespace = scalars.firstIndex(where: isNonWhitespace), + let lastIndexOfNonWhitespace = scalars.lastIndex(where: isNonWhitespace) else { + return Substring.UnicodeScalarView() + } + return scalars[firstIndexOfNonWhitespace...lastIndexOfNonWhitespace] + } +} + +/// A type that holds a `Unicode.Scalar` where its value is compared case-insensitively with others' +/// _if the value is within ASCII range_. +private struct ASCIICaseInsensitiveUnicodeScalar: Equatable, + ExpressibleByUnicodeScalarLiteral { + typealias UnicodeScalarLiteralType = Unicode.Scalar.UnicodeScalarLiteralType + + let scalar: Unicode.Scalar + + @inlinable + init(_ scalar: Unicode.Scalar) { + assert(scalar.isASCII) + self.scalar = scalar + } + + init(unicodeScalarLiteral value: Unicode.Scalar.UnicodeScalarLiteralType) { + self.init(Unicode.Scalar(unicodeScalarLiteral: value)) + } + + @inlinable + static func ==( + lhs: ASCIICaseInsensitiveUnicodeScalar, + rhs: ASCIICaseInsensitiveUnicodeScalar + ) -> Bool { + if lhs.scalar == rhs.scalar { + return true + } else if ("A"..."Z").contains(lhs.scalar) { + return lhs.scalar.value + 0x20 == rhs.scalar.value + } else if ("a"..."z").contains(lhs.scalar) { + return lhs.scalar.value - 0x20 == rhs.scalar.value + } + return false + } +} + +/// A type to tokenize string for `String.Encoding` names. +private protocol StringEncodingNameTokenizer: ~Copyable { + associatedtype Token: Equatable + init(name: String) + mutating func nextToken() throws -> Token? 
+} + +extension StringEncodingNameTokenizer where Self: ~Copyable { + mutating func hasEqualTokens(with other: consuming Self) throws -> Bool { + while let myToken = try self.nextToken() { + guard let otherToken = try other.nextToken(), + myToken == otherToken else { + return false + } + } + return try other.nextToken() == nil + } +} + +/// ICU-independent parser that follows [Charset Alias Matching](https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching). +private struct UTS22Tokenizer: StringEncodingNameTokenizer, ~Copyable { + enum Token: Equatable { + case numeric(Int) + case alphabet(ASCIICaseInsensitiveUnicodeScalar) + } + + enum Error: Swift.Error { + case tooLargeNumericValue + } + + let scalars: String.UnicodeScalarView + + private var _currentIndex: String.UnicodeScalarView.Index + + init(name: String) { + self.scalars = name.unicodeScalars + self._currentIndex = scalars.startIndex + } + + mutating func nextToken() throws -> Token? { + guard _currentIndex < scalars.endIndex else { + return nil + } + + let scalar = scalars[_currentIndex] + switch scalar { + case "0"..."9": + // Parse a numeric value ignoring leading zeros. + // + // NOTE: To prevent the value from overflow, a threhold is set here. + // The max number of digits to be expected is 8 as of now: i.g. `csISO42JISC62261978`. + // It wouldn't matter to throw an error in practice when the value is too large. 
+ + let threshold: Int = 999_999_999 + var value = scalar._asciiNumericValue + scalars.formIndex(after: &_currentIndex) + while _currentIndex < scalars.endIndex { + let currentScalar = scalars[_currentIndex] + guard currentScalar._isASCIINumeric else { + break + } + value = value * 10 + currentScalar._asciiNumericValue + if value > threshold { + throw Error.tooLargeNumericValue + } + scalars.formIndex(after: &_currentIndex) + } + return .numeric(value) + case "A"..."Z", "a"..."z": + scalars.formIndex(after: &_currentIndex) + return .alphabet(ASCIICaseInsensitiveUnicodeScalar(scalar)) + default: + scalars.formIndex(after: &_currentIndex) + if _currentIndex < scalars.endIndex { + return try nextToken() + } + return nil + } + } +} + + +/// A parser that tokenizes a string into `ASCIICaseInsensitiveUnicodeScalar`s. +private struct ASCIICaseInsensitiveTokenizer: StringEncodingNameTokenizer, ~Copyable { + typealias Token = ASCIICaseInsensitiveUnicodeScalar + + enum Error: Swift.Error { + case nonASCII + } + + let scalars: Substring.UnicodeScalarView + + var _currentIndex: Substring.UnicodeScalarView.Index + + init(name: String) { + self.scalars = name._trimmed + self._currentIndex = scalars.startIndex + } + + mutating func nextToken() throws -> Token? { + guard _currentIndex < scalars.endIndex else { + return nil + } + let scalar = scalars[_currentIndex] + guard scalar.isASCII else { throw Error.nonASCII } + defer { + scalars.formIndex(after: &_currentIndex) + } + return ASCIICaseInsensitiveUnicodeScalar(scalar) + } +} + + +private extension String { + func isEqual<T>( + to other: String, + tokenizedBy tokenizer: T.Type + ) -> Bool where T: StringEncodingNameTokenizer, T: ~Copyable { + do { + var myTokenizer = T(name: self) + let otherTokenizer = T(name: other) + return try myTokenizer.hasEqualTokens(with: otherTokenizer) + } catch { + // Any errors imply that `self` or `other` contains invalid characters. 
+ return false + } + } +} + + +// MARK: - IANA Charset Names + +/// Info about IANA Charset. +private struct IANACharset { + /// Preferred MIME Name + let preferredMIMEName: String? + + /// The name of this charset + let name: String + + /// The aliases of this charset + let aliases: Array<String> + + var representativeName: String { + return preferredMIMEName ?? name + } + + init(preferredMIMEName: String?, name: String, aliases: Array<String>) { + self.preferredMIMEName = preferredMIMEName + self.name = name + self.aliases = aliases + } + + func matches<T>( + _ string: String, + tokenizedBy tokenizer: T.Type + ) -> Bool where T: StringEncodingNameTokenizer, T: ~Copyable { + if let preferredMIMEName = self.preferredMIMEName, + preferredMIMEName.isEqual(to: string, tokenizedBy: tokenizer) { + return true + } + if name.isEqual(to: string, tokenizedBy: tokenizer) { + return true + } + for alias in aliases { + if alias.isEqual(to: string, tokenizedBy: tokenizer) { + return true + } + } + return false + } +} + +// Extracted only necessary charsets from https://www.iana.org/assignments/character-sets/character-sets.xhtml +extension IANACharset { + /// IANA Character Set `US-ASCII` + static let usASCII = IANACharset( + preferredMIMEName: "US-ASCII", + name: "US-ASCII", + aliases: [ + "iso-ir-6", + "ANSI_X3.4-1968", + "ANSI_X3.4-1986", + "ISO_646.irv:1991", + "ISO646-US", + "US-ASCII", + "us", + "IBM367", + "cp367", + "csASCII", + ] + ) + + /// IANA Character Set `ISO-8859-1` + static let iso8859_1 = IANACharset( + preferredMIMEName: "ISO-8859-1", + name: "ISO_8859-1:1987", + aliases: [ + "iso-ir-100", + "ISO_8859-1", + "ISO-8859-1", + "latin1", + "l1", + "IBM819", + "CP819", + "csISOLatin1", + ] + ) + + /// IANA Character Set `ISO-8859-2` + static let iso8859_2 = IANACharset( + preferredMIMEName: "ISO-8859-2", + name: "ISO_8859-2:1987", + aliases: [ + "iso-ir-101", + "ISO_8859-2", + "ISO-8859-2", + "latin2", + "l2", + "csISOLatin2", + ] + ) + + /// IANA Character Set `Shift_JIS` + 
static let shiftJIS = IANACharset( + preferredMIMEName: "Shift_JIS", + name: "Shift_JIS", + aliases: [ + "MS_Kanji", + "csShiftJIS", + ] + ) + + /// IANA Character Set `EUC-JP` + static let eucJP = IANACharset( + preferredMIMEName: "EUC-JP", + name: "Extended_UNIX_Code_Packed_Format_for_Japanese", + aliases: [ + "csEUCPkdFmtJapanese", + "EUC-JP", + ] + ) + + /// IANA Character Set `ISO-2022-JP` + static let iso2022JP = IANACharset( + preferredMIMEName: "ISO-2022-JP", + name: "ISO-2022-JP", + aliases: [ + "csISO2022JP", + ] + ) + + /// IANA Character Set `UTF-8` + static let utf8 = IANACharset( + preferredMIMEName: nil, + name: "UTF-8", + aliases: [ + "csUTF8", + ] + ) + + /// IANA Character Set `UTF-16BE` + static let utf16BE = IANACharset( + preferredMIMEName: nil, + name: "UTF-16BE", + aliases: [ + "csUTF16BE", + ] + ) + + /// IANA Character Set `UTF-16LE` + static let utf16LE = IANACharset( + preferredMIMEName: nil, + name: "UTF-16LE", + aliases: [ + "csUTF16LE", + ] + ) + + /// IANA Character Set `UTF-16` + static let utf16 = IANACharset( + preferredMIMEName: nil, + name: "UTF-16", + aliases: [ + "csUTF16", + ] + ) + + /// IANA Character Set `UTF-32` + static let utf32 = IANACharset( + preferredMIMEName: nil, + name: "UTF-32", + aliases: [ + "csUTF32", + ] + ) + + /// IANA Character Set `UTF-32BE` + static let utf32BE = IANACharset( + preferredMIMEName: nil, + name: "UTF-32BE", + aliases: [ + "csUTF32BE", + ] + ) + + /// IANA Character Set `UTF-32LE` + static let utf32LE = IANACharset( + preferredMIMEName: nil, + name: "UTF-32LE", + aliases: [ + "csUTF32LE", + ] + ) + + /// IANA Character Set `macintosh` + static let macintosh = IANACharset( + preferredMIMEName: nil, + name: "macintosh", + aliases: [ + "mac", + "csMacintosh", + ] + ) + + /// IANA Character Set `windows-1250` + static let windows1250 = IANACharset( + preferredMIMEName: nil, + name: "windows-1250", + aliases: [ + "cswindows1250", + ] + ) + + /// IANA Character Set `windows-1251` + 
static let windows1251 = IANACharset( + preferredMIMEName: nil, + name: "windows-1251", + aliases: [ + "cswindows1251", + ] + ) + + /// IANA Character Set `windows-1252` + static let windows1252 = IANACharset( + preferredMIMEName: nil, + name: "windows-1252", + aliases: [ + "cswindows1252", + ] + ) + + /// IANA Character Set `windows-1253` + static let windows1253 = IANACharset( + preferredMIMEName: nil, + name: "windows-1253", + aliases: [ + "cswindows1253", + ] + ) + + /// IANA Character Set `windows-1254` + static let windows1254 = IANACharset( + preferredMIMEName: nil, + name: "windows-1254", + aliases: [ + "cswindows1254", + ] + ) +} + +// MARK: - `String.Encoding` Names + +extension String.Encoding { + private var _ianaCharset: IANACharset? { + switch self { + case .utf8: .utf8 + case .ascii: .usASCII + case .japaneseEUC: .eucJP + case .isoLatin1: .iso8859_1 + case .shiftJIS: .shiftJIS + case .isoLatin2: .iso8859_2 + case .unicode: .utf16 + case .windowsCP1251: .windows1251 + case .windowsCP1252: .windows1252 + case .windowsCP1253: .windows1253 + case .windowsCP1254: .windows1254 + case .windowsCP1250: .windows1250 + case .iso2022JP: .iso2022JP + case .macOSRoman: .macintosh + case .utf16BigEndian: .utf16BE + case .utf16LittleEndian: .utf16LE + case .utf32: .utf32 + case .utf32BigEndian: .utf32BE + case .utf32LittleEndian: .utf32LE + default: nil + } + } + + /// The name of this encoding that is compatible with the one of the IANA registry "charset". + @available(FoundationPreview 6.2, *) + public var ianaName: String? { + return _ianaCharset?.representativeName + } + + /// Creates an instance from the name of the IANA registry "charset". + @available(FoundationPreview 6.2, *) + public init?(ianaName charsetName: String) { + func __determineEncoding() -> String.Encoding? { + func __matches(_ charsets: IANACharset...) 
-> Bool { + assert(!charsets.isEmpty) + return charsets.contains { + $0.matches( + charsetName, + tokenizedBy: ASCIICaseInsensitiveTokenizer.self + ) + } + } + + return if __matches(.utf8) { + .utf8 + } else if __matches(.usASCII) { + .ascii + } else if __matches(.eucJP) { + .japaneseEUC + } else if __matches(.iso8859_1) { + .isoLatin1 + } else if __matches(.shiftJIS) { + .shiftJIS + } else if __matches(.iso8859_2) { + .isoLatin2 + } else if __matches(.utf16) { + .utf16 + } else if __matches(.windows1251) { + .windowsCP1251 + } else if __matches(.windows1252) { + .windowsCP1252 + } else if __matches(.windows1253) { + .windowsCP1253 + } else if __matches(.windows1254) { + .windowsCP1254 + } else if __matches(.windows1250) { + .windowsCP1250 + } else if __matches(.iso2022JP) { + .iso2022JP + } else if __matches(.macintosh) { + .macOSRoman + } else if __matches(.utf16BE) { + .utf16BigEndian + } else if __matches(.utf16LE) { + .utf16LittleEndian + } else if __matches(.utf32) { + .utf32 + } else if __matches(.utf32BE) { + .utf32BigEndian + } else if __matches(.utf32LE) { + .utf32LittleEndian + } else { + nil + } + } + + guard let encoding = __determineEncoding() else { + return nil + } + self = encoding + } +} + From f83add936a10224341849c34582a09ab0a102851 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Fri, 9 May 2025 16:27:55 +0900 Subject: [PATCH 2/7] Import tests for String Encoding Names from other repo. 
- source: https://github.com/YOCKOW/SF-StringEncodingNameImpl/blob/0.4.0/Tests/StringEncodingNameImplTests/StringEncodingNameParserTests.swift --- .../StringTests.swift | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/Tests/FoundationEssentialsTests/StringTests.swift b/Tests/FoundationEssentialsTests/StringTests.swift index 26286be15..a2305ff82 100644 --- a/Tests/FoundationEssentialsTests/StringTests.swift +++ b/Tests/FoundationEssentialsTests/StringTests.swift @@ -1397,6 +1397,69 @@ private struct StringTests { "abcd🎺efgh" ]) } + + func test_Encoding_names() { + // Encoding to Name + XCTAssertEqual(String._Encoding.ascii.ianaName, "US-ASCII") + XCTAssertEqual(String._Encoding.nextstep.ianaName, nil) + XCTAssertEqual(String._Encoding.japaneseEUC.ianaName, "EUC-JP") + XCTAssertEqual(String._Encoding.utf8.ianaName, "UTF-8") + XCTAssertEqual(String._Encoding.isoLatin1.ianaName, "ISO-8859-1") + XCTAssertEqual(String._Encoding.symbol.ianaName, nil) + XCTAssertEqual(String._Encoding.nonLossyASCII.ianaName, nil) + XCTAssertEqual(String._Encoding.shiftJIS.ianaName, "Shift_JIS") + XCTAssertEqual(String._Encoding.isoLatin2.ianaName, "ISO-8859-2") + XCTAssertEqual(String._Encoding.unicode.ianaName, "UTF-16") + XCTAssertEqual(String._Encoding.windowsCP1251.ianaName, "windows-1251") + XCTAssertEqual(String._Encoding.windowsCP1252.ianaName, "windows-1252") + XCTAssertEqual(String._Encoding.windowsCP1253.ianaName, "windows-1253") + XCTAssertEqual(String._Encoding.windowsCP1254.ianaName, "windows-1254") + XCTAssertEqual(String._Encoding.windowsCP1250.ianaName, "windows-1250") + XCTAssertEqual(String._Encoding.iso2022JP.ianaName, "ISO-2022-JP") + XCTAssertEqual(String._Encoding.macOSRoman.ianaName, "macintosh") + XCTAssertEqual(String._Encoding.utf16BigEndian.ianaName, "UTF-16BE") + XCTAssertEqual(String._Encoding.utf16LittleEndian.ianaName, "UTF-16LE") + XCTAssertEqual(String._Encoding.utf32.ianaName, "UTF-32") + 
XCTAssertEqual(String._Encoding.utf32BigEndian.ianaName, "UTF-32BE") + XCTAssertEqual(String._Encoding.utf32LittleEndian.ianaName, "UTF-32LE") + XCTAssertEqual(String._Encoding(rawValue: .max).ianaName, nil) + + // Name to Encoding + XCTAssertEqual(String._Encoding(ianaName: "us-ascii"), .ascii) + XCTAssertEqual(String._Encoding(ianaName: "iso-ir-2"), nil) + XCTAssertEqual(String._Encoding(ianaName: "x-nextstep"), nil) + XCTAssertEqual(String._Encoding(ianaName: "euc-jp"), .japaneseEUC) + XCTAssertEqual(String._Encoding(ianaName: "CP51932"), nil) + XCTAssertEqual(String._Encoding(ianaName: "utf-8"), .utf8) + XCTAssertEqual(String._Encoding(ianaName: "iso_8859-1"), .isoLatin1) + XCTAssertEqual(String._Encoding(ianaName: "x-mac-symbol"), nil) + XCTAssertEqual(String._Encoding(ianaName: "Adobe-symbol-encoding"), nil) + XCTAssertEqual(String._Encoding(ianaName: "cp932"), nil) + XCTAssertEqual(String._Encoding(ianaName: "shift_jis"), .shiftJIS) + XCTAssertEqual(String._Encoding(ianaName: "windows-31j"), nil) + XCTAssertEqual(String._Encoding(ianaName: "iso_8859-2"), .isoLatin2) + XCTAssertEqual(String._Encoding(ianaName: "utf-16"), .utf16) + XCTAssertEqual(String._Encoding(ianaName: "iso-10646-ucs-2"), nil) + XCTAssertEqual(String._Encoding(ianaName: "unicode-1-1"), nil) + XCTAssertEqual(String._Encoding(ianaName: "windows-1251"), .windowsCP1251) + XCTAssertEqual(String._Encoding(ianaName: "windows-1252"), .windowsCP1252) + XCTAssertEqual(String._Encoding(ianaName: "ISO-8859-1-Windows-3.0-Latin-1"), nil) + XCTAssertEqual(String._Encoding(ianaName: "ISO-8859-1-Windows-3.1-Latin-1"), nil) + XCTAssertEqual(String._Encoding(ianaName: "windows-1253"), .windowsCP1253) + XCTAssertEqual(String._Encoding(ianaName: "windows-1254"), .windowsCP1254) + XCTAssertEqual(String._Encoding(ianaName: "iso-8859-9-windows-Latin-5"), nil) + XCTAssertEqual(String._Encoding(ianaName: "windows-1250"), .windowsCP1250) + XCTAssertEqual(String._Encoding(ianaName: "iso-8859-2-windows-Latin-2"), nil) 
+ XCTAssertEqual(String._Encoding(ianaName: "iso-2022-jp"), .iso2022JP) + XCTAssertEqual(String._Encoding(ianaName: "macintosh"), .macOSRoman) + XCTAssertEqual(String._Encoding(ianaName: "utf-16be"), .utf16BigEndian) + XCTAssertEqual(String._Encoding(ianaName: "utf-16le"), .utf16LittleEndian) + XCTAssertEqual(String._Encoding(ianaName: "utf-32"), .utf32) + XCTAssertEqual(String._Encoding(ianaName: "iso-10646-ucs-4"), nil) + XCTAssertEqual(String._Encoding(ianaName: "utf-32be"), .utf32BigEndian) + XCTAssertEqual(String._Encoding(ianaName: "utf-32le"), .utf32LittleEndian) + XCTAssertEqual(String._Encoding(ianaName: "foo-bar-baz"), nil) + } } // MARK: - Helper functions From ef41c4752f8f2fe02de1d6c917b0a7dddad73625 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Tue, 17 Jun 2025 10:23:47 +0900 Subject: [PATCH 3/7] Remove dead code in terms of the current proposal. --- .../String/String+Encoding+Names.swift | 71 ------------------- 1 file changed, 71 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 07ca26c21..48c1c37ce 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -14,15 +14,6 @@ // MARK: - Private extensions for parsing encoding names private extension Unicode.Scalar { - var _isASCIINumeric: Bool { - return ("0"..."9").contains(self) - } - - var _asciiNumericValue: Int { - assert(_isASCIINumeric) - return Int(self.value - 0x30) - } - /// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace". /// /// Reference: https://infra.spec.whatwg.org/#ascii-whitespace @@ -99,68 +90,6 @@ extension StringEncodingNameTokenizer where Self: ~Copyable { } } -/// ICU-independent parser that follows [Charset Alias Matching](https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching). 
-private struct UTS22Tokenizer: StringEncodingNameTokenizer, ~Copyable { - enum Token: Equatable { - case numeric(Int) - case alphabet(ASCIICaseInsensitiveUnicodeScalar) - } - - enum Error: Swift.Error { - case tooLargeNumericValue - } - - let scalars: String.UnicodeScalarView - - private var _currentIndex: String.UnicodeScalarView.Index - - init(name: String) { - self.scalars = name.unicodeScalars - self._currentIndex = scalars.startIndex - } - - mutating func nextToken() throws -> Token? { - guard _currentIndex < scalars.endIndex else { - return nil - } - - let scalar = scalars[_currentIndex] - switch scalar { - case "0"..."9": - // Parse a numeric value ignoring leading zeros. - // - // NOTE: To prevent the value from overflow, a threhold is set here. - // The max number of digits to be expected is 8 as of now: i.g. `csISO42JISC62261978`. - // It wouldn't matter to throw an error in practice when the value is too large. - - let threshold: Int = 999_999_999 - var value = scalar._asciiNumericValue - scalars.formIndex(after: &_currentIndex) - while _currentIndex < scalars.endIndex { - let currentScalar = scalars[_currentIndex] - guard currentScalar._isASCIINumeric else { - break - } - value = value * 10 + currentScalar._asciiNumericValue - if value > threshold { - throw Error.tooLargeNumericValue - } - scalars.formIndex(after: &_currentIndex) - } - return .numeric(value) - case "A"..."Z", "a"..."z": - scalars.formIndex(after: &_currentIndex) - return .alphabet(ASCIICaseInsensitiveUnicodeScalar(scalar)) - default: - scalars.formIndex(after: &_currentIndex) - if _currentIndex < scalars.endIndex { - return try nextToken() - } - return nil - } - } -} - /// A parser that tokenizes a string into `ASCIICaseInsensitiveUnicodeScalar`s. 
private struct ASCIICaseInsensitiveTokenizer: StringEncodingNameTokenizer, ~Copyable { From 2ab3eefc20d7d4e89cfbf66b04634d14742d8305 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Sun, 21 Sep 2025 15:07:46 +0900 Subject: [PATCH 4/7] Use `Testing` for String Encoding Names tests. --- .../StringTests.swift | 116 +++++++++--------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/Tests/FoundationEssentialsTests/StringTests.swift b/Tests/FoundationEssentialsTests/StringTests.swift index a2305ff82..6a6781874 100644 --- a/Tests/FoundationEssentialsTests/StringTests.swift +++ b/Tests/FoundationEssentialsTests/StringTests.swift @@ -1398,67 +1398,67 @@ private struct StringTests { ]) } - func test_Encoding_names() { + @Test func encodingNames() { // Encoding to Name - XCTAssertEqual(String._Encoding.ascii.ianaName, "US-ASCII") - XCTAssertEqual(String._Encoding.nextstep.ianaName, nil) - XCTAssertEqual(String._Encoding.japaneseEUC.ianaName, "EUC-JP") - XCTAssertEqual(String._Encoding.utf8.ianaName, "UTF-8") - XCTAssertEqual(String._Encoding.isoLatin1.ianaName, "ISO-8859-1") - XCTAssertEqual(String._Encoding.symbol.ianaName, nil) - XCTAssertEqual(String._Encoding.nonLossyASCII.ianaName, nil) - XCTAssertEqual(String._Encoding.shiftJIS.ianaName, "Shift_JIS") - XCTAssertEqual(String._Encoding.isoLatin2.ianaName, "ISO-8859-2") - XCTAssertEqual(String._Encoding.unicode.ianaName, "UTF-16") - XCTAssertEqual(String._Encoding.windowsCP1251.ianaName, "windows-1251") - XCTAssertEqual(String._Encoding.windowsCP1252.ianaName, "windows-1252") - XCTAssertEqual(String._Encoding.windowsCP1253.ianaName, "windows-1253") - XCTAssertEqual(String._Encoding.windowsCP1254.ianaName, "windows-1254") - XCTAssertEqual(String._Encoding.windowsCP1250.ianaName, "windows-1250") - XCTAssertEqual(String._Encoding.iso2022JP.ianaName, "ISO-2022-JP") - XCTAssertEqual(String._Encoding.macOSRoman.ianaName, "macintosh") - XCTAssertEqual(String._Encoding.utf16BigEndian.ianaName, "UTF-16BE") - 
XCTAssertEqual(String._Encoding.utf16LittleEndian.ianaName, "UTF-16LE") - XCTAssertEqual(String._Encoding.utf32.ianaName, "UTF-32") - XCTAssertEqual(String._Encoding.utf32BigEndian.ianaName, "UTF-32BE") - XCTAssertEqual(String._Encoding.utf32LittleEndian.ianaName, "UTF-32LE") - XCTAssertEqual(String._Encoding(rawValue: .max).ianaName, nil) + #expect(String.Encoding.ascii.ianaName == "US-ASCII") + #expect(String.Encoding.nextstep.ianaName == nil) + #expect(String.Encoding.japaneseEUC.ianaName == "EUC-JP") + #expect(String.Encoding.utf8.ianaName == "UTF-8") + #expect(String.Encoding.isoLatin1.ianaName == "ISO-8859-1") + #expect(String.Encoding.symbol.ianaName == nil) + #expect(String.Encoding.nonLossyASCII.ianaName == nil) + #expect(String.Encoding.shiftJIS.ianaName == "Shift_JIS") + #expect(String.Encoding.isoLatin2.ianaName == "ISO-8859-2") + #expect(String.Encoding.unicode.ianaName == "UTF-16") + #expect(String.Encoding.windowsCP1251.ianaName == "windows-1251") + #expect(String.Encoding.windowsCP1252.ianaName == "windows-1252") + #expect(String.Encoding.windowsCP1253.ianaName == "windows-1253") + #expect(String.Encoding.windowsCP1254.ianaName == "windows-1254") + #expect(String.Encoding.windowsCP1250.ianaName == "windows-1250") + #expect(String.Encoding.iso2022JP.ianaName == "ISO-2022-JP") + #expect(String.Encoding.macOSRoman.ianaName == "macintosh") + #expect(String.Encoding.utf16BigEndian.ianaName == "UTF-16BE") + #expect(String.Encoding.utf16LittleEndian.ianaName == "UTF-16LE") + #expect(String.Encoding.utf32.ianaName == "UTF-32") + #expect(String.Encoding.utf32BigEndian.ianaName == "UTF-32BE") + #expect(String.Encoding.utf32LittleEndian.ianaName == "UTF-32LE") + #expect(String.Encoding(rawValue: .max).ianaName == nil) // Name to Encoding - XCTAssertEqual(String._Encoding(ianaName: "us-ascii"), .ascii) - XCTAssertEqual(String._Encoding(ianaName: "iso-ir-2"), nil) - XCTAssertEqual(String._Encoding(ianaName: "x-nextstep"), nil) - 
XCTAssertEqual(String._Encoding(ianaName: "euc-jp"), .japaneseEUC) - XCTAssertEqual(String._Encoding(ianaName: "CP51932"), nil) - XCTAssertEqual(String._Encoding(ianaName: "utf-8"), .utf8) - XCTAssertEqual(String._Encoding(ianaName: "iso_8859-1"), .isoLatin1) - XCTAssertEqual(String._Encoding(ianaName: "x-mac-symbol"), nil) - XCTAssertEqual(String._Encoding(ianaName: "Adobe-symbol-encoding"), nil) - XCTAssertEqual(String._Encoding(ianaName: "cp932"), nil) - XCTAssertEqual(String._Encoding(ianaName: "shift_jis"), .shiftJIS) - XCTAssertEqual(String._Encoding(ianaName: "windows-31j"), nil) - XCTAssertEqual(String._Encoding(ianaName: "iso_8859-2"), .isoLatin2) - XCTAssertEqual(String._Encoding(ianaName: "utf-16"), .utf16) - XCTAssertEqual(String._Encoding(ianaName: "iso-10646-ucs-2"), nil) - XCTAssertEqual(String._Encoding(ianaName: "unicode-1-1"), nil) - XCTAssertEqual(String._Encoding(ianaName: "windows-1251"), .windowsCP1251) - XCTAssertEqual(String._Encoding(ianaName: "windows-1252"), .windowsCP1252) - XCTAssertEqual(String._Encoding(ianaName: "ISO-8859-1-Windows-3.0-Latin-1"), nil) - XCTAssertEqual(String._Encoding(ianaName: "ISO-8859-1-Windows-3.1-Latin-1"), nil) - XCTAssertEqual(String._Encoding(ianaName: "windows-1253"), .windowsCP1253) - XCTAssertEqual(String._Encoding(ianaName: "windows-1254"), .windowsCP1254) - XCTAssertEqual(String._Encoding(ianaName: "iso-8859-9-windows-Latin-5"), nil) - XCTAssertEqual(String._Encoding(ianaName: "windows-1250"), .windowsCP1250) - XCTAssertEqual(String._Encoding(ianaName: "iso-8859-2-windows-Latin-2"), nil) - XCTAssertEqual(String._Encoding(ianaName: "iso-2022-jp"), .iso2022JP) - XCTAssertEqual(String._Encoding(ianaName: "macintosh"), .macOSRoman) - XCTAssertEqual(String._Encoding(ianaName: "utf-16be"), .utf16BigEndian) - XCTAssertEqual(String._Encoding(ianaName: "utf-16le"), .utf16LittleEndian) - XCTAssertEqual(String._Encoding(ianaName: "utf-32"), .utf32) - XCTAssertEqual(String._Encoding(ianaName: "iso-10646-ucs-4"), 
nil) - XCTAssertEqual(String._Encoding(ianaName: "utf-32be"), .utf32BigEndian) - XCTAssertEqual(String._Encoding(ianaName: "utf-32le"), .utf32LittleEndian) - XCTAssertEqual(String._Encoding(ianaName: "foo-bar-baz"), nil) + #expect(String.Encoding(ianaName: "us-ascii") == .ascii) + #expect(String.Encoding(ianaName: "iso-ir-2") == nil) + #expect(String.Encoding(ianaName: "x-nextstep") == nil) + #expect(String.Encoding(ianaName: "euc-jp") == .japaneseEUC) + #expect(String.Encoding(ianaName: "CP51932") == nil) + #expect(String.Encoding(ianaName: "utf-8") == .utf8) + #expect(String.Encoding(ianaName: "iso_8859-1") == .isoLatin1) + #expect(String.Encoding(ianaName: "x-mac-symbol") == nil) + #expect(String.Encoding(ianaName: "Adobe-symbol-encoding") == nil) + #expect(String.Encoding(ianaName: "cp932") == nil) + #expect(String.Encoding(ianaName: "shift_jis") == .shiftJIS) + #expect(String.Encoding(ianaName: "windows-31j") == nil) + #expect(String.Encoding(ianaName: "iso_8859-2") == .isoLatin2) + #expect(String.Encoding(ianaName: "utf-16") == .utf16) + #expect(String.Encoding(ianaName: "iso-10646-ucs-2") == nil) + #expect(String.Encoding(ianaName: "unicode-1-1") == nil) + #expect(String.Encoding(ianaName: "windows-1251") == .windowsCP1251) + #expect(String.Encoding(ianaName: "windows-1252") == .windowsCP1252) + #expect(String.Encoding(ianaName: "ISO-8859-1-Windows-3.0-Latin-1") == nil) + #expect(String.Encoding(ianaName: "ISO-8859-1-Windows-3.1-Latin-1") == nil) + #expect(String.Encoding(ianaName: "windows-1253") == .windowsCP1253) + #expect(String.Encoding(ianaName: "windows-1254") == .windowsCP1254) + #expect(String.Encoding(ianaName: "iso-8859-9-windows-Latin-5") == nil) + #expect(String.Encoding(ianaName: "windows-1250") == .windowsCP1250) + #expect(String.Encoding(ianaName: "iso-8859-2-windows-Latin-2") == nil) + #expect(String.Encoding(ianaName: "iso-2022-jp") == .iso2022JP) + #expect(String.Encoding(ianaName: "macintosh") == .macOSRoman) + 
#expect(String.Encoding(ianaName: "utf-16be") == .utf16BigEndian) + #expect(String.Encoding(ianaName: "utf-16le") == .utf16LittleEndian) + #expect(String.Encoding(ianaName: "utf-32") == .utf32) + #expect(String.Encoding(ianaName: "iso-10646-ucs-4") == nil) + #expect(String.Encoding(ianaName: "utf-32be") == .utf32BigEndian) + #expect(String.Encoding(ianaName: "utf-32le") == .utf32LittleEndian) + #expect(String.Encoding(ianaName: "foo-bar-baz") == nil) } } From bdc56a9166528edf18ef429029b08b0f1c494f35 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Sun, 21 Sep 2025 15:18:15 +0900 Subject: [PATCH 5/7] NFC: Fix indentation in "String+Encoding+Names.swift". --- .../String/String+Encoding+Names.swift | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 48c1c37ce..8c5b76532 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -14,15 +14,15 @@ // MARK: - Private extensions for parsing encoding names private extension Unicode.Scalar { - /// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace". - /// - /// Reference: https://infra.spec.whatwg.org/#ascii-whitespace - var _isASCIIWhitespace: Bool { - switch self.value { - case 0x09, 0x0A, 0x0C, 0x0D, 0x20: true - default: false + /// Returns the Boolean value that indicates whether or not `self` is "ASCII whitespace". 
+ /// + /// Reference: https://infra.spec.whatwg.org/#ascii-whitespace + var _isASCIIWhitespace: Bool { + switch self.value { + case 0x09, 0x0A, 0x0C, 0x0D, 0x20: true + default: false + } } - } } private extension String { @@ -95,9 +95,9 @@ extension StringEncodingNameTokenizer where Self: ~Copyable { private struct ASCIICaseInsensitiveTokenizer: StringEncodingNameTokenizer, ~Copyable { typealias Token = ASCIICaseInsensitiveUnicodeScalar - enum Error: Swift.Error { - case nonASCII - } + enum Error: Swift.Error { + case nonASCII + } let scalars: Substring.UnicodeScalarView From 7598ef9ef8383b5452be6808fc7d72adb8f07888 Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Sun, 21 Sep 2025 16:19:32 +0900 Subject: [PATCH 6/7] SF-0033: Adjust comments/attributes to match the accepted proposal. --- .../String/String+Encoding+Names.swift | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 8c5b76532..12f6466de 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -409,13 +409,17 @@ extension String.Encoding { } /// The name of this encoding that is compatible with the one of the IANA registry "charset". - @available(FoundationPreview 6.2, *) + @available(FoundationPreview 6.3, *) public var ianaName: String? { return _ianaCharset?.representativeName } /// Creates an instance from the name of the IANA registry "charset". - @available(FoundationPreview 6.2, *) + /// + /// - Note: The given name is compared to each IANA "charset" name + /// with ASCII case-insensitive collation + /// to determine which encoding is suitable. + @available(FoundationPreview 6.3, *) public init?(ianaName charsetName: String) { func __determineEncoding() -> String.Encoding? { func __matches(_ charsets: IANACharset...) 
-> Bool { From 9cd5c9acaadc771a026690ec39f48482a987360c Mon Sep 17 00:00:00 2001 From: YOCKOW Date: Sun, 12 Oct 2025 15:02:50 +0900 Subject: [PATCH 7/7] Auto-generate Swift source code for IANA Charset names. --- .../String/IANACharsetNames.swift | 213 ++++++++++++++++++ .../String/String+Encoding+Names.swift | 201 +---------------- utils/update-iana-charset-names | 62 +++++ utils/update-iana-charset-names-impl.py | 174 ++++++++++++++ 4 files changed, 451 insertions(+), 199 deletions(-) create mode 100644 Sources/FoundationEssentials/String/IANACharsetNames.swift create mode 100755 utils/update-iana-charset-names create mode 100644 utils/update-iana-charset-names-impl.py diff --git a/Sources/FoundationEssentials/String/IANACharsetNames.swift b/Sources/FoundationEssentials/String/IANACharsetNames.swift new file mode 100644 index 000000000..8f3e88f09 --- /dev/null +++ b/Sources/FoundationEssentials/String/IANACharsetNames.swift @@ -0,0 +1,213 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// + + +// WARNING: DO NOT EDIT THIS FILE DIRECTLY. +// This is auto-generated by `update-iana-charset-names`. + + +extension IANACharset { + /// IANA Charset `US-ASCII`. + static let usASCII = IANACharset( + preferredMIMEName: "US-ASCII", + name: "US-ASCII", + aliases: [ + "iso-ir-6", + "ANSI_X3.4-1968", + "ANSI_X3.4-1986", + "ISO_646.irv:1991", + "ISO646-US", + "US-ASCII", + "us", + "IBM367", + "cp367", + "csASCII", + ] + ) + + /// IANA Charset `ISO-8859-1`. 
+ static let iso8859_1 = IANACharset( + preferredMIMEName: "ISO-8859-1", + name: "ISO_8859-1:1987", + aliases: [ + "iso-ir-100", + "ISO_8859-1", + "ISO-8859-1", + "latin1", + "l1", + "IBM819", + "CP819", + "csISOLatin1", + ] + ) + + /// IANA Charset `ISO-8859-2`. + static let iso8859_2 = IANACharset( + preferredMIMEName: "ISO-8859-2", + name: "ISO_8859-2:1987", + aliases: [ + "iso-ir-101", + "ISO_8859-2", + "ISO-8859-2", + "latin2", + "l2", + "csISOLatin2", + ] + ) + + /// IANA Charset `Shift_JIS`. + static let shiftJIS = IANACharset( + preferredMIMEName: "Shift_JIS", + name: "Shift_JIS", + aliases: [ + "MS_Kanji", + "csShiftJIS", + ] + ) + + /// IANA Charset `EUC-JP`. + static let eucJP = IANACharset( + preferredMIMEName: "EUC-JP", + name: "Extended_UNIX_Code_Packed_Format_for_Japanese", + aliases: [ + "csEUCPkdFmtJapanese", + "EUC-JP", + ] + ) + + /// IANA Charset `ISO-2022-JP`. + static let iso2022JP = IANACharset( + preferredMIMEName: "ISO-2022-JP", + name: "ISO-2022-JP", + aliases: [ + "csISO2022JP", + ] + ) + + /// IANA Charset `UTF-8`. + static let utf8 = IANACharset( + preferredMIMEName: nil, + name: "UTF-8", + aliases: [ + "csUTF8", + ] + ) + + /// IANA Charset `UTF-16BE`. + static let utf16BE = IANACharset( + preferredMIMEName: nil, + name: "UTF-16BE", + aliases: [ + "csUTF16BE", + ] + ) + + /// IANA Charset `UTF-16LE`. + static let utf16LE = IANACharset( + preferredMIMEName: nil, + name: "UTF-16LE", + aliases: [ + "csUTF16LE", + ] + ) + + /// IANA Charset `UTF-16`. + static let utf16 = IANACharset( + preferredMIMEName: nil, + name: "UTF-16", + aliases: [ + "csUTF16", + ] + ) + + /// IANA Charset `UTF-32`. + static let utf32 = IANACharset( + preferredMIMEName: nil, + name: "UTF-32", + aliases: [ + "csUTF32", + ] + ) + + /// IANA Charset `UTF-32BE`. + static let utf32BE = IANACharset( + preferredMIMEName: nil, + name: "UTF-32BE", + aliases: [ + "csUTF32BE", + ] + ) + + /// IANA Charset `UTF-32LE`. 
+ static let utf32LE = IANACharset( + preferredMIMEName: nil, + name: "UTF-32LE", + aliases: [ + "csUTF32LE", + ] + ) + + /// IANA Charset `macintosh`. + static let macintosh = IANACharset( + preferredMIMEName: nil, + name: "macintosh", + aliases: [ + "mac", + "csMacintosh", + ] + ) + + /// IANA Charset `windows-1250`. + static let windows1250 = IANACharset( + preferredMIMEName: nil, + name: "windows-1250", + aliases: [ + "cswindows1250", + ] + ) + + /// IANA Charset `windows-1251`. + static let windows1251 = IANACharset( + preferredMIMEName: nil, + name: "windows-1251", + aliases: [ + "cswindows1251", + ] + ) + + /// IANA Charset `windows-1252`. + static let windows1252 = IANACharset( + preferredMIMEName: nil, + name: "windows-1252", + aliases: [ + "cswindows1252", + ] + ) + + /// IANA Charset `windows-1253`. + static let windows1253 = IANACharset( + preferredMIMEName: nil, + name: "windows-1253", + aliases: [ + "cswindows1253", + ] + ) + + /// IANA Charset `windows-1254`. + static let windows1254 = IANACharset( + preferredMIMEName: nil, + name: "windows-1254", + aliases: [ + "cswindows1254", + ] + ) +} diff --git a/Sources/FoundationEssentials/String/String+Encoding+Names.swift b/Sources/FoundationEssentials/String/String+Encoding+Names.swift index 12f6466de..ba2cc32ef 100644 --- a/Sources/FoundationEssentials/String/String+Encoding+Names.swift +++ b/Sources/FoundationEssentials/String/String+Encoding+Names.swift @@ -72,7 +72,7 @@ private struct ASCIICaseInsensitiveUnicodeScalar: Equatable, } /// A type to tokenize string for `String.Encoding` names. -private protocol StringEncodingNameTokenizer: ~Copyable { +internal protocol StringEncodingNameTokenizer: ~Copyable { associatedtype Token: Equatable init(name: String) mutating func nextToken() throws -> Token? @@ -142,7 +142,7 @@ private extension String { // MARK: - IANA Charset Names /// Info about IANA Charset. 
-private struct IANACharset { +internal struct IANACharset { /// Preferred MIME Name let preferredMIMEName: String? @@ -182,203 +182,6 @@ private struct IANACharset { } } -// Extracted only necessary charsets from https://www.iana.org/assignments/character-sets/character-sets.xhtml -extension IANACharset { - /// IANA Characater Set `US-ASCII` - static let usASCII = IANACharset( - preferredMIMEName: "US-ASCII", - name: "US-ASCII", - aliases: [ - "iso-ir-6", - "ANSI_X3.4-1968", - "ANSI_X3.4-1986", - "ISO_646.irv:1991", - "ISO646-US", - "US-ASCII", - "us", - "IBM367", - "cp367", - "csASCII", - ] - ) - - /// IANA Characater Set `ISO-8859-1` - static let iso8859_1 = IANACharset( - preferredMIMEName: "ISO-8859-1", - name: "ISO_8859-1:1987", - aliases: [ - "iso-ir-100", - "ISO_8859-1", - "ISO-8859-1", - "latin1", - "l1", - "IBM819", - "CP819", - "csISOLatin1", - ] - ) - - /// IANA Characater Set `ISO-8859-2` - static let iso8859_2 = IANACharset( - preferredMIMEName: "ISO-8859-2", - name: "ISO_8859-2:1987", - aliases: [ - "iso-ir-101", - "ISO_8859-2", - "ISO-8859-2", - "latin2", - "l2", - "csISOLatin2", - ] - ) - - /// IANA Characater Set `Shift_JIS` - static let shiftJIS = IANACharset( - preferredMIMEName: "Shift_JIS", - name: "Shift_JIS", - aliases: [ - "MS_Kanji", - "csShiftJIS", - ] - ) - - /// IANA Characater Set `EUC-JP` - static let eucJP = IANACharset( - preferredMIMEName: "EUC-JP", - name: "Extended_UNIX_Code_Packed_Format_for_Japanese", - aliases: [ - "csEUCPkdFmtJapanese", - "EUC-JP", - ] - ) - - /// IANA Characater Set `ISO-2022-JP` - static let iso2022JP = IANACharset( - preferredMIMEName: "ISO-2022-JP", - name: "ISO-2022-JP", - aliases: [ - "csISO2022JP", - ] - ) - - /// IANA Characater Set `UTF-8` - static let utf8 = IANACharset( - preferredMIMEName: nil, - name: "UTF-8", - aliases: [ - "csUTF8", - ] - ) - - /// IANA Characater Set `UTF-16BE` - static let utf16BE = IANACharset( - preferredMIMEName: nil, - name: "UTF-16BE", - aliases: [ - "csUTF16BE", - ] - ) 
- - /// IANA Characater Set `UTF-16LE` - static let utf16LE = IANACharset( - preferredMIMEName: nil, - name: "UTF-16LE", - aliases: [ - "csUTF16LE", - ] - ) - - /// IANA Characater Set `UTF-16` - static let utf16 = IANACharset( - preferredMIMEName: nil, - name: "UTF-16", - aliases: [ - "csUTF16", - ] - ) - - /// IANA Characater Set `UTF-32` - static let utf32 = IANACharset( - preferredMIMEName: nil, - name: "UTF-32", - aliases: [ - "csUTF32", - ] - ) - - /// IANA Characater Set `UTF-32BE` - static let utf32BE = IANACharset( - preferredMIMEName: nil, - name: "UTF-32BE", - aliases: [ - "csUTF32BE", - ] - ) - - /// IANA Characater Set `UTF-32LE` - static let utf32LE = IANACharset( - preferredMIMEName: nil, - name: "UTF-32LE", - aliases: [ - "csUTF32LE", - ] - ) - - /// IANA Characater Set `macintosh` - static let macintosh = IANACharset( - preferredMIMEName: nil, - name: "macintosh", - aliases: [ - "mac", - "csMacintosh", - ] - ) - - /// IANA Characater Set `windows-1250` - static let windows1250 = IANACharset( - preferredMIMEName: nil, - name: "windows-1250", - aliases: [ - "cswindows1250", - ] - ) - - /// IANA Characater Set `windows-1251` - static let windows1251 = IANACharset( - preferredMIMEName: nil, - name: "windows-1251", - aliases: [ - "cswindows1251", - ] - ) - - /// IANA Characater Set `windows-1252` - static let windows1252 = IANACharset( - preferredMIMEName: nil, - name: "windows-1252", - aliases: [ - "cswindows1252", - ] - ) - - /// IANA Characater Set `windows-1253` - static let windows1253 = IANACharset( - preferredMIMEName: nil, - name: "windows-1253", - aliases: [ - "cswindows1253", - ] - ) - - /// IANA Characater Set `windows-1254` - static let windows1254 = IANACharset( - preferredMIMEName: nil, - name: "windows-1254", - aliases: [ - "cswindows1254", - ] - ) -} // MARK: - `String.Encoding` Names diff --git a/utils/update-iana-charset-names b/utils/update-iana-charset-names new file mode 100755 index 000000000..23d9a2ef8 --- /dev/null +++ 
b/utils/update-iana-charset-names @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +##===----------------------------------------------------------------------===## +## +## This source file is part of the Swift.org open source project +## +## Copyright (c) 2025 Apple Inc. and the Swift project authors +## Licensed under Apache License v2.0 with Runtime Library Exception +## +## See https://swift.org/LICENSE.txt for license information +## See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +## +##===----------------------------------------------------------------------===## + +# This is a shell script that generates a Swift source code file which contains +# the list of IANA "Character Sets". + +set -eu + +declare -r commandName="$(basename "$0")" +declare -r utilsDir="$(cd "$(dirname "$0")" && pwd)" +declare -r foundationRepoDir="$(cd "${utilsDir}/.." && pwd)" +declare -r targetSwiftFileRelativePath="Sources/FoundationEssentials/String/IANACharsetNames.swift" + +declare -r copyrightYear=$( + currentYear=$(date +%Y) + if [[ $currentYear -eq 2025 ]]; then + echo 2025 + else + echo 2025-${currentYear} + fi +) +declare -r swiftLicenseHeader=" +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) ${copyrightYear} Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +// +//===----------------------------------------------------------------------===// +" +declare -r warningComment=" +// WARNING: DO NOT EDIT THIS FILE DIRECTLY. +// This is auto-generated by \`${commandName}\`. + +" + +echo "Generating Swift source code..." 
1>&2 +declare generatedCode +generatedCode=$( + echo "${swiftLicenseHeader##$'\n'}" + echo "$warningComment" + python3 "${utilsDir}/${commandName}-impl.py" +) + +echo "Writing the code to '${targetSwiftFileRelativePath}'..." 1>&2 +echo "$generatedCode" >"${foundationRepoDir}/${targetSwiftFileRelativePath}" + +echo "Done." 1>&2 diff --git a/utils/update-iana-charset-names-impl.py b/utils/update-iana-charset-names-impl.py new file mode 100644 index 000000000..b323cf206 --- /dev/null +++ b/utils/update-iana-charset-names-impl.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +##===----------------------------------------------------------------------===## +## +## This source file is part of the Swift.org open source project +## +## Copyright (c) 2025 Apple Inc. and the Swift project authors +## Licensed under Apache License v2.0 with Runtime Library Exception +## +## See https://swift.org/LICENSE.txt for license information +## See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors +## +##===----------------------------------------------------------------------===## + +""" +This is a Python script that converts an XML file containing the list of IANA +"Character Sets" to Swift source code. +This script generates minimal code and is intended to be executed by a shell
+""" + +import re +import urllib.request as request +import xml.etree.ElementTree as ElemTree +from typing import List, Optional + +REQUIRED_CHARSET_NAMES: List[str] = [ + "UTF-8", + "US-ASCII", + "EUC-JP", + "ISO-8859-1", + "Shift_JIS", + "ISO-8859-2", + "UTF-16", + "windows-1251", + "windows-1252", + "windows-1253", + "windows-1254", + "windows-1250", + "ISO-2022-JP", + "macintosh", + "UTF-16BE", + "UTF-16LE", + "UTF-32", + "UTF-32BE", + "UTF-32LE", +] +CHARSETS_XML_URL = "https://www.iana.org/assignments/character-sets/character-sets.xml" +CHARSETS_XML_STRING = request.urlopen(request.Request(CHARSETS_XML_URL)).read() +CHARSETS_XML_ROOT = ElemTree.fromstring(CHARSETS_XML_STRING) +CHARSETS_XML_NS = "http://www.iana.org/assignments" +CHARSETS_XML_RECORD_ELEMENTS = CHARSETS_XML_ROOT.findall( + "./{%s}registry/{%s}record" % (CHARSETS_XML_NS, CHARSETS_XML_NS) +) +SWIFT_CODE_INDENT = " " + + +class IANACharsetNameRecord: + """Representation of a <record> element in 'character-sets.xml' + + The structure of a <record> element is as below: + <record> + <name>US-ASCII</name> + <xref type="rfc" data="rfc2046"/> + <value>3</value> + <description>ANSI X3.4-1986</description> + <alias>iso-ir-6</alias> + <alias>ANSI_X3.4-1968</alias> + <alias>ANSI_X3.4-1986</alias> + <alias>ISO_646.irv:1991</alias> + <alias>ISO646-US</alias> + <alias>US-ASCII</alias> + <alias>us</alias> + <alias>IBM367</alias> + <alias>cp367</alias> + <alias>csASCII</alias> + <preferred_alias>US-ASCII</preferred_alias> + </record> + """ + + def __init__(self, recordElem: ElemTree.Element): + self._name: str = recordElem.find('./{%s}name' % (CHARSETS_XML_NS)).text + self._preferredMIMEName: Optional[str] = getattr( + recordElem.find('./{%s}preferred_alias' % (CHARSETS_XML_NS)), + 'text', + None + ) + self._aliases: List[str] = list(map( + lambda aliasElem: aliasElem.text, + recordElem.findall('./{%s}alias' % (CHARSETS_XML_NS)) + )) + self._camelCasedName = None + + @property + def name(self) -> str: + return self._name + + @property + def preferredMIMEName(self) -> Optional[str]: + return self._preferredMIMEName + + @property + def representativeName(self) -> str: + return self.preferredMIMEName or self.name + + @property + def aliases(self) -> List[str]: + return self._aliases + + @property + def camelCasedName(self)
-> str: + if (self._camelCasedName is not None): + return self._camelCasedName + + camelCasedName = "" + previousWord = None + for word in re.split(r"[^0-9A-Za-z]", self.representativeName): + if previousWord is None: + camelCasedName = word.lower() + else: + if re.search(r"[0-9]$", previousWord) and re.search(r"^[0-9]", word): + camelCasedName += "_" + + if (re.fullmatch("[0-9]*[A-Z]+", word)): + camelCasedName += word + else: + camelCasedName += word.capitalize() + + previousWord = word + + self._camelCasedName = camelCasedName + return camelCasedName + + @property + def swiftCodeLines(self) -> List[str]: + def __stringLiteralOrNil(string: Optional[str]) -> str: + if (string is None): + return 'nil' + return f'"{string}"' + + lines: List[str] = [] + lines.append(f"/// IANA Charset `{self.representativeName}`.") + lines.append(f"static let {self.camelCasedName} = IANACharset(") + preferredMIMENameLiteral = __stringLiteralOrNil(self.preferredMIMEName) + lines.append( # multi-line f-string fields need Python 3.12+ + f"{SWIFT_CODE_INDENT}preferredMIMEName: {preferredMIMENameLiteral},") + lines.append(f'{SWIFT_CODE_INDENT}name: "{self.name}",') + lines.append(f"{SWIFT_CODE_INDENT}aliases: [") + for alias in self.aliases: + lines.append(f"{SWIFT_CODE_INDENT * 2}\"{alias}\",") + lines.append(f"{SWIFT_CODE_INDENT}]") + lines.append(")") + return lines + + +def generateSwiftCode() -> str: + result = "extension IANACharset {" + for record in map( + lambda recordElem: IANACharsetNameRecord(recordElem), + CHARSETS_XML_RECORD_ELEMENTS + ): + if (record.representativeName not in REQUIRED_CHARSET_NAMES): + continue + result += "\n" + result += "\n".join(map( + lambda line: SWIFT_CODE_INDENT + line, + record.swiftCodeLines + )) + result += "\n" + result += "}\n" + return result + + +if __name__ == "__main__": + print(generateSwiftCode())