From e671dfebb074b35401181ad41d3086cc7875bd29 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Fri, 9 Nov 2018 15:30:58 -0800 Subject: [PATCH 1/5] Initial commit for System.Text.Rune --- .../Resources/Strings.resx | 3 + .../System.Private.CoreLib.Shared.projitems | 3 + .../shared/System/Text/Rune.cs | 724 ++++++++++++++++++ .../shared/System/Text/UnicodeDebug.cs | 53 ++ .../shared/System/Text/UnicodeUtility.cs | 180 +++++ .../src/System/ThrowHelper.cs | 1 + 6 files changed, 964 insertions(+) create mode 100644 src/System.Private.CoreLib/shared/System/Text/Rune.cs create mode 100644 src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs create mode 100644 src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs diff --git a/src/System.Private.CoreLib/Resources/Strings.resx b/src/System.Private.CoreLib/Resources/Strings.resx index 42dcde3e2fe1..4f17a289c846 100644 --- a/src/System.Private.CoreLib/Resources/Strings.resx +++ b/src/System.Private.CoreLib/Resources/Strings.resx @@ -3652,6 +3652,9 @@ Method has been already defined. + + Cannot extract a Unicode scalar value from the specified index in the input. + Characters following the format symbol must be a number of {0} or less. diff --git a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems index 79956c4342b0..8768d19223d4 100644 --- a/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems +++ b/src/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems @@ -651,9 +651,12 @@ + + + diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs new file mode 100644 index 000000000000..7857c7f624fe --- /dev/null +++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs @@ -0,0 +1,724 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Diagnostics; +using System.Globalization; +using System.Runtime.CompilerServices; + +namespace System.Text +{ + /// + /// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive). + /// + /// + /// This type's constructors and conversion operators validate the input, so consumers can call the APIs + /// assuming that the underlying instance is well-formed. + /// + [DebuggerDisplay("{DebuggerDisplay,nq}")] + public readonly struct Rune : IComparable, IEquatable + { + private const byte IS_WHITESPACE_FLAG = 0x80; + private const byte IS_LETTER_OR_DIGIT_FLAG = 0x40; + private const byte UNICODECATEGORY_MASK = 0x1F; + + // Contains information about the ASCII character range [ U+0000..U+007F ], with: + // - 0x80 bit if set means 'is whitespace' + // - 0x40 bit if set means 'is letter or digit' + // - 0x20 bit is reserved for future use + // - bottom 5 bits are the UnicodeCategory of the character + private static ReadOnlySpan AsciiCharInfo => new byte[] + { + 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E, + 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, + 0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18, + 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18, + 0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12, + 0x1B, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, + 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E + }; + + /// + /// Creates a from the provided 
UTF-16 code unit. + /// + /// + /// If represents a UTF-16 surrogate code point + /// U+D800..U+DFFF, inclusive. + /// + public Rune(char ch) + { + uint expanded = ch; + if (UnicodeUtility.IsSurrogateCodePoint(expanded)) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch); + } + ValueUnsigned = expanded; + } + + /// + /// Creates a from the provided Unicode scalar value. + /// + /// + /// If does not represent a valid Unicode scalar value. + /// + public Rune(int value) + : this((uint)value) + { + } + + /// + /// Creates a from the provided Unicode scalar value. + /// + /// + /// If does not represent a valid Unicode scalar value. + /// + [CLSCompliant(false)] + public Rune(uint value) + { + if (!UnicodeUtility.IsValidUnicodeScalar(value)) + { + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value); + } + ValueUnsigned = value; + } + + // non-validating ctor + private Rune(uint scalarValue, bool unused) + { + UnicodeDebug.AssertIsValidScalar(scalarValue); + ValueUnsigned = scalarValue; + } + + public static bool operator ==(Rune left, Rune right) => (left.ValueUnsigned == right.ValueUnsigned); + + public static bool operator !=(Rune left, Rune right) => (left.ValueUnsigned != right.ValueUnsigned); + + public static bool operator <(Rune left, Rune right) => (left.ValueUnsigned < right.ValueUnsigned); + + public static bool operator <=(Rune left, Rune right) => (left.ValueUnsigned <= right.ValueUnsigned); + + public static bool operator >(Rune left, Rune right) => (left.ValueUnsigned > right.ValueUnsigned); + + public static bool operator >=(Rune left, Rune right) => (left.ValueUnsigned >= right.ValueUnsigned); + + // Operators below are explicit because they may throw.
+ + public static explicit operator Rune(char ch) => new Rune(ch); + + [CLSCompliant(false)] + public static explicit operator Rune(uint value) => new Rune(value); + + public static explicit operator Rune(int value) => new Rune(value); + + // Displayed as "'' (U+XXXX)"; e.g., "'e' (U+0065)" + private string DebuggerDisplay => FormattableString.Invariant($"'{(IsValid(ValueUnsigned) ? ToString() : "\uFFFD")}' (U+{ValueUnsigned:X4})"); + + /// + /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ]) + /// and therefore representable by a single UTF-8 code unit. + /// + public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(ValueUnsigned); + + /// + /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ]) + /// and therefore representable by a single UTF-16 code unit. + /// + public bool IsBmp => UnicodeUtility.IsBmpCodePoint(ValueUnsigned); + + /// + /// Returns the Unicode plane (0 to 16, inclusive) which contains this scalar. + /// + public int Plane => UnicodeUtility.GetPlane(ValueUnsigned); + + /// + /// A instance that represents the Unicode replacement character U+FFFD. + /// + public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar); + + /// + /// Returns the length in code units () of the + /// UTF-16 sequence required to represent this scalar value. + /// + /// + /// The return value will be 1 or 2. + /// + public int Utf16SequenceLength => UnicodeUtility.GetUtf16SequenceLength(ValueUnsigned); + + /// + /// Returns the length in code units () of the + /// UTF-8 sequence required to represent this scalar value. + /// + /// + /// The return value will be 1 through 4, inclusive. + /// + public int Utf8SequenceLength => UnicodeUtility.GetUtf8SequenceLength(ValueUnsigned); + + /// + /// Returns the Unicode scalar value as an integer. 
+ /// + public int Value => (int)ValueUnsigned; + + internal uint ValueUnsigned { get; } + + private static Rune ChangeCase(Rune rune, CultureInfo culture, bool toUpper) + { + if (culture == null) + { + throw new ArgumentNullException(nameof(culture)); + } + + var textInfo = culture.TextInfo; + + Span original = stackalloc char[2]; // worst case scenario = 2 code units (for a surrogate pair) + Span modified = stackalloc char[2]; // case change should preserve UTF-16 code unit count + + int charCount = rune.EncodeToUtf16(original); + original = original.Slice(0, charCount); + modified = modified.Slice(0, charCount); + + if (toUpper) + { + textInfo.ChangeCaseToUpper(original, modified); + } + else + { + textInfo.ChangeCaseToLower(original, modified); + } + + // We use simple case folding rules, which disallows moving between the BMP and supplementary + // planes when performing a case conversion. The helper methods which reconstruct a Rune + // contain debug asserts for this condition. + + if (rune.IsBmp) + { + return UnsafeCreate(modified[0]); + } + else + { + return UnsafeCreate(UnicodeUtility.GetScalarFromUtf16SurrogatePair(modified[0], modified[1])); + } + } + + public int CompareTo(Rune other) => this.ValueUnsigned.CompareTo(other.ValueUnsigned); + + // returns the number of chars written + private int EncodeToUtf16(Span destination) + { + Debug.Assert(destination.Length >= Utf16SequenceLength, "Caller should've provided a large enough buffer."); + bool success = TryEncode(destination, out int charsWritten); + Debug.Assert(success, "TryEncode should never fail given a large enough buffer."); + return charsWritten; + } + + public override bool Equals(object obj) => (obj is Rune other) && this.Equals(other); + + public bool Equals(Rune other) => (this == other); + + public override int GetHashCode() => Value; + + /// + /// Gets the which begins at index in + /// string . 
+ /// + /// + /// Throws if the input is null, the index is out of range, or the input does not + /// point to the beginning of a valid scalar within the string. + /// + public static Rune GetRuneAt(string input, int index) + { + if (!TryGetRuneAt(input, index, out Rune value)) + { + ThrowInvalidSurrogateException(); + } + + return value; + } + + /// + /// Returns iff is a valid Unicode scalar + /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive. + /// + public static bool IsValid(int value) => IsValid((uint)value); + + /// + /// Returns iff is a valid Unicode scalar + /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive. + /// + [CLSCompliant(false)] + public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value); + + [StackTraceHidden] + private static void ThrowInvalidSurrogateException() + { + throw new ArgumentException( + message: SR.Argument_CannotExtractScalar, + paramName: "index"); + } + + /// + /// Returns a representation of this instance. + /// + public override string ToString() + { + Span chars = stackalloc char[2]; // worst case + return new string(chars.Slice(0, EncodeToUtf16(chars))); + } + + /// + /// Attempts to create a from the provided input value. + /// + public static bool TryCreate(char ch, out Rune result) + { + uint extendedValue = ch; + if (!UnicodeUtility.IsSurrogateCodePoint(extendedValue)) + { + result = UnsafeCreate(extendedValue); + return true; + } + else + { + result = default; + return false; + } + } + + /// + /// Attempts to create a from the provided input value. + /// + public static bool TryCreate(int value, out Rune result) => TryCreate((uint)value, out result); + + /// + /// Attempts to create a from the provided input value. 
+ /// + [CLSCompliant(false)] + public static bool TryCreate(uint value, out Rune result) + { + if (UnicodeUtility.IsValidUnicodeScalar(value)) + { + result = UnsafeCreate(value); + return true; + } + else + { + result = default; + return false; + } + } + + /// + /// Encodes this to a UTF-16 destination buffer. + /// + /// The buffer to which to write this value as UTF-16. + /// + /// The number of s written to , + /// or 0 if the destination buffer is not large enough to contain the output. + /// True if the value was written to the buffer; otherwise, false. + /// + /// The property can be queried ahead of time to determine + /// the required size of the buffer. + /// + public bool TryEncode(Span destination, out int charsWritten) + { + if (destination.Length >= 1) + { + if (IsBmp) + { + destination[0] = (char)ValueUnsigned; + charsWritten = 1; + return true; + } + else if (destination.Length >= 2) + { + UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(ValueUnsigned, out destination[0], out destination[1]); + charsWritten = 2; + return true; + } + } + + // Destination buffer not large enough + + charsWritten = default; + return false; + } + + /// + /// Encodes this to a destination buffer as UTF-8 bytes. + /// + /// The buffer to which to write this value as UTF-8. + /// + /// The number of s written to , + /// or 0 if the destination buffer is not large enough to contain the output. + /// True if the value was written to the buffer; otherwise, false. + /// + /// The property can be queried ahead of time to determine + /// the required size of the buffer. + /// + // ** This is public so it can be unit tested but isn't yet exposed via the reference assemblies. ** + public bool TryEncodeToUtf8Bytes(Span destination, out int bytesWritten) + { + // TODO: Optimize some of these writes by using BMI2 instructions. + + // The bit patterns below come from the Unicode Standard, Table 3-6. 
+ + if (destination.Length >= 1) + { + if (IsAscii) + { + destination[0] = (byte)ValueUnsigned; + bytesWritten = 1; + return true; + } + + if (destination.Length >= 2) + { + if (ValueUnsigned <= 0x7FFu) + { + // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ] + destination[0] = (byte)((ValueUnsigned + (0b110u << 11)) >> 6); + destination[1] = (byte)((ValueUnsigned & 0x3Fu) + 0x80u); + bytesWritten = 2; + return true; + } + + if (destination.Length >= 3) + { + if (ValueUnsigned <= 0xFFFFu) + { + // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ] + destination[0] = (byte)((ValueUnsigned + (0b1110 << 16)) >> 12); + destination[1] = (byte)(((ValueUnsigned & (0x3Fu << 6)) >> 6) + 0x80u); + destination[2] = (byte)((ValueUnsigned & 0x3Fu) + 0x80u); + bytesWritten = 3; + return true; + } + + if (destination.Length >= 4) + { + // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] + destination[0] = (byte)((ValueUnsigned + (0b11110 << 21)) >> 18); + destination[1] = (byte)(((ValueUnsigned & (0x3Fu << 12)) >> 12) + 0x80u); + destination[2] = (byte)(((ValueUnsigned & (0x3Fu << 6)) >> 6) + 0x80u); + destination[3] = (byte)((ValueUnsigned & 0x3Fu) + 0x80u); + bytesWritten = 4; + return true; + } + } + } + } + + // Destination buffer not large enough + + bytesWritten = default; + return false; + } + + /// + /// Attempts to get the which begins at index in + /// string . + /// + /// + /// Throws if the input is null or the index is out of range, but not if the input contains + /// invalid data from which a Rune cannot be extracted. + /// + public static bool TryGetRuneAt(string input, int index, out Rune value) + { + if (input == null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); + } + + if ((uint)index >= (uint)input.Length) + { + ThrowHelper.ThrowArgumentOutOfRange_IndexException(); + } + + // Optimistically assume input is within BMP. 
+ + uint returnValue = input[index]; + if (UnicodeUtility.IsSurrogateCodePoint(returnValue)) + { + if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue)) + { + goto Fail; + } + + // Treat 'returnValue' as the high surrogate. + // + // If this becomes a hot code path, we can skip the below bounds check by reading + // off the end of the string using unsafe code. Since strings are null-terminated, + // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if + // the string terminates unexpectedly. + + index++; + if ((uint)index >= (uint)input.Length) + { + goto Fail; // not an argument exception - just a "bad data" failure + } + + uint potentialLowSurrogate = input[index]; + if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate)) + { + goto Fail; + } + + returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate); + } + + value = UnsafeCreate(returnValue); + return true; + + Fail: + value = default; + return false; + } + + // Allows constructing a Unicode scalar value from an arbitrary 32-bit integer without + // validation. It is the caller's responsibility to have performed manual validation + // before calling this method. If a Rune instance is forcibly constructed + // from invalid input, the APIs on this type have undefined behavior, potentially including + // introducing a security hole in the consuming application. + // + // An example of a security hole resulting from an invalid Rune value, which could result + // in a stack overflow. + // + // public int GetMarvin32HashCode(Rune r) { + // Span buffer = stackalloc char[r.Utf16SequenceLength]; + // r.TryEncode(buffer, ...); + // return Marvin32.ComputeHash(buffer.AsBytes()); + // } + + /// + /// Creates a without performing validation on the input.
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Rune UnsafeCreate(uint scalarValue) => new Rune(scalarValue, false); + + // These are analogs of APIs on System.Char + + public static double GetNumericValue(Rune value) + { + if (value.IsAscii) + { + uint baseNum = value.ValueUnsigned - '0'; + return (baseNum <= 9) ? (double)baseNum : -1; + } + else + { + // not an ASCII char; fall back to globalization table + return CharUnicodeInfo.InternalGetNumericValue(value.Value); + } + } + + public static UnicodeCategory GetUnicodeCategory(Rune value) + { + if (value.IsAscii) + { + return (UnicodeCategory)(AsciiCharInfo[value.Value] & UNICODECATEGORY_MASK); + } + else + { + return GetUnicodeCategoryNonAscii(value); + } + } + + private static UnicodeCategory GetUnicodeCategoryNonAscii(Rune value) + { + Debug.Assert(!value.IsAscii, "Shouldn't use this non-optimized code path for ASCII characters."); + return CharUnicodeInfo.GetUnicodeCategory(value.Value); + } + + // Returns true iff this Unicode category represents a letter + private static bool IsCategoryLetter(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.UppercaseLetter, (uint)UnicodeCategory.OtherLetter); + } + + // Returns true iff this Unicode category represents a letter or a decimal digit + private static bool IsCategoryLetterOrDecimalDigit(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.UppercaseLetter, (uint)UnicodeCategory.OtherLetter) + || (category == UnicodeCategory.DecimalDigitNumber); + } + + // Returns true iff this Unicode category represents a number + private static bool IsCategoryNumber(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.DecimalDigitNumber, (uint)UnicodeCategory.OtherNumber); + } + + // Returns true iff this Unicode category represents a punctuation mark + private static bool 
IsCategoryPunctuation(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.ConnectorPunctuation, (uint)UnicodeCategory.OtherPunctuation); + } + + // Returns true iff this Unicode category represents a separator + private static bool IsCategorySeparator(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.SpaceSeparator, (uint)UnicodeCategory.ParagraphSeparator); + } + + // Returns true iff this Unicode category represents a symbol + private static bool IsCategorySymbol(UnicodeCategory category) + { + return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.MathSymbol, (uint)UnicodeCategory.OtherSymbol); + } + + public static bool IsControl(Rune value) + { + // Per the Unicode stability policy, the set of control characters + // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No + // characters will ever be added to the "control characters" group. + // See http://www.unicode.org/policies/stability_policy.html. 
+ + // Logic below depends on Rune.Value never being -1 (since Rune is a validating type) + // 00..1F (+1) => 01..20 (&~80) => 01..20 + // 7F..9F (+1) => 80..A0 (&~80) => 00..20 + + return (((value.ValueUnsigned + 1) & ~0x80u) <= 0x20u); + } + + public static bool IsDigit(Rune value) + { + if (value.IsAscii) + { + return UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, '0', '9'); + } + else + { + return (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.DecimalDigitNumber); + } + } + + public static bool IsLetter(Rune value) + { + if (value.IsAscii) + { + return (((value.ValueUnsigned - 'A') & ~0x20u) <= (uint)('Z' - 'A')); // [A-Za-z] + } + else + { + return IsCategoryLetter(GetUnicodeCategoryNonAscii(value)); + } + } + + public static bool IsLetterOrDigit(Rune value) + { + if (value.IsAscii) + { + return ((AsciiCharInfo[value.Value] & IS_LETTER_OR_DIGIT_FLAG) != 0); + } + else + { + return IsCategoryLetterOrDecimalDigit(GetUnicodeCategoryNonAscii(value)); + } + } + + public static bool IsLower(Rune value) + { + if (value.IsAscii) + { + return UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, 'a', 'z'); + } + else + { + return (GetUnicodeCategoryNonAscii(value) == UnicodeCategory.LowercaseLetter); + } + } + + public static bool IsNumber(Rune value) + { + if (value.IsAscii) + { + return UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, '0', '9'); + } + else + { + return IsCategoryNumber(GetUnicodeCategoryNonAscii(value)); + } + } + + public static bool IsPunctuation(Rune value) + { + return IsCategoryPunctuation(GetUnicodeCategory(value)); + } + + public static bool IsSeparator(Rune value) + { + return IsCategorySeparator(GetUnicodeCategory(value)); + } + + public static bool IsSymbol(Rune value) + { + return IsCategorySymbol(GetUnicodeCategory(value)); + } + + public static bool IsUpper(Rune value) + { + if (value.IsAscii) + { + return UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, 'A', 'Z'); + } + else + { + return 
(GetUnicodeCategoryNonAscii(value) == UnicodeCategory.UppercaseLetter); + } + } + + public static bool IsWhiteSpace(Rune value) + { + if (value.IsAscii) + { + return (AsciiCharInfo[value.Value] & IS_WHITESPACE_FLAG) != 0; + } + + // U+0085 is special since it's a whitespace character but is in the Control category + // instead of a normal separator category. No other code point outside the ASCII range + // has this mismatch. + + if (value.ValueUnsigned == 0x0085u) + { + return true; + } + + return IsCategorySeparator(GetUnicodeCategoryNonAscii(value)); + } + + public static Rune ToLower(Rune value, CultureInfo culture) => ChangeCase(value, culture, toUpper: false); + + public static Rune ToLowerInvariant(Rune value) + { + // Handle the most common case (ASCII data) first. Within the common case, we expect + // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless. + + if (value.IsAscii || GlobalizationMode.Invariant) + { + bool isUpperAlpha = UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, 'A', 'Z'); + return UnsafeCreate(value.ValueUnsigned + ((isUpperAlpha) ? 0x20U : 0)); + } + + // Non-ASCII data requires going through the case folding tables. + + return ToLower(value, CultureInfo.InvariantCulture); + } + + public static Rune ToUpper(Rune value, CultureInfo culture) => ChangeCase(value, culture, toUpper: true); + + public static Rune ToUpperInvariant(Rune value) + { + // Handle the most common case (ASCII data) first. Within the common case, we expect + // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless. + + if (value.IsAscii || GlobalizationMode.Invariant) + { + bool isLowerAlpha = UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, 'a', 'z'); + return UnsafeCreate(value.ValueUnsigned ^ ((isLowerAlpha) ? 0x20u : 0)); + } + + // Non-ASCII data requires going through the case folding tables. 
+ + return ToUpper(value, CultureInfo.InvariantCulture); + } + } +} diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs new file mode 100644 index 000000000000..dedfbe2254cb --- /dev/null +++ b/src/System.Private.CoreLib/shared/System/Text/UnicodeDebug.cs @@ -0,0 +1,53 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Diagnostics; +using System.Runtime.InteropServices; + +namespace System.Text +{ + internal static class UnicodeDebug + { + [Conditional("DEBUG")] + internal static void AssertIsHighSurrogateCodePoint(uint codePoint) + { + Debug.Assert(UnicodeUtility.IsHighSurrogateCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid UTF-16 high surrogate code point."); + } + + [Conditional("DEBUG")] + internal static void AssertIsLowSurrogateCodePoint(uint codePoint) + { + Debug.Assert(UnicodeUtility.IsLowSurrogateCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid UTF-16 low surrogate code point."); + } + + [Conditional("DEBUG")] + internal static void AssertIsValidCodePoint(uint codePoint) + { + Debug.Assert(UnicodeUtility.IsValidCodePoint(codePoint), $"The value {ToHexString(codePoint)} is not a valid Unicode code point."); + } + + [Conditional("DEBUG")] + internal static void AssertIsValidScalar(uint scalarValue) + { + Debug.Assert(UnicodeUtility.IsValidUnicodeScalar(scalarValue), $"The value {ToHexString(scalarValue)} is not a valid Unicode scalar value."); + } + + [Conditional("DEBUG")] + internal static void AssertIsValidSupplementaryPlaneScalar(uint scalarValue) + { + Debug.Assert(UnicodeUtility.IsValidUnicodeScalar(scalarValue) && !UnicodeUtility.IsBmpCodePoint(scalarValue), $"The value {ToHexString(scalarValue)} is not a valid supplementary plane Unicode 
scalar value."); + } + + /// + /// Formats a code point as the hex string "U+XXXX". + /// + /// + /// The input value doesn't have to be a real code point in the Unicode codespace. It can be any integer. + /// + private static string ToHexString(uint codePoint) + { + return FormattableString.Invariant($"U+{codePoint:X4}"); + } + } +} diff --git a/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs new file mode 100644 index 000000000000..c1dcefd74bc1 --- /dev/null +++ b/src/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs @@ -0,0 +1,180 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; + +namespace System.Text +{ + internal static class UnicodeUtility + { + /// + /// The Unicode replacement character U+FFFD. + /// + public const uint ReplacementChar = 0xFFFDU; + + /// + /// Returns the Unicode plane (0 through 16, inclusive) which contains this code point. + /// + public static int GetPlane(uint codePoint) + { + UnicodeDebug.AssertIsValidCodePoint(codePoint); + + return (int)(codePoint >> 16); + } + + /// + /// Returns a Unicode scalar value from two code points representing a UTF-16 surrogate pair. + /// + public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, uint lowSurrogateCodePoint) + { + UnicodeDebug.AssertIsHighSurrogateCodePoint(highSurrogateCodePoint); + UnicodeDebug.AssertIsLowSurrogateCodePoint(lowSurrogateCodePoint); + + // This calculation comes from the Unicode specification, Table 3-5. + // Need to remove the D800 marker from the high surrogate and the DC00 marker from the low surrogate, + // then fix up the "wwww = uuuuu - 1" section of the bit distribution. The code is written as below + // to become just two instructions: shl, lea. 
+ + return (highSurrogateCodePoint << 10) + lowSurrogateCodePoint - ((0xD800U << 10) + 0xDC00U - (1 << 16)); + } + + /// + /// Given a Unicode scalar value, gets the number of UTF-16 code units required to represent this value. + /// + public static int GetUtf16SequenceLength(uint value) + { + UnicodeDebug.AssertIsValidScalar(value); + + value -= 0x10000; // if value < 0x10000, high byte = 0xFF; else high byte = 0x00 + value += (2 << 24); // if value < 0x10000, high byte = 0x01; else high byte = 0x02 + value >>= 24; // shift high byte down + return (int)value; // and return it + } + + /// + /// Decomposes an astral Unicode scalar into UTF-16 high and low surrogate code units. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, out char highSurrogateCodePoint, out char lowSurrogateCodePoint) + { + UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(value); + + // This calculation comes from the Unicode specification, Table 3-5. + + highSurrogateCodePoint = (char)((value + ((0xD800u - 0x40u) << 10)) >> 10); + lowSurrogateCodePoint = (char)((value & 0x3FFu) + 0xDC00u); + } + + /// + /// Given a Unicode scalar value, gets the number of UTF-8 code units required to represent this value. + /// + public static int GetUtf8SequenceLength(uint value) + { + UnicodeDebug.AssertIsValidScalar(value); + + // The logic below can handle all valid scalar values branchlessly. + // It gives generally good performance across all inputs, and on x86 + // it's only six instructions: lea, sar, xor, add, shr, lea. 
+ + // 'a' will be -1 if input is < 0x800; else 'a' will be 0 + // => 'a' will be -1 if input is 1 or 2 UTF-8 code units; else 'a' will be 0 + + int a = ((int)value - 0x0800) >> 31; + + // The number of UTF-8 code units for a given scalar is as follows: + // - U+0000..U+007F => 1 code unit + // - U+0080..U+07FF => 2 code units + // - U+0800..U+FFFF => 3 code units + // - U+10000+ => 4 code units + // + // If we XOR the incoming scalar with 0xF800, the chart mutates: + // - U+0000..U+F7FF => 3 code units + // - U+F800..U+F87F => 1 code unit + // - U+F880..U+FFFF => 2 code units + // - U+10000+ => 4 code units + // + // Since the 1- and 3-code unit cases are now clustered, they can + // both be checked together very cheaply. + + value ^= 0xF800u; + value -= 0xF880u; // if scalar is 1 or 3 code units, high byte = 0xFF; else high byte = 0x00 + value += (4 << 24); // if scalar is 1 or 3 code units, high byte = 0x03; else high byte = 0x04 + value >>= 24; // shift high byte down + + // Final return value: + // - U+0000..U+007F => 3 + (-1) * 2 = 1 + // - U+0080..U+07FF => 4 + (-1) * 2 = 2 + // - U+0800..U+FFFF => 3 + ( 0) * 2 = 3 + // - U+10000+ => 4 + ( 0) * 2 = 4 + return (int)value + (a * 2); + } + + /// + /// Returns iff is an ASCII + /// character ([ U+0000..U+007F ]). + /// + /// + /// Per http://www.unicode.org/glossary/#ASCII, ASCII is only U+0000..U+007F. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsAsciiCodePoint(uint value) => (value <= 0x7Fu); + + /// + /// Returns iff is in the + /// Basic Multilingual Plane (BMP). + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsBmpCodePoint(uint value) => (value <= 0xFFFFu); + + /// + /// Returns iff is a UTF-16 high surrogate code point, + /// i.e., is in [ U+D800..U+DBFF ], inclusive. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsHighSurrogateCodePoint(uint value) => IsInRangeInclusive(value, 0xD800U, 0xDBFFU); + + /// + /// Returns iff is between + /// and , inclusive. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsInRangeInclusive(uint value, uint lowerBound, uint upperBound) => ((value - lowerBound) <= (upperBound - lowerBound)); + + /// + /// Returns iff is a UTF-16 low surrogate code point, + /// i.e., is in [ U+DC00..U+DFFF ], inclusive. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsLowSurrogateCodePoint(uint value) => IsInRangeInclusive(value, 0xDC00U, 0xDFFFU); + + /// + /// Returns iff is a UTF-16 surrogate code point, + /// i.e., is in [ U+D800..U+DFFF ], inclusive. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsSurrogateCodePoint(uint value) => IsInRangeInclusive(value, 0xD800U, 0xDFFFU); + + /// + /// Returns iff is a valid Unicode code + /// point, i.e., is in [ U+0000..U+10FFFF ], inclusive. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsValidCodePoint(uint codePoint) => (codePoint <= 0x10FFFFU); + + /// + /// Returns iff is a valid Unicode scalar + /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsValidUnicodeScalar(uint value) + { + // By XORing the incoming value with 0xD800, surrogate code points + // are moved to the range [ U+0000..U+07FF ], and all valid scalar + // values are clustered into the single range [ U+0800..U+10FFFF ], + // which allows performing a single fast range check. 
+ + return IsInRangeInclusive(value ^ 0xD800U, 0x800U, 0x10FFFFU); + } + } +} diff --git a/src/System.Private.CoreLib/src/System/ThrowHelper.cs b/src/System.Private.CoreLib/src/System/ThrowHelper.cs index 41745e78013c..c73cd8520842 100644 --- a/src/System.Private.CoreLib/src/System/ThrowHelper.cs +++ b/src/System.Private.CoreLib/src/System/ThrowHelper.cs @@ -490,6 +490,7 @@ internal enum ExceptionArgument pHandle, values, task, + ch, s, input, pointer, From 75547366cd701215633ad09024512b31cc54ad59 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Mon, 12 Nov 2018 12:57:04 -0800 Subject: [PATCH 2/5] PR feedback --- .../shared/System/Text/Rune.cs | 106 +++++++++--------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs index 7857c7f624fe..4419d08db775 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs @@ -18,9 +18,9 @@ namespace System.Text [DebuggerDisplay("{DebuggerDisplay,nq}")] public readonly struct Rune : IComparable, IEquatable { - private const byte IS_WHITESPACE_FLAG = 0x80; - private const byte IS_LETTER_OR_DIGIT_FLAG = 0x40; - private const byte UNICODECATEGORY_MASK = 0x1F; + private const byte IsWhiteSpaceFlag = 0x80; + private const byte IsLetterOrDigitFlag = 0x40; + private const byte UnicodeCategoryMask = 0x1F; // Contains information about the ASCII character range [ U+0000..U+007F ], with: // - 0x80 bit if set means 'is whitespace' @@ -39,6 +39,8 @@ namespace System.Text 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E }; + internal readonly uint UnsignedValue; + /// /// Creates a from the provided UTF-16 code unit. 
/// @@ -53,7 +55,7 @@ public Rune(char ch) { ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch); } - ValueUnsigned = expanded; + UnsignedValue = expanded; } /// @@ -80,27 +82,27 @@ public Rune(uint value) { ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value); } - ValueUnsigned = value; + UnsignedValue = value; } // non-validating ctor private Rune(uint scalarValue, bool unused) { UnicodeDebug.AssertIsValidScalar(scalarValue); - ValueUnsigned = scalarValue; + UnsignedValue = scalarValue; } - public static bool operator ==(Rune left, Rune right) => (left.ValueUnsigned == right.ValueUnsigned); + public static bool operator ==(Rune left, Rune right) => (left.UnsignedValue == right.UnsignedValue); - public static bool operator !=(Rune left, Rune right) => (left.ValueUnsigned != right.ValueUnsigned); + public static bool operator !=(Rune left, Rune right) => (left.UnsignedValue != right.UnsignedValue); - public static bool operator <(Rune left, Rune right) => (left.ValueUnsigned < right.ValueUnsigned); + public static bool operator <(Rune left, Rune right) => (left.UnsignedValue < right.UnsignedValue); - public static bool operator <=(Rune left, Rune right) => (left.ValueUnsigned <= right.ValueUnsigned); + public static bool operator <=(Rune left, Rune right) => (left.UnsignedValue <= right.UnsignedValue); - public static bool operator >(Rune left, Rune right) => (left.ValueUnsigned > right.ValueUnsigned); + public static bool operator >(Rune left, Rune right) => (left.UnsignedValue > right.UnsignedValue); - public static bool operator >=(Rune left, Rune right) => (left.ValueUnsigned >= right.ValueUnsigned); + public static bool operator >=(Rune left, Rune right) => (left.UnsignedValue >= right.UnsignedValue); // Operators below are explicit because they may throw. 
@@ -112,24 +114,24 @@ private Rune(uint scalarValue, bool unused) public static explicit operator Rune(int value) => new Rune(value); // Displayed as "'' (U+XXXX)"; e.g., "'e' (U+0065)" - private string DebuggerDisplay => FormattableString.Invariant($"'{(IsValid(ValueUnsigned) ? ToString() : "\uFFFD")}' (U+{ValueUnsigned:X4})"); + private string DebuggerDisplay => FormattableString.Invariant($"U+{UnsignedValue:X4} '{(IsValid(UnsignedValue) ? ToString() : "\uFFFD")}'"); /// /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ]) /// and therefore representable by a single UTF-8 code unit. /// - public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(ValueUnsigned); + public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(UnsignedValue); /// /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ]) /// and therefore representable by a single UTF-16 code unit. /// - public bool IsBmp => UnicodeUtility.IsBmpCodePoint(ValueUnsigned); + public bool IsBmp => UnicodeUtility.IsBmpCodePoint(UnsignedValue); /// /// Returns the Unicode plane (0 to 16, inclusive) which contains this scalar. /// - public int Plane => UnicodeUtility.GetPlane(ValueUnsigned); + public int Plane => UnicodeUtility.GetPlane(UnsignedValue); /// /// A instance that represents the Unicode replacement character U+FFFD. @@ -143,7 +145,7 @@ private Rune(uint scalarValue, bool unused) /// /// The return value will be 1 or 2. /// - public int Utf16SequenceLength => UnicodeUtility.GetUtf16SequenceLength(ValueUnsigned); + public int Utf16SequenceLength => UnicodeUtility.GetUtf16SequenceLength(UnsignedValue); /// /// Returns the length in code units () of the @@ -152,20 +154,18 @@ private Rune(uint scalarValue, bool unused) /// /// The return value will be 1 through 4, inclusive. 
/// - public int Utf8SequenceLength => UnicodeUtility.GetUtf8SequenceLength(ValueUnsigned); + public int Utf8SequenceLength => UnicodeUtility.GetUtf8SequenceLength(UnsignedValue); /// /// Returns the Unicode scalar value as an integer. /// - public int Value => (int)ValueUnsigned; - - internal uint ValueUnsigned { get; } + public int Value => (int)UnsignedValue; private static Rune ChangeCase(Rune rune, CultureInfo culture, bool toUpper) { if (culture == null) { - throw new ArgumentNullException(nameof(culture)); + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture); } var textInfo = culture.TextInfo; @@ -200,7 +200,7 @@ private static Rune ChangeCase(Rune rune, CultureInfo culture, bool toUpper) } } - public int CompareTo(Rune other) => this.ValueUnsigned.CompareTo(other.ValueUnsigned); + public int CompareTo(Rune other) => this.UnsignedValue.CompareTo(other.UnsignedValue); // returns the number of chars written private int EncodeToUtf16(Span destination) @@ -324,13 +324,13 @@ public bool TryEncode(Span destination, out int charsWritten) { if (IsBmp) { - destination[0] = (char)ValueUnsigned; + destination[0] = (char)UnsignedValue; charsWritten = 1; return true; } else if (destination.Length >= 2) { - UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(ValueUnsigned, out destination[0], out destination[1]); + UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(UnsignedValue, out destination[0], out destination[1]); charsWritten = 2; return true; } @@ -365,30 +365,30 @@ public bool TryEncodeToUtf8Bytes(Span destination, out int bytesWritten) { if (IsAscii) { - destination[0] = (byte)ValueUnsigned; + destination[0] = (byte)UnsignedValue; bytesWritten = 1; return true; } if (destination.Length >= 2) { - if (ValueUnsigned <= 0x7FFu) + if (UnsignedValue <= 0x7FFu) { // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ] - destination[0] = (byte)((ValueUnsigned + (0b110u << 11)) >> 6); - destination[1] = (byte)((ValueUnsigned & 
0x3Fu) + 0x80u); + destination[0] = (byte)((UnsignedValue + (0b110u << 11)) >> 6); + destination[1] = (byte)((UnsignedValue & 0x3Fu) + 0x80u); bytesWritten = 2; return true; } if (destination.Length >= 3) { - if (ValueUnsigned <= 0xFFFFu) + if (UnsignedValue <= 0xFFFFu) { // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ] - destination[0] = (byte)((ValueUnsigned + (0b1110 << 16)) >> 12); - destination[1] = (byte)(((ValueUnsigned & (0x3Fu << 6)) >> 6) + 0x80u); - destination[2] = (byte)((ValueUnsigned & 0x3Fu) + 0x80u); + destination[0] = (byte)((UnsignedValue + (0b1110 << 16)) >> 12); + destination[1] = (byte)(((UnsignedValue & (0x3Fu << 6)) >> 6) + 0x80u); + destination[2] = (byte)((UnsignedValue & 0x3Fu) + 0x80u); bytesWritten = 3; return true; } @@ -396,10 +396,10 @@ public bool TryEncodeToUtf8Bytes(Span destination, out int bytesWritten) if (destination.Length >= 4) { // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] - destination[0] = (byte)((ValueUnsigned + (0b11110 << 21)) >> 18); - destination[1] = (byte)(((ValueUnsigned & (0x3Fu << 12)) >> 12) + 0x80u); - destination[2] = (byte)(((ValueUnsigned & (0x3Fu << 6)) >> 6) + 0x80u); - destination[3] = (byte)((ValueUnsigned & 0x3Fu) + 0x80u); + destination[0] = (byte)((UnsignedValue + (0b11110 << 21)) >> 18); + destination[1] = (byte)(((UnsignedValue & (0x3Fu << 12)) >> 12) + 0x80u); + destination[2] = (byte)(((UnsignedValue & (0x3Fu << 6)) >> 6) + 0x80u); + destination[3] = (byte)((UnsignedValue & 0x3Fu) + 0x80u); bytesWritten = 4; return true; } @@ -500,7 +500,7 @@ public static double GetNumericValue(Rune value) { if (value.IsAscii) { - uint baseNum = value.ValueUnsigned - '0'; + uint baseNum = value.UnsignedValue - '0'; return (baseNum <= 9) ? 
(double)baseNum : -1; } else @@ -514,7 +514,7 @@ public static UnicodeCategory GetUnicodeCategory(Rune value) { if (value.IsAscii) { - return (UnicodeCategory)(AsciiCharInfo[value.Value] & UNICODECATEGORY_MASK); + return (UnicodeCategory)(AsciiCharInfo[value.Value] & UnicodeCategoryMask); } else { @@ -576,14 +576,14 @@ public static bool IsControl(Rune value) // 00..1F (+1) => 01..20 (&~80) => 01..20 // 7F..9F (+1) => 80..A0 (&~80) => 00..20 - return (((value.ValueUnsigned + 1) & ~0x80u) <= 0x20u); + return (((value.UnsignedValue + 1) & ~0x80u) <= 0x20u); } public static bool IsDigit(Rune value) { if (value.IsAscii) { - return UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, '0', '9'); + return UnicodeUtility.IsInRangeInclusive(value.UnsignedValue, '0', '9'); } else { @@ -595,7 +595,7 @@ public static bool IsLetter(Rune value) { if (value.IsAscii) { - return (((value.ValueUnsigned - 'A') & ~0x20u) <= (uint)('Z' - 'A')); // [A-Za-z] + return (((value.UnsignedValue - 'A') & ~0x20u) <= (uint)('Z' - 'A')); // [A-Za-z] } else { @@ -607,7 +607,7 @@ public static bool IsLetterOrDigit(Rune value) { if (value.IsAscii) { - return ((AsciiCharInfo[value.Value] & IS_LETTER_OR_DIGIT_FLAG) != 0); + return ((AsciiCharInfo[value.Value] & IsLetterOrDigitFlag) != 0); } else { @@ -619,7 +619,7 @@ public static bool IsLower(Rune value) { if (value.IsAscii) { - return UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, 'a', 'z'); + return UnicodeUtility.IsInRangeInclusive(value.UnsignedValue, 'a', 'z'); } else { @@ -631,7 +631,7 @@ public static bool IsNumber(Rune value) { if (value.IsAscii) { - return UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, '0', '9'); + return UnicodeUtility.IsInRangeInclusive(value.UnsignedValue, '0', '9'); } else { @@ -658,7 +658,7 @@ public static bool IsUpper(Rune value) { if (value.IsAscii) { - return UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, 'A', 'Z'); + return UnicodeUtility.IsInRangeInclusive(value.UnsignedValue, 'A', 'Z'); 
} else { @@ -670,14 +670,14 @@ public static bool IsWhiteSpace(Rune value) { if (value.IsAscii) { - return (AsciiCharInfo[value.Value] & IS_WHITESPACE_FLAG) != 0; + return (AsciiCharInfo[value.Value] & IsWhiteSpaceFlag) != 0; } // U+0085 is special since it's a whitespace character but is in the Control category // instead of a normal separator category. No other code point outside the ASCII range // has this mismatch. - if (value.ValueUnsigned == 0x0085u) + if (value.UnsignedValue == 0x0085u) { return true; } @@ -694,8 +694,9 @@ public static Rune ToLowerInvariant(Rune value) if (value.IsAscii || GlobalizationMode.Invariant) { - bool isUpperAlpha = UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, 'A', 'Z'); - return UnsafeCreate(value.ValueUnsigned + ((isUpperAlpha) ? 0x20U : 0)); + // It's ok for us to use the UTF-16 conversion utility for this since the high + // 16 bits of the value will never be set so will be left unchanged. + return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value.UnsignedValue)); } // Non-ASCII data requires going through the case folding tables. @@ -712,8 +713,9 @@ public static Rune ToUpperInvariant(Rune value) if (value.IsAscii || GlobalizationMode.Invariant) { - bool isLowerAlpha = UnicodeUtility.IsInRangeInclusive(value.ValueUnsigned, 'a', 'z'); - return UnsafeCreate(value.ValueUnsigned ^ ((isLowerAlpha) ? 0x20u : 0)); + // It's ok for us to use the UTF-16 conversion utility for this since the high + // 16 bits of the value will never be set so will be left unchanged. + return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value.UnsignedValue)); } // Non-ASCII data requires going through the case folding tables. 
From dabb9f1a51b8addae6d320a3fee530e083d63508 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Mon, 12 Nov 2018 14:13:03 -0800 Subject: [PATCH 3/5] Move GetRuneAt / TryGetRuneAt to System.String --- .../shared/System/String.cs | 83 +++++++++++++++++ .../shared/System/Text/Rune.cs | 90 +------------------ .../src/System/ThrowHelper.cs | 6 ++ 3 files changed, 91 insertions(+), 88 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/String.cs b/src/System.Private.CoreLib/shared/System/String.cs index 7050644d9aac..d8d7bc02519e 100644 --- a/src/System.Private.CoreLib/shared/System/String.cs +++ b/src/System.Private.CoreLib/shared/System/String.cs @@ -517,6 +517,89 @@ public string ToString(IFormatProvider provider) return this; } + /// + /// Gets the which begins at index in this string. + /// + /// + /// Throws if is out of range or if does not + /// reference the start of a valid scalar value within this string. + /// + public Rune GetRuneAt(int index) + { + int runeValue = ReadRuneAtIndex(index); + if (runeValue < 0) + { + ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index); + } + + return Rune.UnsafeCreate((uint)runeValue); + } + + /// + /// Attempts to get the which begins at index in this string. + /// + /// True if a scalar value was successfully extracted from the specified index, + /// false if a value could not be extracted due to invalid data. + /// + /// Throws only if is out of range. + /// + public bool TryGetRuneAt(int index, out Rune value) + { + int runeValue = ReadRuneAtIndex(index); + if (runeValue >= 0) + { + value = Rune.UnsafeCreate((uint)runeValue); + return true; + } + else + { + value = default; + return false; + } + } + + private int ReadRuneAtIndex(int index) + { + if ((uint)index >= (uint)Length) + { + ThrowHelper.ThrowArgumentOutOfRange_IndexException(); + } + + // Optimistically assume input is within BMP. 
+ + uint returnValue = this[index]; + if (UnicodeUtility.IsSurrogateCodePoint(returnValue)) + { + if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue)) + { + return -1; + } + + // Treat 'returnValue' as the high surrogate. + // + // If this becomes a hot code path, we can skip the below bounds check by reading + // off the end of the string using unsafe code. Since strings are null-terminated, + // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if + // the string terminates unexpectedly. + + index++; + if ((uint)index >= (uint)Length) + { + return -1; // not an argument exception - just a "bad data" failure + } + + uint potentialLowSurrogate = this[index]; + if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate)) + { + return -1; + } + + returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate); + } + + return (int)returnValue; + } + public CharEnumerator GetEnumerator() { return new CharEnumerator(this); diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs index 4419d08db775..5f78f1063811 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs @@ -217,24 +217,6 @@ private int EncodeToUtf16(Span destination) public override int GetHashCode() => Value; - /// - /// Gets the which begins at index in - /// string . - /// - /// - /// Throws if the input is null, the index is out of range, or the input does not - /// point to the beginning of a valid scalar within the string. - /// - public static Rune GetRuneAt(string input, int index) - { - if (!TryGetRuneAt(input, index, out Rune value)) - { - ThrowInvalidSurrogateException(); - } - - return value; - } - /// /// Returns iff is a valid Unicode scalar /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive. 
@@ -247,15 +229,7 @@ public static Rune GetRuneAt(string input, int index) /// [CLSCompliant(false)] public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value); - - [StackTraceHidden] - private static void ThrowInvalidSurrogateException() - { - throw new ArgumentException( - message: SR.Argument_CannotExtractScalar, - paramName: "index"); - } - + /// /// Returns a representation of this instance. /// @@ -412,67 +386,7 @@ public bool TryEncodeToUtf8Bytes(Span destination, out int bytesWritten) bytesWritten = default; return false; } - - /// - /// Attempts to get the which begins at index in - /// string . - /// - /// - /// Throws if the input is null or the index is out of range, but not if the input contains - /// invalid data from which a Rune cannot be extracted. - /// - public static bool TryGetRuneAt(string input, int index, out Rune value) - { - if (input == null) - { - ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); - } - - if ((uint)index >= (uint)input.Length) - { - ThrowHelper.ThrowArgumentOutOfRange_IndexException(); - } - - // Optimistically assume input is within BMP. - - uint returnValue = input[index]; - if (UnicodeUtility.IsSurrogateCodePoint(returnValue)) - { - if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue)) - { - goto Fail; - } - - // Treat 'returnValue' as the high surrogate. - // - // If this becomes a hot code path, we can skip the below bounds check by reading - // off the end of the string using unsafe code. Since strings are null-terminated, - // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if - // the string terminates unexpectedly. 
- - index++; - if ((uint)index >= (uint)input.Length) - { - goto Fail; // not an argument exception - just a "bad data" failure - } - - uint potentialLowSurrogate = input[index]; - if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate)) - { - goto Fail; - } - - returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate); - } - - value = UnsafeCreate(returnValue); - return true; - - Fail: - value = default; - return false; - } - + // Allows constructing a Unicode scalar value from an arbitrary 32-bit integer without // validation. It is the caller's responsibility to have performed manual validation // before calling this method. If a Rune instance is forcibly constructed diff --git a/src/System.Private.CoreLib/src/System/ThrowHelper.cs b/src/System.Private.CoreLib/src/System/ThrowHelper.cs index c73cd8520842..551f87e7c55f 100644 --- a/src/System.Private.CoreLib/src/System/ThrowHelper.cs +++ b/src/System.Private.CoreLib/src/System/ThrowHelper.cs @@ -76,6 +76,11 @@ internal static void ThrowArgumentException_OverlapAlignmentMismatch() throw new ArgumentException(SR.Argument_OverlapAlignmentMismatch); } + internal static void ThrowArgumentException_CannotExtractScalar(ExceptionArgument argument) + { + throw GetArgumentException(ExceptionResource.Argument_CannotExtractScalar, argument); + } + internal static void ThrowArgumentOutOfRange_IndexException() { throw GetArgumentOutOfRangeException(ExceptionArgument.index, @@ -529,6 +534,7 @@ internal enum ExceptionResource ArgumentOutOfRange_Index, Argument_InvalidOffLen, Argument_ItemNotExist, + Argument_CannotExtractScalar, ArgumentOutOfRange_Count, ArgumentOutOfRange_InvalidThreshold, ArgumentOutOfRange_ListInsert, From d9102e9d704fe08154f44278bcfc862dee822bbe Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Mon, 12 Nov 2018 16:21:53 -0800 Subject: [PATCH 4/5] Move GetRuneAt / TryGetRuneAt back to Rune Other PR feedback --- .../shared/System/String.cs | 83 -------- 
.../shared/System/Text/Rune.cs | 179 +++++++++++++----- 2 files changed, 135 insertions(+), 127 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/String.cs b/src/System.Private.CoreLib/shared/System/String.cs index d8d7bc02519e..7050644d9aac 100644 --- a/src/System.Private.CoreLib/shared/System/String.cs +++ b/src/System.Private.CoreLib/shared/System/String.cs @@ -517,89 +517,6 @@ public string ToString(IFormatProvider provider) return this; } - /// - /// Gets the which begins at index in this string. - /// - /// - /// Throws if is out of range or if does not - /// reference the start of a valid scalar value within this string. - /// - public Rune GetRuneAt(int index) - { - int runeValue = ReadRuneAtIndex(index); - if (runeValue < 0) - { - ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index); - } - - return Rune.UnsafeCreate((uint)runeValue); - } - - /// - /// Attempts to get the which begins at index in this string. - /// - /// True if a scalar value was successfully extracted from the specified index, - /// false if a value could not be extracted due to invalid data. - /// - /// Throws only if is out of range. - /// - public bool TryGetRuneAt(int index, out Rune value) - { - int runeValue = ReadRuneAtIndex(index); - if (runeValue >= 0) - { - value = Rune.UnsafeCreate((uint)runeValue); - return true; - } - else - { - value = default; - return false; - } - } - - private int ReadRuneAtIndex(int index) - { - if ((uint)index >= (uint)Length) - { - ThrowHelper.ThrowArgumentOutOfRange_IndexException(); - } - - // Optimistically assume input is within BMP. - - uint returnValue = this[index]; - if (UnicodeUtility.IsSurrogateCodePoint(returnValue)) - { - if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue)) - { - return -1; - } - - // Treat 'returnValue' as the high surrogate. - // - // If this becomes a hot code path, we can skip the below bounds check by reading - // off the end of the string using unsafe code. 
Since strings are null-terminated, - // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if - // the string terminates unexpectedly. - - index++; - if ((uint)index >= (uint)Length) - { - return -1; // not an argument exception - just a "bad data" failure - } - - uint potentialLowSurrogate = this[index]; - if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate)) - { - return -1; - } - - returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate); - } - - return (int)returnValue; - } - public CharEnumerator GetEnumerator() { return new CharEnumerator(this); diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs index 5f78f1063811..d046f48cc661 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs @@ -39,7 +39,7 @@ namespace System.Text 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E }; - internal readonly uint UnsignedValue; + private readonly uint _value; /// /// Creates a from the provided UTF-16 code unit. 
@@ -55,7 +55,7 @@ public Rune(char ch) { ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch); } - UnsignedValue = expanded; + _value = expanded; } /// @@ -82,27 +82,27 @@ public Rune(uint value) { ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value); } - UnsignedValue = value; + _value = value; } // non-validating ctor private Rune(uint scalarValue, bool unused) { UnicodeDebug.AssertIsValidScalar(scalarValue); - UnsignedValue = scalarValue; + _value = scalarValue; } - public static bool operator ==(Rune left, Rune right) => (left.UnsignedValue == right.UnsignedValue); + public static bool operator ==(Rune left, Rune right) => (left._value == right._value); - public static bool operator !=(Rune left, Rune right) => (left.UnsignedValue != right.UnsignedValue); + public static bool operator !=(Rune left, Rune right) => (left._value != right._value); - public static bool operator <(Rune left, Rune right) => (left.UnsignedValue < right.UnsignedValue); + public static bool operator <(Rune left, Rune right) => (left._value < right._value); - public static bool operator <=(Rune left, Rune right) => (left.UnsignedValue <= right.UnsignedValue); + public static bool operator <=(Rune left, Rune right) => (left._value <= right._value); - public static bool operator >(Rune left, Rune right) => (left.UnsignedValue > right.UnsignedValue); + public static bool operator >(Rune left, Rune right) => (left._value > right._value); - public static bool operator >=(Rune left, Rune right) => (left.UnsignedValue >= right.UnsignedValue); + public static bool operator >=(Rune left, Rune right) => (left._value >= right._value); // Operators below are explicit because they may throw. 
@@ -114,24 +114,24 @@ private Rune(uint scalarValue, bool unused) public static explicit operator Rune(int value) => new Rune(value); // Displayed as "'' (U+XXXX)"; e.g., "'e' (U+0065)" - private string DebuggerDisplay => FormattableString.Invariant($"U+{UnsignedValue:X4} '{(IsValid(UnsignedValue) ? ToString() : "\uFFFD")}'"); + private string DebuggerDisplay => FormattableString.Invariant($"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'"); /// /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ]) /// and therefore representable by a single UTF-8 code unit. /// - public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(UnsignedValue); + public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(_value); /// /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ]) /// and therefore representable by a single UTF-16 code unit. /// - public bool IsBmp => UnicodeUtility.IsBmpCodePoint(UnsignedValue); + public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value); /// /// Returns the Unicode plane (0 to 16, inclusive) which contains this scalar. /// - public int Plane => UnicodeUtility.GetPlane(UnsignedValue); + public int Plane => UnicodeUtility.GetPlane(_value); /// /// A instance that represents the Unicode replacement character U+FFFD. @@ -145,7 +145,7 @@ private Rune(uint scalarValue, bool unused) /// /// The return value will be 1 or 2. /// - public int Utf16SequenceLength => UnicodeUtility.GetUtf16SequenceLength(UnsignedValue); + public int Utf16SequenceLength => UnicodeUtility.GetUtf16SequenceLength(_value); /// /// Returns the length in code units () of the @@ -154,12 +154,12 @@ private Rune(uint scalarValue, bool unused) /// /// The return value will be 1 through 4, inclusive. 
/// - public int Utf8SequenceLength => UnicodeUtility.GetUtf8SequenceLength(UnsignedValue); + public int Utf8SequenceLength => UnicodeUtility.GetUtf8SequenceLength(_value); /// /// Returns the Unicode scalar value as an integer. /// - public int Value => (int)UnsignedValue; + public int Value => (int)_value; private static Rune ChangeCase(Rune rune, CultureInfo culture, bool toUpper) { @@ -200,7 +200,7 @@ private static Rune ChangeCase(Rune rune, CultureInfo culture, bool toUpper) } } - public int CompareTo(Rune other) => this.UnsignedValue.CompareTo(other.UnsignedValue); + public int CompareTo(Rune other) => this._value.CompareTo(other._value); // returns the number of chars written private int EncodeToUtf16(Span destination) @@ -217,6 +217,25 @@ private int EncodeToUtf16(Span destination) public override int GetHashCode() => Value; + /// + /// Gets the which begins at index in + /// string . + /// + /// + /// Throws if is null, if is out of range, or + /// if does not reference the start of a valid scalar value within . + /// + public static Rune GetRuneAt(string input, int index) + { + int runeValue = ReadRuneFromString(input, index); + if (runeValue < 0) + { + ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index); + } + + return UnsafeCreate((uint)runeValue); + } + /// /// Returns iff is a valid Unicode scalar /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive. @@ -229,7 +248,55 @@ private int EncodeToUtf16(Span destination) /// [CLSCompliant(false)] public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value); - + + // returns a negative number on failure + private static int ReadRuneFromString(string input, int index) + { + if (input is null) + { + ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input); + } + + if ((uint)index >= (uint)input.Length) + { + ThrowHelper.ThrowArgumentOutOfRange_IndexException(); + } + + // Optimistically assume input is within BMP. 
+ + uint returnValue = input[index]; + if (UnicodeUtility.IsSurrogateCodePoint(returnValue)) + { + if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue)) + { + return -1; + } + + // Treat 'returnValue' as the high surrogate. + // + // If this becomes a hot code path, we can skip the below bounds check by reading + // off the end of the string using unsafe code. Since strings are null-terminated, + // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if + // the string terminates unexpectedly. + + index++; + if ((uint)index >= (uint)input.Length) + { + return -1; // not an argument exception - just a "bad data" failure + } + + uint potentialLowSurrogate = input[index]; + if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate)) + { + return -1; + } + + returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate); + } + + return (int)returnValue; + } + /// /// Returns a representation of this instance. /// @@ -298,13 +365,13 @@ public bool TryEncode(Span destination, out int charsWritten) { if (IsBmp) { - destination[0] = (char)UnsignedValue; + destination[0] = (char)_value; charsWritten = 1; return true; } else if (destination.Length >= 2) { - UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(UnsignedValue, out destination[0], out destination[1]); + UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]); charsWritten = 2; return true; } @@ -339,30 +406,30 @@ public bool TryEncodeToUtf8Bytes(Span destination, out int bytesWritten) { if (IsAscii) { - destination[0] = (byte)UnsignedValue; + destination[0] = (byte)_value; bytesWritten = 1; return true; } if (destination.Length >= 2) { - if (UnsignedValue <= 0x7FFu) + if (_value <= 0x7FFu) { // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ] - destination[0] = (byte)((UnsignedValue + (0b110u << 11)) >> 6); - destination[1] = (byte)((UnsignedValue & 0x3Fu) + 0x80u); + 
destination[0] = (byte)((_value + (0b110u << 11)) >> 6); + destination[1] = (byte)((_value & 0x3Fu) + 0x80u); bytesWritten = 2; return true; } if (destination.Length >= 3) { - if (UnsignedValue <= 0xFFFFu) + if (_value <= 0xFFFFu) { // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ] - destination[0] = (byte)((UnsignedValue + (0b1110 << 16)) >> 12); - destination[1] = (byte)(((UnsignedValue & (0x3Fu << 6)) >> 6) + 0x80u); - destination[2] = (byte)((UnsignedValue & 0x3Fu) + 0x80u); + destination[0] = (byte)((_value + (0b1110 << 16)) >> 12); + destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u); + destination[2] = (byte)((_value & 0x3Fu) + 0x80u); bytesWritten = 3; return true; } @@ -370,10 +437,10 @@ public bool TryEncodeToUtf8Bytes(Span destination, out int bytesWritten) if (destination.Length >= 4) { // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ] - destination[0] = (byte)((UnsignedValue + (0b11110 << 21)) >> 18); - destination[1] = (byte)(((UnsignedValue & (0x3Fu << 12)) >> 12) + 0x80u); - destination[2] = (byte)(((UnsignedValue & (0x3Fu << 6)) >> 6) + 0x80u); - destination[3] = (byte)((UnsignedValue & 0x3Fu) + 0x80u); + destination[0] = (byte)((_value + (0b11110 << 21)) >> 18); + destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u); + destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u); + destination[3] = (byte)((_value & 0x3Fu) + 0x80u); bytesWritten = 4; return true; } @@ -386,7 +453,31 @@ public bool TryEncodeToUtf8Bytes(Span destination, out int bytesWritten) bytesWritten = default; return false; } - + + /// + /// Attempts to get the which begins at index in + /// string . + /// + /// if a scalar value was successfully extracted from the specified index, + /// if a value could not be extracted due to invalid data. + /// + /// Throws only if is null or is out of range. 
+ /// + public static bool TryGetRuneAt(string input, int index, out Rune value) + { + int runeValue = ReadRuneFromString(input, index); + if (runeValue >= 0) + { + value = UnsafeCreate((uint)runeValue); + return true; + } + else + { + value = default; + return false; + } + } + // Allows constructing a Unicode scalar value from an arbitrary 32-bit integer without // validation. It is the caller's responsibility to have performed manual validation // before calling this method. If a Rune instance is forcibly constructed @@ -414,7 +505,7 @@ public static double GetNumericValue(Rune value) { if (value.IsAscii) { - uint baseNum = value.UnsignedValue - '0'; + uint baseNum = value._value - '0'; return (baseNum <= 9) ? (double)baseNum : -1; } else @@ -490,14 +581,14 @@ public static bool IsControl(Rune value) // 00..1F (+1) => 01..20 (&~80) => 01..20 // 7F..9F (+1) => 80..A0 (&~80) => 00..20 - return (((value.UnsignedValue + 1) & ~0x80u) <= 0x20u); + return (((value._value + 1) & ~0x80u) <= 0x20u); } public static bool IsDigit(Rune value) { if (value.IsAscii) { - return UnicodeUtility.IsInRangeInclusive(value.UnsignedValue, '0', '9'); + return UnicodeUtility.IsInRangeInclusive(value._value, '0', '9'); } else { @@ -509,7 +600,7 @@ public static bool IsLetter(Rune value) { if (value.IsAscii) { - return (((value.UnsignedValue - 'A') & ~0x20u) <= (uint)('Z' - 'A')); // [A-Za-z] + return (((value._value - 'A') & ~0x20u) <= (uint)('Z' - 'A')); // [A-Za-z] } else { @@ -533,7 +624,7 @@ public static bool IsLower(Rune value) { if (value.IsAscii) { - return UnicodeUtility.IsInRangeInclusive(value.UnsignedValue, 'a', 'z'); + return UnicodeUtility.IsInRangeInclusive(value._value, 'a', 'z'); } else { @@ -545,7 +636,7 @@ public static bool IsNumber(Rune value) { if (value.IsAscii) { - return UnicodeUtility.IsInRangeInclusive(value.UnsignedValue, '0', '9'); + return UnicodeUtility.IsInRangeInclusive(value._value, '0', '9'); } else { @@ -572,7 +663,7 @@ public static bool IsUpper(Rune 
value) { if (value.IsAscii) { - return UnicodeUtility.IsInRangeInclusive(value.UnsignedValue, 'A', 'Z'); + return UnicodeUtility.IsInRangeInclusive(value._value, 'A', 'Z'); } else { @@ -591,7 +682,7 @@ public static bool IsWhiteSpace(Rune value) // instead of a normal separator category. No other code point outside the ASCII range // has this mismatch. - if (value.UnsignedValue == 0x0085u) + if (value._value == 0x0085u) { return true; } @@ -610,7 +701,7 @@ public static Rune ToLowerInvariant(Rune value) { // It's ok for us to use the UTF-16 conversion utility for this since the high // 16 bits of the value will never be set so will be left unchanged. - return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value.UnsignedValue)); + return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value)); } // Non-ASCII data requires going through the case folding tables. @@ -629,7 +720,7 @@ public static Rune ToUpperInvariant(Rune value) { // It's ok for us to use the UTF-16 conversion utility for this since the high // 16 bits of the value will never be set so will be left unchanged. - return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value.UnsignedValue)); + return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value)); } // Non-ASCII data requires going through the case folding tables. 
From 9714422e7211f9175eaf1b33e90a2e2e6f86499e Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Tue, 13 Nov 2018 15:26:43 -0800 Subject: [PATCH 5/5] Doc comment fixup --- src/System.Private.CoreLib/shared/System/Text/Rune.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Text/Rune.cs b/src/System.Private.CoreLib/shared/System/Text/Rune.cs index d046f48cc661..a4ef3a37b731 100644 --- a/src/System.Private.CoreLib/shared/System/Text/Rune.cs +++ b/src/System.Private.CoreLib/shared/System/Text/Rune.cs @@ -62,7 +62,7 @@ public Rune(char ch) /// Creates a from the provided Unicode scalar value. /// /// - /// If does not represent a value Unicode scalar value. + /// If <paramref name="value"/> does not represent a valid Unicode scalar value. /// public Rune(int value) : this((uint)value) @@ -73,7 +73,7 @@ public Rune(int value) /// Creates a from the provided Unicode scalar value. /// /// - /// If does not represent a value Unicode scalar value. + /// If <paramref name="value"/> does not represent a valid Unicode scalar value. /// [CLSCompliant(false)] public Rune(uint value) @@ -489,7 +489,7 @@ public static bool TryGetRuneAt(string input, int index, out Rune value) // // public int GetMarvin32HashCode(Rune r) { // Span buffer = stackalloc char[r.Utf16SequenceLength]; - // s.TryEncode(buffer, ...); + // r.TryEncode(buffer, ...); // return Marvin32.ComputeHash(buffer.AsBytes()); // }