diff --git a/base/char.jl b/base/char.jl index ea7334eb0679e..600c3f6272d55 100644 --- a/base/char.jl +++ b/base/char.jl @@ -1,8 +1,58 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -convert(::Type{Char}, x::UInt32) = reinterpret(Char, x) +struct MalformedCharError <: Exception + char::Char +end +struct CodePointError <: Exception + code::Integer +end +@noinline malformed_char(c::Char) = throw(MalformedCharError(c)) +@noinline code_point_err(u::UInt32) = throw(CodePointError(u)) + +function ismalformed(c::Char) + u = reinterpret(UInt32, c) + l1 = leading_ones(u) << 3 + t0 = trailing_zeros(u) & 56 + (l1 == 8) | (l1 + t0 > 32) | + (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) +end + +function convert(::Type{UInt32}, c::Char) + # TODO: use optimized inline LLVM + u = reinterpret(UInt32, c) + u < 0x80000000 && return reinterpret(UInt32, u >> 24) + l1 = leading_ones(u) + t0 = trailing_zeros(u) & 56 + (l1 == 1) | (8l1 + t0 > 32) | + (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) && + malformed_char(c)::Union{} + u &= 0xffffffff >> l1 + u >>= t0 + (u & 0x0000007f >> 0) | (u & 0x00007f00 >> 2) | + (u & 0x007f0000 >> 4) | (u & 0x7f000000 >> 6) +end + +function convert(::Type{Char}, u::UInt32) + u < 0x80 && return reinterpret(Char, u << 24) + u < 0x00200000 || code_point_err(u)::Union{} + c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) | + ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000) + c = u < 0x00000800 ? (c << 16) | 0xc0800000 : + u < 0x00010000 ? (c << 08) | 0xe0808000 : + (c << 00) | 0xf0808080 + reinterpret(Char, c) +end + +function convert(::Type{T}, c::Char) where T <: Union{Int8,UInt8} + i = reinterpret(Int32, c) + i ≥ 0 ? ((i >>> 24) % T) : T(UInt32(c)) +end + +function convert(::Type{Char}, b::Union{Int8,UInt8}) + 0 ≤ b ≤ 0x7f ? 
reinterpret(Char, (b % UInt32) << 24) : Char(UInt32(b)) +end + convert(::Type{Char}, x::Number) = Char(UInt32(x)) -convert(::Type{UInt32}, x::Char) = reinterpret(UInt32, x) convert(::Type{T}, x::Char) where {T<:Number} = convert(T, UInt32(x)) rem(x::Char, ::Type{T}) where {T<:Number} = rem(UInt32(x), T) @@ -29,11 +79,9 @@ done(c::Char, state) = state isempty(c::Char) = false in(x::Char, y::Char) = x == y -==(x::Char, y::Char) = UInt32(x) == UInt32(y) -isless(x::Char, y::Char) = UInt32(x) < UInt32(y) - -const hashchar_seed = 0xd4d64234 -hash(x::Char, h::UInt) = hash_uint64(((UInt64(x)+hashchar_seed)<<32) ⊻ UInt64(h)) +==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y) +isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y) +hash(x::Char, h::UInt) = hash(reinterpret(UInt32, x), hash(Char, h)) -(x::Char, y::Char) = Int(x) - Int(y) -(x::Char, y::Integer) = Char(Int32(x) - Int32(y)) @@ -66,7 +114,7 @@ function show(io::IO, c::Char) end if isprint(c) write(io, 0x27, c, 0x27) - else + elseif !ismalformed(c) u = UInt32(c) write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55) d = max(2, 8 - (leading_zeros(u) >> 2)) @@ -74,13 +122,29 @@ function show(io::IO, c::Char) write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1]) end write(io, 0x27) + else # malformed + write(io, 0x27) + u = reinterpret(UInt32, c) + while true + a = hex_chars[((u >> 28) & 0xf) + 1] + b = hex_chars[((u >> 24) & 0xf) + 1] + write(io, 0x5c, 'x', a, b) + (u <<= 8) == 0 && break + end + write(io, 0x27) end return end function show(io::IO, ::MIME"text/plain", c::Char) show(io, c) - u = UInt32(c) - print(io, ": ", isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4)) - print(io, " (category ", UTF8proc.category_abbrev(c), ": ", UTF8proc.category_string(c), ")") + if !ismalformed(c) + u = UInt32(c) + print(io, ": ", isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 
6 : 4)) + else + print(io, ": Malformed UTF-8") + end + abr = UTF8proc.category_abbrev(c) + str = UTF8proc.category_string(c) + print(io, " (category ", abr, ": ", str, ")") end diff --git a/base/filesystem.jl b/base/filesystem.jl index c5f8e4b10854d..6268d1d420752 100644 --- a/base/filesystem.jl +++ b/base/filesystem.jl @@ -149,6 +149,26 @@ function read(f::File, ::Type{UInt8}) return ret % UInt8 end +function read(f::File, ::Type{Char}) + b0 = read(f, UInt8) + l = 8(4-leading_ones(b0)) + c = UInt32(b0) << 24 + if 0 ≤ l < 24 # l < 0 means an invalid lead byte (5+ leading ones): read no continuation bytes + s = 16 + while s ≥ l && !eof(f) + p = position(f) + b = read(f, UInt8) + if b & 0xc0 != 0x80 + seek(f, p) + break + end + c |= UInt32(b) << s + s -= 8 + end + end + return reinterpret(Char, c) +end + function unsafe_read(f::File, p::Ptr{UInt8}, nel::UInt) check_open(f) ret = ccall(:jl_fs_read, Int32, (Int32, Ptr{Void}, Csize_t), diff --git a/base/intfuncs.jl b/base/intfuncs.jl index abc1fd95b3e6a..76b45f90cf4e8 100644 --- a/base/intfuncs.jl +++ b/base/intfuncs.jl @@ -654,8 +654,8 @@ for sym in (:bin, :oct, :dec, :hex) @eval begin ($sym)(x::Unsigned, p::Int) = ($sym)(x,p,false) ($sym)(x::Unsigned) = ($sym)(x,1,false) - ($sym)(x::Char, p::Int) = ($sym)(unsigned(x),p,false) - ($sym)(x::Char) = ($sym)(unsigned(x),1,false) + ($sym)(x::Char, p::Int) = ($sym)(UInt32(x),p,false) + ($sym)(x::Char) = ($sym)(UInt32(x),1,false) ($sym)(x::Integer, p::Int) = ($sym)(unsigned(abs(x)),p,x<0) ($sym)(x::Integer) = ($sym)(unsigned(abs(x)),1,x<0) end diff --git a/base/io.jl b/base/io.jl index 4d7f745b126e5..030cd9c5698e0 100644 --- a/base/io.jl +++ b/base/io.jl @@ -432,25 +432,13 @@ function write(s::IO, a::SubArray{T,N,<:Array}) where {T,N} end end - -function write(s::IO, ch::Char) - c = reinterpret(UInt32, ch) - if c < 0x80 - return write(s, c%UInt8) - elseif c < 0x800 - return (write(s, (( c >> 6 ) | 0xC0)%UInt8)) + - (write(s, (( c & 0x3F ) | 0x80)%UInt8)) - elseif c < 0x10000 - return (write(s, (( c >> 12 ) | 0xE0)%UInt8)) + - (write(s, (((c >> 6) & 0x3F ) |
0x80)%UInt8)) + - (write(s, (( c & 0x3F ) | 0x80)%UInt8)) - elseif c < 0x110000 - return (write(s, (( c >> 18 ) | 0xF0)%UInt8)) + - (write(s, (((c >> 12) & 0x3F ) | 0x80)%UInt8)) + - (write(s, (((c >> 6) & 0x3F ) | 0x80)%UInt8)) + - (write(s, (( c & 0x3F ) | 0x80)%UInt8)) - else - return write(s, '\ufffd') +function write(io::IO, c::Char) + u = bswap(reinterpret(UInt32, c)) + n = 1 + while true + write(io, u % UInt8) + (u >>= 8) == 0 && return n + n += 1 end end @@ -493,23 +481,20 @@ function read!(s::IO, a::Array{T}) where T return a end -function read(s::IO, ::Type{Char}) - ch = read(s, UInt8) - if ch < 0x80 - return Char(ch) - end - - # mimic utf8.next function - trailing = Base.utf8_trailing[ch+1] - c::UInt32 = 0 - for j = 1:trailing - c += ch - c <<= 6 - ch = read(s, UInt8) +function read(io::IO, ::Type{Char}) + b0 = read(io, UInt8) + l = 8(4-leading_ones(b0)) + c = UInt32(b0) << 24 + if 0 ≤ l < 24 # l < 0 means an invalid lead byte (5+ leading ones): read no continuation bytes + s = 16 + while s ≥ l && !eof(io) + peek(io) & 0xc0 == 0x80 || break + b = read(io, UInt8) + c |= UInt32(b) << s + s -= 8 + end end - c += ch - c -= Base.utf8_offset[trailing+1] - return Char(c) + return reinterpret(Char, c) end # readuntil_string is useful below since it has @@ -517,7 +502,7 @@ end readuntil_string(s::IO, delim::UInt8) = String(readuntil(s, delim)) function readuntil(s::IO, delim::Char) - if delim < Char(0x80) + if delim ≤ '\x7f' return readuntil_string(s, delim % UInt8) end out = IOBuffer() @@ -598,7 +583,7 @@ function readuntil(io::IO, target::AbstractString) i = start(target) done(target, i) && return "" c, i = next(target, start(target)) - if done(target, i) && c < Char(0x80) + if done(target, i) && c <= '\x7f' return readuntil_string(io, c % UInt8) end # decide how we can index target @@ -625,14 +610,13 @@ function readuntil(io::IO, target::AbstractVector{T}) where T return out end - """ readchomp(x) -Read the entirety of `x` as a string and remove a single trailing newline. -Equivalent to `chomp!(read(x, String))`.
+Read the entirety of `x` as a string and remove a single trailing newline +if there is one. Equivalent to `chomp(read(x, String))`. """ -readchomp(x) = chomp!(read(x, String)) +readchomp(x) = chomp(read(x, String)) # read up to nb bytes into nb, returning # bytes read diff --git a/base/iostream.jl b/base/iostream.jl index 117bf77e7f8a6..347b86ca10f34 100644 --- a/base/iostream.jl +++ b/base/iostream.jl @@ -315,12 +315,13 @@ end ## low-level calls ## -write(s::IOStream, b::UInt8) = Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios)) +function write(s::IOStream, b::UInt8) + iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable")) + Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios)) +end function unsafe_write(s::IOStream, p::Ptr{UInt8}, nb::UInt) - if !iswritable(s) - throw(ArgumentError("write failed, IOStream is not writeable")) - end + iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable")) return Int(ccall(:ios_write, Csize_t, (Ptr{Void}, Ptr{Void}, Csize_t), s.ios, p, nb)) end @@ -353,14 +354,6 @@ end ## text I/O ## -function write(s::IOStream, c::Char) - if !iswritable(s) - throw(ArgumentError("write failed, IOStream is not writeable")) - end - Int(ccall(:ios_pututf8, Cint, (Ptr{Void}, UInt32), s.ios, c)) -end -read(s::IOStream, ::Type{Char}) = Char(ccall(:jl_getutf8, UInt32, (Ptr{Void},), s.ios)) - take!(s::IOStream) = ccall(:jl_take_buffer, Vector{UInt8}, (Ptr{Void},), s.ios) @@ -452,14 +445,23 @@ function read(s::IOStream, nb::Integer; all::Bool=true) end ## Character streams ## -const _chtmp = Ref{Char}() + function peekchar(s::IOStream) - if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{Char}), s, _chtmp) < 0 + chref = Ref{UInt32}() + if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{UInt32}), s, chref) < 0 return typemax(Char) end - return _chtmp[] + return Char(chref[]) end function peek(s::IOStream) ccall(:ios_peekc, Cint, (Ptr{Void},), s) end + +function peek(s::IO) + mark(s) + try read(s, UInt8) + 
finally + reset(s) + end +end diff --git a/base/parse.jl b/base/parse.jl index 87447ba0a0a90..086cf86e46515 100644 --- a/base/parse.jl +++ b/base/parse.jl @@ -224,12 +224,12 @@ end ## string to float functions ## tryparse(::Type{Float64}, s::String) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s)) -tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof) +tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits) tryparse_internal(::Type{Float64}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1) tryparse_internal(::Type{Float64}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1) tryparse(::Type{Float32}, s::String) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s)) -tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof) +tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits) tryparse_internal(::Type{Float32}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1) tryparse_internal(::Type{Float32}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1) diff --git a/base/regex.jl b/base/regex.jl index 344730007d7ec..0d2ecc935297c 100644 --- a/base/regex.jl +++ 
b/base/regex.jl @@ -303,8 +303,12 @@ struct SubstitutionString{T<:AbstractString} <: AbstractString string::T end -endof(s::SubstitutionString) = endof(s.string) -next(s::SubstitutionString, idx::Int) = next(s.string, idx) +ncodeunits(s::SubstitutionString) = ncodeunits(s.string) +codeunit(s::SubstitutionString) = codeunit(s.string) +codeunit(s::SubstitutionString, i::Integer) = codeunit(s.string, i) +isvalid(s::SubstitutionString, i::Integer) = isvalid(s.string, i) +next(s::SubstitutionString, i::Integer) = next(s.string, i) + function show(io::IO, s::SubstitutionString) print(io, "s") show(io, s.string) diff --git a/base/repl/REPLCompletions.jl b/base/repl/REPLCompletions.jl index 3e5056d613f26..2c4ba328093fa 100644 --- a/base/repl/REPLCompletions.jl +++ b/base/repl/REPLCompletions.jl @@ -106,7 +106,7 @@ const sorted_keywords = [ "primitive type", "quote", "return", "struct", "true", "try", "using", "while"] -function complete_keyword(s::String) +function complete_keyword(s::Union{String,SubString{String}}) r = searchsorted(sorted_keywords, s) i = first(r) n = length(sorted_keywords) diff --git a/base/stream.jl b/base/stream.jl index 4cf2d753f67ef..ab06e16f64913 100644 --- a/base/stream.jl +++ b/base/stream.jl @@ -1148,6 +1148,14 @@ unmark(x::LibuvStream) = unmark(x.buffer) reset(x::LibuvStream) = reset(x.buffer) ismarked(x::LibuvStream) = ismarked(x.buffer) +function peek(s::LibuvStream) + mark(s) + try read(s, UInt8) + finally + reset(s) + end +end + # BufferStream's are non-OS streams, backed by a regular IOBuffer mutable struct BufferStream <: LibuvStream buffer::IOBuffer diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 734f1cc6f9041..2d21a7ad5d609 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -1,57 +1,188 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -## core string functions ## +""" +The `AbstractString` type is the supertype of all string implementations in +Julia. 
Strings are encodings of sequences of [Unicode](https://unicode.org/) +code points as represented by the `Char` type. Julia makes a few assumptions +about strings: -endof(s::AbstractString) = error("you must implement endof(", typeof(s), ")") -next(s::AbstractString, i::Int) = error("you must implement next(", typeof(s), ",Int)") -next(s::AbstractString, i::Integer) = next(s,Int(i)) +* Strings are encoded in terms of fixed-size "code units" + * Code units can be extracted with `codeunit(s, i)` + * The first code unit has index `1` + * The last code unit has index `ncodeunits(s)` + * Any index `i` such that `1 ≤ i ≤ ncodeunits(s)` is in bounds +* String indexing is done in terms of these code units: + * Characters are extracted by `s[i]` with a valid string index `i` + * Each `Char` in a string is encoded by one or more code units + * Only the index of the first code unit of a `Char` is a valid index + * The encoding of a `Char` is independent of what precedes or follows it + * String encodings are "self-synchronizing" – i.e. `isvalid(s,i)` is O(1) -string() = "" -string(s::AbstractString) = s +Some string functions error if you use an out-of-bounds or invalid string index, +including code unit extraction `codeunit(s,i)`, string indexing `s[i]`, and +string iteration `next(s,i)`. Other string functions take a more relaxed +approach to indexing and give you the closest valid string index when in-bounds, +or when out-of-bounds, behave as if there were an infinite number of characters +padding each side of the string. Usually these imaginary padding characters have +code unit length `1`, but string types may choose different sizes. Relaxed +indexing functions include those intended for index arithmetic: `thisind`, +`nextind` and `prevind`. This model allows index arithmetic to work with +out-of-bounds indices as intermediate values so long as one never uses them to retrieve +a character, which often helps avoid needing to code around edge cases.
-(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s)) -(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s) -(::Type{Vector{Char}})(s::AbstractString) = collect(s) +See also: `codeunit`, `ncodeunits`, `thisind`, `nextind`, `prevind` +""" +AbstractString -Symbol(s::AbstractString) = Symbol(String(s)) +## required string functions ## -# string types are convertible -convert(::Type{T}, s::T) where {T<:AbstractString} = s -convert(::Type{T}, s::AbstractString) where {T<:AbstractString} = T(s) +""" + ncodeunits(s::AbstractString) -> Int -## generic supplied functions ## +Return the number of code units in a string. Indices that are in bounds to +access this string must satisfy `1 ≤ i ≤ ncodeunits(s)`. Not all such indices +are valid – they may not be the start of a character, but they will return a +code unit value when calling `codeunit(s,i)`. -start(s::AbstractString) = 1 -done(s::AbstractString,i) = (i > endof(s)) -getindex(s::AbstractString, i::Int) = next(s,i)[1] -getindex(s::AbstractString, i::Integer) = s[Int(i)] -getindex(s::AbstractString, i::Colon) = s -getindex(s::AbstractString, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))] -# TODO: handle other ranges with stride ±1 specially? -getindex(s::AbstractString, v::AbstractVector{<:Integer}) = - sprint(length(v), io->(for i in v; write(io,s[i]) end)) -getindex(s::AbstractString, v::AbstractVector{Bool}) = - throw(ArgumentError("logical indexing not supported for strings")) +See also: `codeunit`, `checkbounds`, `sizeof`, `length`, `endof` +""" +ncodeunits(s::AbstractString) -get(s::AbstractString, i::Integer, default) = isvalid(s,i) ? s[i] : default +""" + codeunit(s::AbstractString) -> Type{<:Union{UInt8, UInt16, UInt32}} + +Return the code unit type of the given string object. For ASCII, Latin-1, or +UTF-8 encoded strings, this would be `UInt8`; for UCS-2 and UTF-16 it would be +`UInt16`; for UTF-32 it would be `UInt32`. 
The code unit type need not be +limited to these three types, but it's hard to think of widely used string +encodings that don't use one of these units. `codeunit(s)` is the same as +`typeof(codeunit(s,1))` when `s` is a non-empty string. +See also: `ncodeunits` """ - sizeof(s::AbstractString) +codeunit(s::AbstractString) + +""" + codeunit(s::AbstractString, i::Integer) -> Union{UInt8, UInt16, UInt32} + +Return the code unit value in the string `s` at index `i`. Note that -The number of bytes in string `s`. + codeunit(s, i) :: codeunit(s) + +I.e. the value returned by `codeunit(s, i)` is of the type returned by +`codeunit(s)`. + +See also: `ncodeunits`, `checkbounds` +""" +codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ? + throw(MethodError(codeunit, Tuple{typeof(s),Int})) : + codeunit(s, Int(i)) + +""" + isvalid(s::AbstractString, i::Integer) -> Bool + +Predicate indicating whether the given index is the start of the encoding of +a character in `s` or not. If `isvalid(s, i)` is true then `s[i]` will return +the character whose encoding starts at that index; if it's false, then `s[i]` +will raise an invalid index error. Behavior of `next(s, i)` is similar except +that the character is returned along with the index of the following character. +In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must +be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code); +this is a basic assumption of Julia's generic string support. + +See also: `getindex`, `next`, `thisind`, `nextind`, `prevind`, `length` # Examples + ```jldoctest -julia> sizeof("❤") -3 +julia> str = "αβγdef"; + +julia> isvalid(str, 1) +true + +julia> str[1] +'α': Unicode U+03b1 (category Ll: Letter, lowercase) + +julia> isvalid(str, 2) +false + +julia> str[2] +ERROR: UnicodeError: invalid character index +Stacktrace: +[...]
``` """ -sizeof(s::AbstractString) = error("type $(typeof(s)) has no canonical binary representation") +isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ? + throw(MethodError(isvalid, Tuple{typeof(s),Int})) : + isvalid(s, Int(i)) + +""" + next(s::AbstractString, i::Integer) -> Tuple{Char, Int} + +Return a tuple of the character in `s` at index `i` with the index of the start +of the following character in `s`. This is the key method that allows strings to +be iterated, yielding a sequence of characters. If `i` is out of bounds in `s` +then a bounds error is raised; if `i` is not a valid character index in `s` then +a Unicode index error is raised. + +See also: `getindex`, `start`, `done`, `checkbounds` +""" +next(s::AbstractString, i::Integer) = typeof(i) === Int ? + throw(MethodError(next, Tuple{typeof(s),Int})) : + next(s, Int(i)) + +## basic generic definitions ## +start(s::AbstractString) = 1 +done(s::AbstractString, i::Integer) = i > ncodeunits(s) eltype(::Type{<:AbstractString}) = Char +sizeof(s::AbstractString) = ncodeunits(s) * sizeof(codeunit(s)) +endof(s::AbstractString) = thisind(s, ncodeunits(s)) + +getindex(s::AbstractString, i::Integer) = next(s, i)[1] +getindex(s::AbstractString, i::Colon) = s +# TODO: handle other ranges with stride ±1 specially? +getindex(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, r) +getindex(s::AbstractString, v::AbstractVector{<:Integer}) = + sprint(length(v), io->(for i in v; write(io, s[i]) end)) +getindex(s::AbstractString, v::AbstractVector{Bool}) = + throw(ArgumentError("logical indexing not supported for strings")) + +get(s::AbstractString, i::Integer, default) = checkbounds(Bool, s, i) ?
s[i] : default + +## bounds checking ## + +checkbounds(::Type{Bool}, s::AbstractString, i::Integer) = + 1 ≤ i ≤ ncodeunits(s) +checkbounds(::Type{Bool}, s::AbstractString, r::AbstractRange{<:Integer}) = + isempty(r) || (1 ≤ minimum(r) && maximum(r) ≤ ncodeunits(s)) +checkbounds(::Type{Bool}, s::AbstractString, I::AbstractArray{<:Real}) = + all(i -> checkbounds(Bool, s, i), I) # non-throwing form: report false, never raise +checkbounds(::Type{Bool}, s::AbstractString, I::AbstractArray{<:Integer}) = + all(i -> checkbounds(Bool, s, i), I) # non-throwing form: report false, never raise +checkbounds(s::AbstractString, I::Union{Integer,AbstractArray}) = + checkbounds(Bool, s, I) || throw(BoundsError(s, I)) + +## construction, conversion, promotion ## + +string() = "" +string(s::AbstractString) = s + +(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s)) +(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s) +(::Type{Vector{Char}})(s::AbstractString) = collect(s) + +Symbol(s::AbstractString) = Symbol(String(s)) + +convert(::Type{T}, s::T) where {T<:AbstractString} = s +convert(::Type{T}, s::AbstractString) where {T<:AbstractString} = T(s) + +promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String + +## string & character concatenation ## """ - *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...) + *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...) -> String Concatenate strings and/or characters, producing a [`String`](@ref). This is equivalent to calling the [`string`](@ref) function on the arguments. @@ -69,49 +200,16 @@ julia> 'j' * "ulia" one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "") -# generic number of code units; implementations generally know how long a string -# is though and should override this with a more efficient method -ncodeunits(s::AbstractString) = nextind(s, endof(s)) - 1 +## generic string comparison ## """ - length(s::AbstractString) + cmp(a::AbstractString, b::AbstractString) -> Int -The number of characters in string `s`.
- -# Examples -```jldoctest -julia> length("jμΛIα") -5 -``` -""" -function length(s::AbstractString) - i = start(s) - if done(s,i) - return 0 - end - n = 1 - while true - c, j = next(s,i) - if done(s,j) - return n - end - n += 1 - i = j - end -end - -## string comparison functions ## -""" - cmp(a::AbstractString, b::AbstractString) - -Compare two strings for equality. - -Return `0` if both strings have the same length and the character -at each index is the same in both strings. -Return `-1` if `a` is a substring of `b`, or if `a` comes before -`b` in alphabetical order. -Return `1` if `b` is a substring of `a`, or if `b` comes before -`a` in alphabetical order. +Compare two strings for equality. Return `0` if both strings have the same +length and the character at each index is the same in both strings. Return `-1` +if `a` is a substring of `b`, or if `a` comes before `b` in alphabetical order. +Return `1` if `b` is a substring of `a`, or if `b` comes before `a` in +alphabetical order (technically, lexicographical order by Unicode code points). # Examples ```jldoctest @@ -138,28 +236,23 @@ julia> cmp("b", "β") ``` """ function cmp(a::AbstractString, b::AbstractString) - if a === b - return 0 - end + a === b && return 0 i = start(a) j = start(b) - while !done(a,i) - if done(b,j) - return +1 - end - c, i = next(a,i) - d, j = next(b,j) - if c != d - return c < d ? -1 : +1 - end + while !done(a, i) + done(b, j) && return 1 + c, i = next(a, i) + d, j = next(b, j) + c ≠ d && return ifelse(c < d, -1, 1) end - done(b,j) ? 0 : -1 + return ifelse(done(b, j), 0, -1) end """ - ==(a::AbstractString, b::AbstractString) + ==(a::AbstractString, b::AbstractString) -> Bool -Test whether two strings are equal character by character. +Test whether two strings are equal character by character (technically, Unicode +code point by code point). 
# Examples ```jldoctest @@ -170,12 +263,13 @@ julia> "abc" == "αβγ" false ``` """ -==(a::AbstractString, b::AbstractString) = cmp(a,b) == 0 +==(a::AbstractString, b::AbstractString) = cmp(a, b) == 0 """ - isless(a::AbstractString, b::AbstractString) + isless(a::AbstractString, b::AbstractString) -> Bool -Test whether string `a` comes before string `b` in alphabetical order. +Test whether string `a` comes before string `b` in alphabetical order +(technically, in lexicographical order by Unicode code points). # Examples ```jldoctest @@ -189,64 +283,58 @@ julia> isless("a", "a") false ``` """ -isless(a::AbstractString, b::AbstractString) = cmp(a,b) < 0 +isless(a::AbstractString, b::AbstractString) = cmp(a, b) < 0 # faster comparisons for symbols cmp(a::Symbol, b::Symbol) = Int(sign(ccall(:strcmp, Int32, (Cstring, Cstring), a, b))) -isless(a::Symbol, b::Symbol) = cmp(a,b) < 0 +isless(a::Symbol, b::Symbol) = cmp(a, b) < 0 -## Generic validation functions ## +## character index arithmetic ## """ - isvalid(str::AbstractString, i::Integer) + length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s)) -> Integer + +The number of characters in string `s` from indices `lo` through `hi`. This is +computed as the number of code unit indices from `lo` to `hi` which are valid +character indices. With only a single string argument, this computes the +number of characters in the entire string. If `lo` or `hi` are out of range, +each out-of-range code unit is considered to be one character. This matches the +"loose" indexing model of `thisind`, `nextind` and `prevind`. -Tell whether index `i` is valid for the given string. +See also: `isvalid`, `ncodeunits`, `endof`, `thisind`, `nextind`, `prevind` # Examples ```jldoctest -julia> str = "αβγdef"; - -julia> isvalid(str, 1) -true - -julia> str[1] -'α': Unicode U+03b1 (category Ll: Letter, lowercase) - -julia> isvalid(str, 2) -false - -julia> str[2] -ERROR: UnicodeError: invalid character index -Stacktrace: -[...]
+julia> length("jμΛIα") +5 ``` """ -function isvalid(s::AbstractString, i::Integer) - i < 1 && return false - done(s,i) && return false - try - next(s,i) - true - catch - false +function length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s)) + z = ncodeunits(s) + a = Int(max(1, min(z, lo))) + b = Int(min(z, max(1, hi))) + n = a - b + for i = a:b + n += isvalid(s, i) end + return n + hi - lo end -## Generic indexing functions ## - """ - thisind(s::AbstractString, i::Integer) + thisind(s::AbstractString, i::Integer) -> Int -If `i` is the index into a character in `s` then `thisind` returns the index of the -start of that character. If `i < start(s)` then it returns `start(s) - 1`. -If `i > ncodeunits(s)` then it returns `ncodeunits(s) + 1`. +If `i` is in bounds in `s` return the index of the start of the character whose +encoding code unit `i` is part of. In other words, if `i` is the start of a +character, return `i`; if `i` is not the start of a character, rewind until the +start of a character and return that index. If `i` is out of bounds in `s` +return `i`. # Examples ```jldoctest julia> thisind("αβγdef", -5) -0 +-5 julia> thisind("αβγdef", 1) 1 @@ -264,23 +352,24 @@ julia> thisind("αβγdef", 10) 10 julia> thisind("αβγdef", 20) -10 +20 """ function thisind(s::AbstractString, i::Integer) - j = Int(i) - isvalid(s, j) && return j - j < start(s) && return 0 - n = ncodeunits(s) - j > n && return n + 1 - prevind(s, j) + i ≤ ncodeunits(s) || return i + @inbounds while 1 < i && !isvalid(s, i) + i -= 1 + end + return i end """ - prevind(str::AbstractString, i::Integer, nchar::Integer=1) + prevind(str::AbstractString, i::Integer, n::Integer=1) -> Int -Get the previous valid string index before `i`. -Returns a value less than `1` at the beginning of the string. -If the `nchar` argument is given the function goes back `nchar` characters. +If `i` is in bounds in `s` return the index of the start of the character whose +encoding starts before index `i`. 
In other words, if `i` is the start of a +character, return the start of the previous character; if `i` is not the start +of a character, rewind until the start of a character and return that index. +If `i` is out of bounds in `s` return `i - 1`. If `n == 0` return `i`. # Examples ```jldoctest @@ -290,51 +379,32 @@ julia> prevind("αβγdef", 3) julia> prevind("αβγdef", 1) 0 +julia> prevind("αβγdef", 0) +-1 + julia> prevind("αβγdef", 3, 2) 0 ``` """ -function prevind(s::AbstractString, i::Integer) - e = endof(s) - if i > e - return e +function prevind(s::AbstractString, i::Integer, n::Integer=1) + n < 0 && throw(ArgumentError("n cannot be negative: $n")) + z = ncodeunits(s) + 1 + if i > z + n -= i - z + i = z end - j = Int(i)-1 - while j >= 1 - if isvalid(s,j) - return j - end - j -= 1 - end - return 0 # out of range -end - -function prevind(s::AbstractString, i::Integer, nchar::Integer) - nchar > 0 || throw(ArgumentError("nchar must be greater than 0")) - e = endof(s) - j = Int(i) - j < 1 && return 0 - while nchar > 0 - if j > e - j = e - else - j -= 1 - while j >= 1 && !isvalid(s,j) - j -= 1 - end - end - j < 1 && return 0 - nchar -= 1 + while n > 0 && 1 < i + @inbounds n -= isvalid(s, i -= 1) end - j + return i - n end """ - nextind(str::AbstractString, i::Integer, nchar::Integer=1) + nextind(str::AbstractString, i::Integer, n::Integer=1) -> Int -Get the next valid string index after `i`. -Returns a value greater than `endof(str)` at or after the end of the string. -If the `nchar` argument is given the function goes forward `nchar` characters. +If `i` is in bounds in `s` return the index of the start of the character whose +encoding starts after index `i`. If `i` is out of bounds in `s` return `i + 1`. +If `n == 0` return `i`. 
# Examples ```jldoctest @@ -353,48 +423,19 @@ julia> nextind(str, 9) 10 ``` """ -function nextind(s::AbstractString, i::Integer) - e = endof(s) +function nextind(s::AbstractString, i::Integer, n::Integer=1) + n < 0 && throw(ArgumentError("n cannot be negative: $n")) if i < 1 - return 1 + n += i - 1 + i = 1 end - if i > e - return Int(i)+1 + z = ncodeunits(s) + while n > 0 && i < z + @inbounds n -= isvalid(s, i += 1) end - for j = Int(i)+1:e - if isvalid(s,j) - return j - end - end - next(s,e)[2] # out of range + return i + n end -function nextind(s::AbstractString, i::Integer, nchar::Integer) - nchar > 0 || throw(ArgumentError("nchar must be greater than 0")) - e = endof(s) - j = Int(i) - while nchar > 0 - if j < 1 - j = 1 - else - j > e && return j + nchar - j == e && return next(s,e)[2] + nchar - 1 - for outer j = j+1:e - isvalid(s,j) && break - end - end - nchar -= 1 - end - j -end - -checkbounds(s::AbstractString, i::Integer) = start(s) <= i <= endof(s) || throw(BoundsError(s, i)) -checkbounds(s::AbstractString, r::AbstractRange{<:Integer}) = isempty(r) || (minimum(r) >= start(s) && maximum(r) <= endof(s)) || throw(BoundsError(s, r)) -# The following will end up using a deprecated checkbounds, when the covariant parameter is not Integer -checkbounds(s::AbstractString, I::AbstractArray{<:Real}) = all(i -> checkbounds(s, i), I) -checkbounds(s::AbstractString, I::AbstractArray{<:Integer}) = all(i -> checkbounds(s, i), I) - - """ ind2chr(s::AbstractString, i::Integer) @@ -414,10 +455,7 @@ julia> chr2ind(str, 2) 3 ``` """ -function ind2chr(s::AbstractString, i::Integer) - s[i] # throws error if invalid - unsafe_ind2chr(s, i) -end +ind2chr(s::AbstractString, i::Integer) = length(s, 1, i) """ chr2ind(s::AbstractString, i::Integer) @@ -437,26 +475,10 @@ julia> ind2chr(str, 3) 2 ``` """ -function chr2ind(s::AbstractString, i::Integer) - i < start(s) && throw(BoundsError(s, i)) - k = unsafe_chr2ind(s, i) - s[k] # throws error if invalid - k -end - -function 
map_chr_ind(s::AbstractString, i::Integer, stop, ret) - j = 1 - k = start(s) - while true - i == stop((j, k)) && return ret((j, k)) # k could point after the last character - _, k = next(s, k) - j += 1 - end -end - -unsafe_ind2chr(s::AbstractString, i::Integer) = map_chr_ind(s, i, last, first) -unsafe_chr2ind(s::AbstractString, i::Integer) = map_chr_ind(s, i, first, last) +chr2ind(s::AbstractString, n::Integer) = + n < 0 ? prevind(s, 0, -n) : nextind(s, 0, n) +## string index iteration type ## struct EachStringIndex{T<:AbstractString} s::T @@ -490,13 +512,9 @@ julia> isascii("αβγ") false ``` """ -isascii(c::Char) = c < Char(0x80) +isascii(c::Char) = reinterpret(Int32, c) ≥ 0 isascii(s::AbstractString) = all(isascii, s) -## string promotion rules ## - -promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String - """ isxdigit(c::Char) -> Bool @@ -512,12 +530,12 @@ julia> isxdigit('x') false ``` """ -isxdigit(c::Char) = '0'<=c<='9' || 'a'<=c<='f' || 'A'<=c<='F' +isxdigit(c::Char) = '0' ≤ c ≤ '9' || 'a' ≤ c ≤ 'f' || 'A' ≤ c ≤ 'F' ## uppercase, lowercase, and titlecase transformations ## """ - uppercase(s::AbstractString) + uppercase(s::AbstractString) -> String Return `s` with all characters converted to uppercase. @@ -530,7 +548,7 @@ julia> uppercase("Julia") uppercase(s::AbstractString) = map(uppercase, s) """ - lowercase(s::AbstractString) + lowercase(s::AbstractString) -> String Return `s` with all characters converted to lowercase. @@ -543,7 +561,7 @@ julia> lowercase("STRINGS AND THINGS") lowercase(s::AbstractString) = map(lowercase, s) """ - titlecase(s::AbstractString) + titlecase(s::AbstractString) -> String Capitalize the first character of each word in `s`. See also [`ucfirst`](@ref) to capitalize only the first @@ -551,7 +569,7 @@ character in `s`. 
# Examples ```jldoctest -julia> titlecase("the julia programming language") +julia> titlecase("the Julia programming language") "The Julia Programming Language" ``` """ @@ -571,12 +589,13 @@ function titlecase(s::AbstractString) end """ - ucfirst(s::AbstractString) + ucfirst(s::AbstractString) -> String + +Return `s` with the first character converted to uppercase (technically "title +case" for Unicode). See also [`titlecase`](@ref) to capitalize the first +character of every word in `s`. -Return `string` with the first character converted to uppercase -(technically "title case" for Unicode). -See also [`titlecase`](@ref) to capitalize the first character of -every word in `s`. +See also: `lcfirst`, `uppercase`, `lowercase`, `titlecase` # Examples ```jldoctest @@ -585,16 +604,19 @@ julia> ucfirst("python") ``` """ function ucfirst(s::AbstractString) - isempty(s) && return s + isempty(s) && return "" c = s[1] - tc = titlecase(c) - return c==tc ? s : string(tc,s[nextind(s,1):end]) + c′ = titlecase(c) + c == c′ ? convert(String, s) : + string(c′, SubString(s, nextind(s, 1))) end """ lcfirst(s::AbstractString) -Return `string` with the first character converted to lowercase. +Return `s` with the first character converted to lowercase. + +See also: `ucfirst`, `uppercase`, `lowercase`, `titlecase` # Examples ```jldoctest @@ -603,31 +625,33 @@ julia> lcfirst("Julia") ``` """ function lcfirst(s::AbstractString) - isempty(s) || islower(s[1]) ? s : string(lowercase(s[1]),s[nextind(s,1):end]) + isempty(s) && return "" + c = s[1] + c′ = lowercase(c) + c == c′ ? 
convert(String, s) : + string(c′, SubString(s, nextind(s, 1))) end ## string map, filter, has ## function map(f, s::AbstractString) - out = IOBuffer(StringVector(endof(s)),true,true) - truncate(out,0) + out = IOBuffer(StringVector(endof(s)), true, true) + truncate(out, 0) for c in s - c2 = f(c) - if !isa(c2,Char) - throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead")) - end - write(out, c2::Char) + c′ = f(c) + isa(c′, Char) || throw(ArgumentError( + "map(f, s::AbstractString) requires f to return Char; " * + "try map(f, collect(s)) or a comprehension instead")) + write(out, c′::Char) end String(take!(out)) end function filter(f, s::AbstractString) - out = IOBuffer(StringVector(endof(s)),true,true) - truncate(out,0) + out = IOBuffer(StringVector(endof(s)), true, true) + truncate(out, 0) for c in s - if f(c) - write(out, c) - end + f(c) && write(out, c) end String(take!(out)) end @@ -635,9 +659,9 @@ end ## string first and last ## """ - first(str::AbstractString, nchar::Integer) + first(s::AbstractString, n::Integer) -Get a string consisting of the first `nchar` characters of `str`. +Get a string consisting of the first `n` characters of `s`. ```jldoctest julia> first("∀ϵ≠0: ϵ²>0", 0) @@ -650,17 +674,12 @@ julia> first("∀ϵ≠0: ϵ²>0", 3) "∀ϵ≠" ``` """ -function first(str::AbstractString, nchar::Integer) - if 0 <= nchar <= 1 - return str[1:nchar] - end - str[1:nextind(str, 1, nchar-1)] -end +first(s::AbstractString, n::Integer) = s[1:min(end, nextind(s, 0, n))] """ - last(str::AbstractString, nchar::Integer) + last(s::AbstractString, n::Integer) -Get a string consisting of the last `nchar` characters of `str`. +Get a string consisting of the last `n` characters of `s`. 
```jldoctest julia> last("∀ϵ≠0: ϵ²>0", 0) @@ -673,13 +692,54 @@ julia> last("∀ϵ≠0: ϵ²>0", 3) "²>0" ``` """ -function last(str::AbstractString, nchar::Integer) - e = endof(str) - if 0 <= nchar <= 1 - return str[(e-nchar+1):e] - end - str[prevind(str, e, nchar-1):e] -end +last(s::AbstractString, n::Integer) = s[max(1, prevind(s, ncodeunits(s)+1, n)):end] + +""" + reverseind(v, i) + +Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in `v` so that +`v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in cases where `v` contains +non-ASCII characters.) + +# Examples +```jldoctest +julia> r = reverse("Julia") +"ailuJ" + +julia> for i in 1:length(r) + print(r[reverseind("Julia", i)]) + end +Julia +``` +""" +reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1) + +""" + repeat(s::AbstractString, r::Integer) + +Repeat a string `r` times. This can equivalently be accomplished by calling [`s^r`](@ref ^). + +# Examples +```jldoctest +julia> repeat("ha", 3) +"hahaha" +``` +""" +repeat(s::AbstractString, r::Integer) = repeat(convert(String, s), r) + +""" + ^(s::Union{AbstractString,Char}, n::Integer) + +Repeat a string or character `n` times. +The [`repeat`](@ref) function is an alias to this operator. + +# Examples +```jldoctest +julia> "Test "^3 +"Test Test Test " +``` +""" +(^)(s::Union{AbstractString,Char}, r::Integer) = repeat(s, r) # reverse-order iteration for strings and indices thereof start(r::Iterators.Reverse{<:AbstractString}) = endof(r.itr) diff --git a/base/strings/io.jl b/base/strings/io.jl index a346c3d10f400..c305a1328370f 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -140,7 +140,7 @@ write(io::IO, s::AbstractString) = (len = 0; for c in s; len += write(io, c); en show(io::IO, s::AbstractString) = print_quoted(io, s) write(to::GenericIOBuffer, s::SubString{String}) = - s.endof==0 ? 0 : unsafe_write(to, pointer(s.string, s.offset + 1), UInt(nextind(s, s.endof) - 1)) + s.ncodeunits ≤ 0 ? 
0 : unsafe_write(to, pointer(s.string, s.offset+1), UInt(s.ncodeunits)) ## printing literal quoted string data ## @@ -253,6 +253,8 @@ need_full_hex(s::AbstractString, i::Int) = !done(s,i) && isxdigit(next(s,i)[1]) escape_nul(s::AbstractString, i::Int) = !done(s,i) && '0' <= next(s,i)[1] <= '7' ? "\\x00" : "\\0" +# TODO: handle escaping invalid UTF-8 + """ escape_string(str::AbstractString[, esc::AbstractString]) -> AbstractString @@ -272,15 +274,23 @@ function escape_string(io, s::AbstractString, esc::AbstractString="") i = start(s) while !done(s,i) c, j = next(s,i) - c == '\0' ? print(io, escape_nul(s,j)) : - c == '\e' ? print(io, "\\e") : - c == '\\' ? print(io, "\\\\") : - c in esc ? print(io, '\\', c) : - '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : - isprint(c) ? print(io, c) : - c <= '\x7f' ? print(io, "\\x", hex(c, 2)) : - c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) : - print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4)) + if !ismalformed(c) + c == '\0' ? print(io, escape_nul(s,j)) : + c == '\e' ? print(io, "\\e") : + c == '\\' ? print(io, "\\\\") : + c in esc ? print(io, '\\', c) : + '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : + isprint(c) ? print(io, c) : + c <= '\x7f' ? print(io, "\\x", hex(c, 2)) : + c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) : + print(io, "\\U", hex(c, need_full_hex(s,j) ? 
8 : 4)) + else # malformed + u = bswap(reinterpret(UInt32, c)) + while true + print(io, "\\x", hex(u % UInt8, 2)) + (u >>= 8) == 0 && break + end + end i = j end end @@ -291,27 +301,10 @@ function print_quoted(io, s::AbstractString) print(io, '"') end -# bare minimum unescaping function unescapes only given characters - -function print_unescaped_chars(io, s::AbstractString, esc::AbstractString) - if !('\\' in esc) - esc = string("\\", esc) - end - i = start(s) - while !done(s,i) - c, i = next(s,i) - if c == '\\' && !done(s,i) && s[i] in esc - c, i = next(s,i) - end - print(io, c) - end -end - -unescape_chars(s::AbstractString, esc::AbstractString) = - sprint(endof(s), print_unescaped_chars, s, esc) - # general unescaping of traditional C and Unicode escape sequences +# TODO: handle unescaping invalid UTF-8 sequences + """ unescape_string(str::AbstractString) -> AbstractString @@ -335,16 +328,16 @@ function unescape_string(io, s::AbstractString) n = k = 0 m = c == 'x' ? 2 : c == 'u' ? 4 : 8 - while (k+=1) <= m && !done(s,i) + while (k += 1) <= m && !done(s,i) c, j = next(s,i) - n = '0' <= c <= '9' ? n<<4 + c-'0' : - 'a' <= c <= 'f' ? n<<4 + c-'a'+10 : - 'A' <= c <= 'F' ? n<<4 + c-'A'+10 : break + n = '0' <= c <= '9' ? n<<4 + (c-'0') : + 'a' <= c <= 'f' ? n<<4 + (c-'a'+10) : + 'A' <= c <= 'F' ? n<<4 + (c-'A'+10) : break i = j end if k == 1 throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" : - "unicode (\\u)") escape sequence used in $(repr(s))")) + "unicode (\\u)") escape sequence used in $(repr(s))")) end if m == 2 # \x escape sequence write(io, UInt8(n)) @@ -354,7 +347,7 @@ function unescape_string(io, s::AbstractString) elseif '0' <= c <= '7' k = 1 n = c-'0' - while (k+=1) <= 3 && !done(s,i) + while (k += 1) <= 3 && !done(s,i) c, j = next(s,i) n = ('0' <= c <= '7') ? 
n<<3 + c-'0' : break i = j @@ -504,18 +497,7 @@ end function convert(::Type{String}, chars::AbstractVector{Char}) sprint(length(chars), io->begin - state = start(chars) - while !done(chars, state) - c, state = next(chars, state) - if '\ud7ff' < c && c + 1024 < '\ue000' - d, state = next(chars, state) - if '\ud7ff' < d - 1024 && d < '\ue000' - c = Char(0x10000 + ((UInt32(c) & 0x03ff) << 10) | (UInt32(d) & 0x03ff)) - else - write(io, c) - c = d - end - end + for c in chars write(io, c) end end) diff --git a/base/strings/string.jl b/base/strings/string.jl index e66f876a5f77d..4683b0b0f4393 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -2,6 +2,8 @@ const ByteArray = Union{Vector{UInt8},Vector{Int8}} +@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo ≤ b) & (b ≤ hi) + ## constructors and conversions ## # String constructor docstring from boot.jl, workaround for #16730 @@ -49,7 +51,6 @@ Convert a string to a contiguous byte array representation encoded as UTF-8 byte This representation is often appropriate for passing strings to C. """ String(s::AbstractString) = print_to_string(s) - String(s::Symbol) = unsafe_string(Cstring(s)) (::Type{Vector{UInt8}})(s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s) @@ -59,48 +60,14 @@ String(s::Symbol) = unsafe_string(Cstring(s)) pointer(s::String) = unsafe_convert(Ptr{UInt8}, s) pointer(s::String, i::Integer) = pointer(s)+(i-1) -sizeof(s::String) = Core.sizeof(s) - -""" - codeunit(s::AbstractString, i::Integer) - -Get the `i`th code unit of an encoded string. For example, -returns the `i`th byte of the representation of a UTF-8 string. 
- -# Examples -```jldoctest -julia> s = "δ=γ"; [codeunit(s, i) for i in 1:sizeof(s)] -5-element Array{UInt8,1}: - 0xce - 0xb4 - 0x3d - 0xce - 0xb3 -``` -""" -codeunit(s::AbstractString, i::Integer) +ncodeunits(s::String) = Core.sizeof(s) +codeunit(s::String) = UInt8 @inline function codeunit(s::String, i::Integer) - @boundscheck if (i < 1) | (i > sizeof(s)) - throw(BoundsError(s,i)) - end + @boundscheck between(i, 1, ncodeunits(s)) || throw(BoundsError(s, i)) @gc_preserve s unsafe_load(pointer(s, i)) end -""" - ncodeunits(s::AbstractString) - -The number of code units in a string. For example, for UTF-8-like data such as -the default `String` type, the number of code units is the number of bytes in -the string, a.k.a. `sizeof(s)`. For a UTF-16 encoded string type, however, the -code unit is `UInt16` so the number of code units is the number of `UInt16` -words in the representation of the string. The expression `codeunit(s, i)` is -valid and safe for precisely the range of `i` values `1:ncodeunits(s)`. - -See also: [`codeunit`](@ref). 
-""" -ncodeunits(s::String) = sizeof(s) - write(io::IO, s::String) = @gc_preserve s unsafe_write(io, pointer(s), reinterpret(UInt, sizeof(s))) @@ -118,81 +85,45 @@ function ==(a::String, b::String) al == sizeof(b) && 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, al) end -## thisind, prevind and nextind ## +## thisind, nextind, prevind ## -function thisind(s::String, i::Integer) - j = Int(i) - j < 1 && return 0 - n = ncodeunits(s) - j > n && return n + 1 - @inbounds while j > 0 && is_valid_continuation(codeunit(s,j)) - j -= 1 - end - j -end +thisind(s::String, i::Integer) = oftype(i, thisind(s, Int(i))) +nextind(s::String, i::Integer) = oftype(i, nextind(s, Int(i))) -function prevind(s::String, i::Integer) - j = Int(i) - e = sizeof(s) - if j > e - return endof(s) - end - j -= 1 - @inbounds while j > 0 && is_valid_continuation(codeunit(s,j)) - j -= 1 - end - j -end - -function prevind(s::String, i::Integer, nchar::Integer) - nchar > 0 || throw(ArgumentError("nchar must be greater than 0")) - j = Int(i) - e = sizeof(s) - while nchar > 0 - if j > e - j = endof(s) - else - j -= 1 - @inbounds while j > 0 && is_valid_continuation(codeunit(s,j)) - j -= 1 - end - end - nchar -= 1 - j <= 0 && return j - nchar - end - j -end - -function nextind(s::String, i::Integer) - j = Int(i) - if j < 1 - return 1 - end - e = sizeof(s) - j += 1 - @inbounds while j <= e && is_valid_continuation(codeunit(s,j)) - j += 1 - end - j +function thisind(s::String, i::Int) + n = ncodeunits(s) + between(i, 2, n) || return i + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || return i + @inbounds b = codeunit(s, i-1) + between(b, 0b11000000, 0b11110111) && return i-1 + (b & 0xc0 == 0x80) & (i-2 > 0) || return i + @inbounds b = codeunit(s, i-2) + between(b, 0b11100000, 0b11110111) && return i-2 + (b & 0xc0 == 0x80) & (i-3 > 0) || return i + @inbounds b = codeunit(s, i-3) + between(b, 0b11110000, 0b11110111) && return i-3 + return i end -function nextind(s::String, i::Integer, 
nchar::Integer) - nchar > 0 || throw(ArgumentError("nchar must be greater than 0")) - j = Int(i) - e = sizeof(s) - while nchar > 0 - if j < 1 - j = 1 - else - j += 1 - @inbounds while j <= e && is_valid_continuation(codeunit(s,j)) - j += 1 - end - end - nchar -= 1 - j > e && return j + nchar - end - j +function nextind(s::String, i::Int) + n = ncodeunits(s) + between(i, 1, n-1) || return i+1 + @inbounds l = codeunit(s, i) + (l < 0x80) | (0xf8 ≤ l) && return i+1 + if l < 0xc0 + i′ = thisind(s, i) + return i′ < i ? nextind(s, i′) : i+1 + end + # first continuation byte + @inbounds b = codeunit(s, i += 1) + (b & 0xc0 != 0x80) | ((i += 1) > n) | (l < 0xe0) && return i + # second continuation byte + @inbounds b = codeunit(s, i) + (b & 0xc0 != 0x80) | ((i += 1) > n) | (l < 0xf0) && return i + # third continuation byte + @inbounds b = codeunit(s, i) + ifelse(b & 0xc0 != 0x80, i, i+1) end ## checking UTF-8 & ACSII validity ## @@ -208,121 +139,146 @@ byte_string_classify(s::String) = isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) != 0 isvalid(s::String) = isvalid(String, s) -## basic UTF-8 decoding & iteration ## - -is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800) -is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00) -is_surrogate_codeunit(c::Unsigned) = ((c & ~0x007ff) == 0xd800) -is_valid_continuation(c) = ((c & 0xc0) == 0x80) - -const utf8_offset = [ - 0x00000000, 0x00003080, - 0x000e2080, 0x03c82080, - 0xfa082080, 0x82082080, -] - -const utf8_trailing = [ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5, -] +is_valid_continuation(c) = c & 0xc0 == 0x80 ## required core functionality ## -function endof(s::String) - i = sizeof(s) - @inbounds while i > 0 && is_valid_continuation(codeunit(s, i)) - i -= 1 - end - i +function next(s::String, i::Int) + @boundscheck 1 ≤ i ≤ sizeof(s) || throw(BoundsError(s, i)) + @inbounds b = codeunit(s, i) + # TODO: check index validity + u = UInt32(b) << 24 + (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u), i+1 + return next_continued(s, i, u) end -function length(s::String) - cnum = 0 - @inbounds for i = 1:sizeof(s) - cnum += !is_valid_continuation(codeunit(s, i)) +@noinline function next_continued(s::String, i::Int, u::UInt32) + if u < 0xc0000000 + isvalid(s, i) && (i += 1; @goto ret) + throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8)) end - cnum + n = ncodeunits(s) + # first continuation byte + (i += 1) > n && @goto ret + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b) << 16 + # second continuation byte + ((i += 1) > n) | (u < 0xe0000000) && @goto ret + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b) << 8 + # third continuation byte + ((i += 1) > n) | (u < 0xf0000000) && @goto ret + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b); i += 1 +@label ret + return reinterpret(Char, u), i end -@noinline function slow_utf8_next(s::String, b::UInt8, i::Int, l::Int) - @inbounds if is_valid_continuation(b) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i))) - end - trailing = utf8_trailing[b + 1] - if l < i + trailing - return '\ufffd', i+1 - end - c::UInt32 = 0 - @inbounds for j = 1:(trailing + 1) - c <<= 6 - c += codeunit(s, i) - i += 1 - end - c -= utf8_offset[trailing + 1] - return Char(c), i +function getindex(s::String, i::Int) + @boundscheck 1 ≤ i ≤ ncodeunits(s) || throw(BoundsError(s, i)) + @inbounds b = codeunit(s, 
i) + # TODO: check index validity + u = UInt32(b) << 24 + (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u) + return getindex_continued(s, i, u) end -# This implementation relies on `next` returning a value past the end of the -# String's underlying data, which is true for valid Strings -done(s::String, state) = state > sizeof(s) - -@inline function next(s::String, i::Int) - # function is split into this critical fast-path - # for pure ascii data, such as parsing numbers, - # and a longer function that can handle any utf8 data - @boundscheck if (i < 1) | (i > sizeof(s)) - throw(BoundsError(s,i)) +@noinline function getindex_continued(s::String, i::Int, u::UInt32) + if u < 0xc0000000 + isvalid(s, i) && @goto ret + throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8)) end + n = ncodeunits(s) + # first continuation byte + (i += 1) > n && @goto ret @inbounds b = codeunit(s, i) - if b < 0x80 - return Char(b), i + 1 - end - return slow_utf8_next(s, b, i, sizeof(s)) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b) << 16 + # second continuation byte + ((i += 1) > n) | (u < 0xe0000000) && @goto ret + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b) << 8 + # third continuation byte + ((i += 1) > n) | (u < 0xf0000000) && @goto ret + @inbounds b = codeunit(s, i) + b & 0xc0 == 0x80 || @goto ret + u |= UInt32(b) +@label ret + return reinterpret(Char, u) end -function first_utf8_byte(ch::Char) - c = UInt32(ch) - b = c < 0x80 ? c%UInt8 : - c < 0x800 ? ((c>>6) | 0xc0)%UInt8 : - c < 0x10000 ? 
((c>>12) | 0xe0)%UInt8 : - ((c>>18) | 0xf0)%UInt8 - return b -end - -## overload methods for efficiency ## - -isvalid(s::String, i::Integer) = - (1 <= i <= sizeof(s)) && ((@inbounds b = codeunit(s, i)); !is_valid_continuation(b)) +getindex(s::String, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))] function getindex(s::String, r::UnitRange{Int}) isempty(r) && return "" - l = sizeof(s) - i = first(r) - if i < 1 || i > l - throw(BoundsError(s, i)) - end - @inbounds si = codeunit(s, i) - if is_valid_continuation(si) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, si)) - end - j = last(r) - if j > l - throw(BoundsError(s, j)) - end - @inbounds sj = codeunit(s, j) - if is_valid_continuation(sj) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, sj)) + i, j = first(r), last(r) + @boundscheck begin + checkbounds(s, r) + @inbounds isvalid(s, i) || + throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i))) + @inbounds isvalid(s, j) || + throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j))) + end + j = nextind(s, j) - 1 + n = j - i + 1 + ss = _string_n(n) + p = pointer(ss) + for k = 1:n + unsafe_store!(p, codeunit(s, i + k - 1), k) + end + return ss +end + +function length(s::String, lo::Int, hi::Int) + z = ncodeunits(s) + i = Int(max(1, min(z, lo))) + n = Int(min(z, max(1, hi))) + c = i - n + if i ≤ n + i, j = thisind(s, i), i + c -= i < j + i -= 1 + while true + (i += 1) ≤ n || break + @inbounds b = codeunit(s, i) # lead byte + @label L + c += 1 + (0xc0 ≤ b) & (b < 0xf8) || continue + l = b + + (i += 1) ≤ n || break + @inbounds b = codeunit(s, i) # cont byte 1 + b & 0xc0 == 0x80 || @goto L + l ≥ 0xe0 || continue + + (i += 1) ≤ n || break + @inbounds b = codeunit(s, i) # cont byte 2 + b & 0xc0 == 0x80 || @goto L + l ≥ 0xf0 || continue + + (i += 1) ≤ n || break + @inbounds b = codeunit(s, i) # cont byte 3 + b & 0xc0 == 0x80 || @goto L + end end - j = nextind(s,j) - unsafe_string(pointer(s,i), j-i) + return c + hi - lo end +# TODO: delete or move to char.jl 
+first_utf8_byte(c::Char) = (reinterpret(UInt32, c) >> 24) % UInt8 + +## overload methods for efficiency ## + +function isvalid(s::String, i::Int) + @boundscheck checkbounds(s, i) + return thisind(s, i) == i +end +isvalid(s::String, i::Integer) = isvalid(s, Int(i)) + function search(s::String, c::Char, i::Integer = 1) if i < 1 || i > sizeof(s) i == sizeof(s) + 1 && return 0 @@ -331,11 +287,11 @@ function search(s::String, c::Char, i::Integer = 1) @inbounds if is_valid_continuation(codeunit(s,i)) throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s,i))) end - c < Char(0x80) && return search(s, c%UInt8, i) + c ≤ '\x7f' && return search(s, c % UInt8, i) while true i = search(s, first_utf8_byte(c), i) - (i==0 || s[i] == c) && return i - i = next(s,i)[2] + (i == 0 || s[i] == c) && return i + i = next(s, i)[2] end end @@ -361,12 +317,12 @@ function search(a::ByteArray, b::Char, i::Integer = 1) end function rsearch(s::String, c::Char, i::Integer = sizeof(s)) - c < Char(0x80) && return rsearch(s, c%UInt8, i) + c ≤ '\x7f' && return rsearch(s, c % UInt8, i) b = first_utf8_byte(c) while true i = rsearch(s, b, i) - (i==0 || s[i] == c) && return i - i = prevind(s,i) + (i == 0 || s[i] == c) && return i + i = prevind(s, i) end end @@ -411,62 +367,15 @@ function string(a::String...) end # UTF-8 encoding length of a character -function codelen(d::Char) - c = UInt32(d) - if c < 0x80 - return 1 - elseif c < 0x800 - return 2 - elseif c < 0x10000 - return 3 - elseif c < 0x110000 - return 4 - end - return 3 # '\ufffd' -end +# TODO: delete or move to char.jl +codelen(c::Char) = 4 - (trailing_zeros(0xff000000 | reinterpret(UInt32, c)) >> 3) function string(a::Union{String,Char}...) 
- n = 0 - for d in a - if isa(d,Char) - n += codelen(d::Char) - else - n += sizeof(d::String) + sprint() do io + for x in a + write(io, x) end end - out = _string_n(n) - offs = 1 - p = pointer(out) - for d in a - if isa(d,Char) - c = UInt32(d::Char) - if c < 0x80 - unsafe_store!(p, c%UInt8, offs); offs += 1 - elseif c < 0x800 - unsafe_store!(p, (( c >> 6 ) | 0xC0)%UInt8, offs); offs += 1 - unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - elseif c < 0x10000 - unsafe_store!(p, (( c >> 12 ) | 0xE0)%UInt8, offs); offs += 1 - unsafe_store!(p, (((c >> 6) & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - elseif c < 0x110000 - unsafe_store!(p, (( c >> 18 ) | 0xF0)%UInt8, offs); offs += 1 - unsafe_store!(p, (((c >> 12) & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - unsafe_store!(p, (((c >> 6) & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1 - else - # '\ufffd' - unsafe_store!(p, 0xef, offs); offs += 1 - unsafe_store!(p, 0xbf, offs); offs += 1 - unsafe_store!(p, 0xbd, offs); offs += 1 - end - else - l = sizeof(d::String) - unsafe_copy!(pointer(out,offs), pointer(d::String), l) - offs += l - end - end - return out end function repeat(s::String, r::Integer) diff --git a/base/strings/substring.jl b/base/strings/substring.jl index d1bf33e4123fb..2c75ed1c49444 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -22,13 +22,18 @@ julia> SubString("abc", 2) struct SubString{T<:AbstractString} <: AbstractString string::T offset::Int - endof::Int + ncodeunits::Int function SubString{T}(s::T, i::Int, j::Int) where T<:AbstractString - i > j && return new(s, i - 1, 0) # always allow i > j as it is consistent with getindex - isvalid(s, i) || throw(BoundsError(s, i)) - isvalid(s, j) || throw(BoundsError(s, j)) - new(s, i-1, j-i+1) + i ≤ j || return new(s, i-1, 0) + @boundscheck begin + checkbounds(s, i:j) + @inbounds isvalid(s, i) || + 
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i))) + @inbounds isvalid(s, j) || + throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j))) + end + return new(s, i-1, nextind(s,j)-i) end end @@ -37,11 +42,8 @@ SubString(s::AbstractString, i::Integer, j::Integer=endof(s)) = SubString(s, Int SubString(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, first(r), last(r)) function SubString(s::SubString, i::Int, j::Int) - # always allow i > j as it is consistent with getindex - i > j && return SubString(s.string, s.offset + i, s.offset + j) - i >= 1 || throw(BoundsError(s, i)) - j <= endof(s) || throw(BoundsError(s, j)) - SubString(s.string, s.offset + i, s.offset + j) + @boundscheck i ≤ j && checkbounds(s, i:j) + SubString(s.string, s.offset+i, s.offset+j) end SubString(s::AbstractString) = SubString(s, 1, endof(s)) @@ -50,78 +52,56 @@ SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, endof(s)) convert(::Type{SubString{S}}, s::AbstractString) where {S<:AbstractString} = SubString(convert(S, s)) -String(p::SubString{String}) = - unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1) +String(s::SubString{String}) = unsafe_string(pointer(s.string, s.offset+1), s.ncodeunits) -sizeof(s::SubString{String}) = s.endof == 0 ? 0 : nextind(s, s.endof) - 1 +ncodeunits(s::SubString) = s.ncodeunits +codeunit(s::SubString) = codeunit(s.string) +length(s::SubString) = length(s.string, s.offset+1, s.offset+s.ncodeunits) -# TODO: length(s::SubString) = ?? -# default implementation will work but it's slow -# can this be delegated efficiently somehow? -# that may require additional string interfaces -function length(s::SubString{String}) - return s.endof==0 ? 
0 : Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t), - pointer(s), nextind(s, s.endof) - 1)) +function codeunit(s::SubString, i::Integer) + @boundscheck checkbounds(s, i) + @inbounds return codeunit(s.string, s.offset + i) end -function next(s::SubString, i::Int) - if i < 1 || i > s.endof - throw(BoundsError(s, i)) - end - c, i = next(s.string, i+s.offset) - c, i-s.offset +function next(s::SubString, i::Integer) + @boundscheck checkbounds(s, i) + @inbounds c, i = next(s.string, s.offset + i) + return c, i - s.offset end -function getindex(s::SubString, i::Int) - if i < 1 || i > s.endof - throw(BoundsError(s, i)) - end - getindex(s.string, i+s.offset) +function getindex(s::SubString, i::Integer) + @boundscheck checkbounds(s, i) + @inbounds return getindex(s.string, s.offset + i) end -endof(s::SubString) = s.endof - function isvalid(s::SubString, i::Integer) - return (start(s) <= i <= endof(s)) && isvalid(s.string, s.offset+i) + @boundscheck checkbounds(s, i) + @inbounds return isvalid(s.string, s.offset + i) end -function thisind(s::SubString{String}, i::Integer) - j = Int(i) - j < start(s) && return 0 - n = ncodeunits(s) - j > n && return n + 1 - offset = s.offset - str = s.string - j += offset - @inbounds while j > offset && is_valid_continuation(codeunit(str, j)) - j -= 1 - end - j - offset -end - -nextind(s::SubString, i::Integer) = nextind(s.string, i+s.offset)-s.offset -prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset - -function getindex(s::AbstractString, r::UnitRange{Int}) - checkbounds(s, r) || throw(BoundsError(s, r)) - SubString(s, first(r), last(r)) -end +thisind(s::SubString, i::Integer) = thisind(s.string, s.offset + i) - s.offset +nextind(s::SubString, i::Integer) = nextind(s.string, s.offset + i) - s.offset +prevind(s::SubString, i::Integer) = prevind(s.string, s.offset + i) - s.offset function cmp(a::SubString{String}, b::SubString{String}) na = sizeof(a) nb = sizeof(b) c = ccall(:memcmp, Int32, (Ptr{UInt8}, 
Ptr{UInt8}, UInt), - pointer(a), pointer(b), min(na,nb)) - c < 0 ? -1 : c > 0 ? +1 : cmp(na,nb) + pointer(a), pointer(b), min(na, nb)) + return c < 0 ? -1 : c > 0 ? +1 : cmp(na, nb) end # don't make unnecessary copies when passing substrings to C functions cconvert(::Type{Ptr{UInt8}}, s::SubString{String}) = s cconvert(::Type{Ptr{Int8}}, s::SubString{String}) = s + function unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where R<:Union{Int8, UInt8} convert(Ptr{R}, pointer(s.string)) + s.offset end +pointer(x::SubString{String}) = pointer(x.string) + x.offset +pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1) + """ reverse(s::AbstractString) -> AbstractString @@ -156,53 +136,3 @@ function reverse(s::Union{String,SubString{String}})::String end end end - -""" - reverseind(v, i) - -Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in `v` so that -`v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in cases where `v` contains -non-ASCII characters.) - -# Examples -```jldoctest -julia> r = reverse("Julia") -"ailuJ" - -julia> for i in 1:length(r) - print(r[reverseind("Julia", i)]) - end -Julia -``` -""" -reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1) - -""" - repeat(s::AbstractString, r::Integer) - -Repeat a string `r` times. This can equivalently be accomplished by calling [`s^r`](@ref ^). - -# Examples -```jldoctest -julia> repeat("ha", 3) -"hahaha" -``` -""" -repeat(s::AbstractString, r::Integer) = repeat(convert(String, s), r) - -""" - ^(s::Union{AbstractString,Char}, n::Integer) - -Repeat a string or character `n` times. -The [`repeat`](@ref) function is an alias to this operator. 
- -# Examples -```jldoctest -julia> "Test "^3 -"Test Test Test " -``` -""" -(^)(s::Union{AbstractString,Char}, r::Integer) = repeat(s,r) - -pointer(x::SubString{String}) = pointer(x.string) + x.offset -pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1) diff --git a/base/strings/utf8proc.jl b/base/strings/utf8proc.jl index cf30ec5b3aa6f..0c646b63c558d 100644 --- a/base/strings/utf8proc.jl +++ b/base/strings/utf8proc.jl @@ -3,7 +3,10 @@ # Various Unicode functionality from the utf8proc library module UTF8proc -import Base: show, ==, hash, string, Symbol, isless, length, eltype, start, next, done, convert, isvalid, lowercase, uppercase, titlecase +import Base: + show, ==, hash, string, Symbol, isless, length, eltype, start, next, + done, convert, isvalid, lowercase, uppercase, titlecase, + MalformedCharError, ismalformed export isgraphemebreak, category_code, category_abbrev, category_string @@ -118,7 +121,9 @@ const category_strings = [ "Other, control", "Other, format", "Other, surrogate", - "Other, private use" + "Other, private use", + "Invalid, too high", + "Malformed, bad data", ] const UTF8PROC_STABLE = (1<<1) @@ -155,10 +160,26 @@ end utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags) -function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false) +function normalize_string( + s::AbstractString; + stable::Bool=false, + compat::Bool=false, + compose::Bool=true, + decompose::Bool=false, + stripignore::Bool=false, + rejectna::Bool=false, + newline2ls::Bool=false, + newline2ps::Bool=false, + newline2lf::Bool=false, + stripcc::Bool=false, + casefold::Bool=false, + lump::Bool=false, + stripmark::Bool=false, +) flags = 0 stable && (flags = flags 
| UTF8PROC_STABLE) compat && (flags = flags | UTF8PROC_COMPAT) + # TODO: error if compose & decompose? if decompose flags = flags | UTF8PROC_DECOMPOSE elseif compose @@ -253,7 +274,10 @@ julia> textwidth('❤') 2 ``` """ -textwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) +function textwidth(c::Char) + ismalformed(c) && (c = '\ufffd') + Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) +end """ textwidth(s::AbstractString) @@ -268,17 +292,29 @@ julia> textwidth("March") """ textwidth(s::AbstractString) = mapreduce(textwidth, +, 0, s) -lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) : Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c)) -uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c)) -titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_totitle, UInt32, (UInt32,), c)) +lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) : + Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c)) +uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : + Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c)) +titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? 
c - 0x20 : c) : + Char(ccall(:utf8proc_totitle, UInt32, (UInt32,), c)) ############################################################################ # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category -category_code(c) = ccall(:utf8proc_category, Cint, (UInt32,), c) +function category_code(c::Char) + ismalformed(c) && return Cint(31) + (u = UInt32(c)) ≤ 0x10ffff || return Cint(30) + ccall(:utf8proc_category, Cint, (UInt32,), u) +end # more human-readable representations of the category code -category_abbrev(c) = unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c)) +function category_abbrev(c) + ismalformed(c) && return "Ma" + (u = UInt32(c)) ≤ 0x10ffff || return "In" + unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), u)) +end + category_string(c) = category_strings[category_code(c)+1] """ @@ -318,7 +354,7 @@ julia> islower('❤') false ``` """ -islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL) +islower(c::Char) = category_code(c) == UTF8PROC_CATEGORY_LL # true for Unicode upper and mixed case @@ -342,8 +378,8 @@ false ``` """ function isupper(c::Char) - ccode = category_code(c) - return ccode == UTF8PROC_CATEGORY_LU || ccode == UTF8PROC_CATEGORY_LT + cat = category_code(c) + cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT end """ @@ -363,7 +399,7 @@ julia> isdigit('α') false ``` """ -isdigit(c::Char) = ('0' <= c <= '9') +isdigit(c::Char) = '0' <= c <= '9' """ isalpha(c::Char) -> Bool @@ -384,7 +420,7 @@ julia> isalpha('9') false ``` """ -isalpha(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO) +isalpha(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO """ isnumber(c::Char) -> Bool @@ -405,7 +441,7 @@ julia> isnumber('❤') false ``` """ -isnumber(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO) +isnumber(c::Char) = UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO """ isalnum(c::Char) -> Bool @@ -427,9 
+463,9 @@ true ``` """ function isalnum(c::Char) - ccode = category_code(c) - return (UTF8PROC_CATEGORY_LU <= ccode <= UTF8PROC_CATEGORY_LO) || - (UTF8PROC_CATEGORY_ND <= ccode <= UTF8PROC_CATEGORY_NO) + cat = category_code(c) + UTF8PROC_CATEGORY_LU <= cat <= UTF8PROC_CATEGORY_LO || + UTF8PROC_CATEGORY_ND <= cat <= UTF8PROC_CATEGORY_NO end # following C++ only control characters from the Latin-1 subset return true @@ -449,7 +485,7 @@ julia> iscntrl('a') false ``` """ -iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f)) +iscntrl(c::Char) = c <= '\x1f' || '\x7f' <= c <= '\u9f' """ ispunct(c::Char) -> Bool @@ -469,7 +505,7 @@ julia> ispunct(';') true ``` """ -ispunct(c::Char) = (UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO) +ispunct(c::Char) = UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO # \u85 is the Unicode Next Line (NEL) character @@ -495,7 +531,9 @@ julia> isspace('\\x20') true ``` """ -@inline isspace(c::Char) = c == ' ' || '\t' <= c <='\r' || c == '\u85' || '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS +@inline isspace(c::Char) = + c == ' ' || '\t' <= c <= '\r' || c == '\u85' || + '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS """ isprint(c::Char) -> Bool @@ -511,7 +549,7 @@ julia> isprint('A') true ``` """ -isprint(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS) +isprint(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS # true in principal if a printer would use ink @@ -531,19 +569,26 @@ julia> isgraph('A') true ``` """ -isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO) +isgraph(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO ############################################################################ # iterators for grapheme segmentation isgraphemebreak(c1::Char, c2::Char) = + ismalformed(c1) || ismalformed(c2) || ccall(:utf8proc_grapheme_break, Bool, (UInt32, 
UInt32), c1, c2) # Stateful grapheme break required by Unicode-9 rules: the string # must be processed in sequence, with state initialized to Ref{Int32}(0). # Requires utf8proc v2.0 or later. -isgraphemebreak!(state::Ref{Int32}, c1::Char, c2::Char) = - ccall(:utf8proc_grapheme_break_stateful, Bool, (UInt32, UInt32, Ref{Int32}), c1, c2, state) +function isgraphemebreak!(state::Ref{Int32}, c1::Char, c2::Char) + if ismalformed(c1) || ismalformed(c2) + state[] = 0 + return true + end + ccall(:utf8proc_grapheme_break_stateful, Bool, + (UInt32, UInt32, Ref{Int32}), c1, c2, state) +end struct GraphemeIterator{S<:AbstractString} s::S # original string (for generation of SubStrings) @@ -563,7 +608,7 @@ eltype(::Type{GraphemeIterator{S}}) where {S} = SubString{S} eltype(::Type{GraphemeIterator{SubString{S}}}) where {S} = SubString{S} function length(g::GraphemeIterator) - c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this) + c0 = typemax(Char) n = 0 state = Ref{Int32}(0) for c in g.s diff --git a/base/strings/util.jl b/base/strings/util.jl index db230a16da0c6..8bf3c8e2aadc9 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -58,10 +58,12 @@ function endswith(a::AbstractString, b::AbstractString) end endswith(str::AbstractString, chars::Chars) = !isempty(str) && last(str) in chars -startswith(a::String, b::String) = - (sizeof(a) >= sizeof(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, sizeof(b)) == 0) -startswith(a::Vector{UInt8}, b::Vector{UInt8}) = - (length(a) >= length(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0) +# FIXME: check that end of `b` doesn't match a partial character in `a` +startswith(a::String, b::String) = sizeof(a) ≥ sizeof(b) && + ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, sizeof(b)) == 0 + +startswith(a::Vector{UInt8}, b::Vector{UInt8}) = length(a) ≥ length(b) && + ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0 # TODO: 
fast endswith @@ -88,15 +90,9 @@ julia> chop(a, 5, 5) "" ``` """ -function chop(s::AbstractString, head::Integer, tail::Integer) - # negative values of head/tail will throw error in nextind/prevind - headidx = head == 0 ? start(s) : nextind(s, start(s), head) - tailidx = tail == 0 ? endof(s) : prevind(s, endof(s), tail) - SubString(s, headidx, tailidx) -end - -# no head/tail version left for performance reasons chop(s::AbstractString) = SubString(s, start(s), prevind(s, endof(s))) +chop(s::AbstractString, head::Integer, tail::Integer) = + SubString(s, nextind(s, start(s), head), prevind(s, endof(s), tail)) """ chomp(s::AbstractString) @@ -127,17 +123,6 @@ function chomp(s::String) end end -# NOTE: use with caution -- breaks the immutable string convention! -# TODO: this is hard to provide with the new representation -#function chomp!(s::String) -# if !isempty(s) && codeunit(s,sizeof(s)) == 0x0a -# n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 1 : 2 -# ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n) -# end -# return s -#end -chomp!(s::AbstractString) = chomp(s) # copying fallback for other string types - const _default_delims = [' ','\t','\n','\v','\f','\r'] """ @@ -449,6 +434,7 @@ replace(s::AbstractString, pat, f) = replace_new(String(s), pat, f, typemax(Int) # replace(s::AbstractString, pat, f, count::Integer=typemax(Int)) = # replace(String(s), pat, f, count) +# TODO: allow transform as the first argument to replace? 
# hex <-> bytes conversion @@ -550,7 +536,8 @@ end # check for pure ASCII-ness function ascii(s::String) - for (i, b) in enumerate(Vector{UInt8}(s)) + for i = 1:sizeof(s) + b = codeunit(s,i) b < 0x80 || throw(ArgumentError("invalid ASCII at index $i in $(repr(s))")) end return s diff --git a/src/ast.c b/src/ast.c index d54e5581fab89..fba225b231feb 100644 --- a/src/ast.c +++ b/src/ast.c @@ -557,7 +557,17 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m return (jl_value_t*)ex; } if (iscprim(e) && cp_class((cprim_t*)ptr(e)) == fl_ctx->wchartype) { - return jl_box32(jl_char_type, *(int32_t*)cp_data((cprim_t*)ptr(e))); + uint32_t c, u = *(uint32_t*)cp_data((cprim_t*)ptr(e)); + if (u < 0x80) { + c = u << 24; + } else { + c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) | + ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000); + c = u < 0x00000800 ? (c << 16) | 0xc0800000 : + u < 0x00010000 ? (c << 8) | 0xe0808000 : + (c << 0) | 0xf0808080 ; + } + return jl_box_char(c); } if (iscvalue(e) && cv_class((cvalue_t*)ptr(e)) == jl_ast_ctx(fl_ctx)->jvtype) { return *(jl_value_t**)cv_data((cvalue_t*)ptr(e)); diff --git a/src/datatype.c b/src/datatype.c index 41f5cdb62ac70..edf94df39591c 100644 --- a/src/datatype.c +++ b/src/datatype.c @@ -640,7 +640,6 @@ SIBOX_FUNC(int16, int16_t, 1) SIBOX_FUNC(int32, int32_t, 1) UIBOX_FUNC(uint16, uint16_t, 1) UIBOX_FUNC(uint32, uint32_t, 1) -UIBOX_FUNC(char, uint32_t, 1) UIBOX_FUNC(ssavalue, size_t, 1) UIBOX_FUNC(slotnumber, size_t, 1) #ifdef _P64 @@ -651,6 +650,17 @@ SIBOX_FUNC(int64, int64_t, 2) UIBOX_FUNC(uint64, uint64_t, 2) #endif +static jl_value_t *boxed_char_cache[128]; +JL_DLLEXPORT jl_value_t *jl_box_char(uint32_t x) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + if (0 < (int32_t)x) + return boxed_char_cache[x >> 24]; + jl_value_t *v = jl_gc_alloc(ptls, sizeof(void*), jl_char_type); + *(uint32_t*)jl_data_ptr(v) = x; + return v; +} + static jl_value_t *boxed_int8_cache[256]; JL_DLLEXPORT jl_value_t 
*jl_box_int8(int8_t x) { @@ -684,14 +694,16 @@ void jl_init_int32_int64_cache(void) void jl_init_box_caches(void) { int64_t i; + for(i=0; i < 128; i++) { + boxed_char_cache[i] = jl_permbox32(jl_char_type, i << 24); + } for(i=0; i < 256; i++) { - boxed_int8_cache[i] = jl_permbox8(jl_int8_type, i); + boxed_int8_cache[i] = jl_permbox8(jl_int8_type, i); } for(i=0; i < NBOX_C; i++) { boxed_int16_cache[i] = jl_permbox16(jl_int16_type, i-NBOX_C/2); boxed_uint16_cache[i] = jl_permbox16(jl_uint16_type, i); boxed_uint32_cache[i] = jl_permbox32(jl_uint32_type, i); - boxed_char_cache[i] = jl_permbox32(jl_char_type, i); boxed_uint64_cache[i] = jl_permbox64(jl_uint64_type, i); } } diff --git a/src/jl_uv.c b/src/jl_uv.c index 77719693eb943..4753655bbdd9d 100644 --- a/src/jl_uv.c +++ b/src/jl_uv.c @@ -490,10 +490,21 @@ JL_DLLEXPORT void jl_uv_putb(uv_stream_t *stream, uint8_t b) jl_uv_puts(stream, (char*)&b, 1); } -JL_DLLEXPORT void jl_uv_putc(uv_stream_t *stream, uint32_t wchar) +JL_DLLEXPORT void jl_uv_putc(uv_stream_t *stream, uint32_t c) { char s[4]; - jl_uv_puts(stream, s, u8_wc_toutf8(s, wchar)); + int n = 1; + s[0] = c >> 24; + if ((s[1] = c >> 16)) { + n++; + if ((s[2] = c >> 8)) { + n++; + if ((s[3] = c)) { + n++; + } + } + } + jl_uv_puts(stream, s, n); } extern int vasprintf(char **str, const char *fmt, va_list ap); diff --git a/stdlib/Test/src/Test.jl b/stdlib/Test/src/Test.jl index 916834c42bf3b..97d46e237343e 100644 --- a/stdlib/Test/src/Test.jl +++ b/stdlib/Test/src/Test.jl @@ -1396,8 +1396,11 @@ with string types besides the standard `String` type. 
struct GenericString <: AbstractString string::AbstractString end -Base.endof(s::GenericString) = endof(s.string) -Base.next(s::GenericString, i::Int) = next(s.string, i) +Base.ncodeunits(s::GenericString) = ncodeunits(s.string) +Base.codeunit(s::GenericString) = codeunit(s.string) +Base.codeunit(s::GenericString, i::Integer) = codeunit(s.string, i) +Base.isvalid(s::GenericString, i::Integer) = isvalid(s.string, i) +Base.next(s::GenericString, i::Integer) = next(s.string, i) Base.reverse(s::GenericString) = GenericString(reverse(s.string)) Base.reverse(s::SubString{GenericString}) = GenericString(typeof(s.string)(reverse(String(s)))) diff --git a/test/char.jl b/test/char.jl index c40f60de3be23..85b2acf5385ef 100644 --- a/test/char.jl +++ b/test/char.jl @@ -198,3 +198,25 @@ end @test sprint(show, "text/plain", '$') == "'\$': ASCII/Unicode U+0024 (category Sc: Symbol, currency)" @test repr('$') == "'\$'" + +@testset "read incomplete character at end of stream or file" begin + local file = tempname() + local iob = IOBuffer([0xf0]) + local bytes(c::Char) = Vector{UInt8}(string(c)) + @test bytes(read(iob, Char)) == [0xf0] + @test eof(iob) + try + write(file, 0xf0) + open(file) do io + @test bytes(read(io, Char)) == [0xf0] + @test eof(io) + end + let io = Base.Filesystem.open(file, Base.Filesystem.JL_O_RDONLY) + @test bytes(read(io, Char)) == [0xf0] + @test eof(io) + close(io) + end + finally + rm(file, force=true) + end +end diff --git a/test/intfuncs.jl b/test/intfuncs.jl index 779ce240add9a..062d1103c530f 100644 --- a/test/intfuncs.jl +++ b/test/intfuncs.jl @@ -134,7 +134,7 @@ end @test base(2, 5, 7) == "0000101" @test bitstring(Int16(3)) == "0000000000000011" - @test bitstring('3') == "00000000000000000000000000110011" + @test bitstring('3') == "00110011000000000000000000000000" @test bitstring(1035) == (Int == Int32 ? 
"00000000000000000000010000001011" : "0000000000000000000000000000000000000000000000000000010000001011") @test bitstring(Int128(3)) == "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011" diff --git a/test/lineedit.jl b/test/lineedit.jl index 51e2a692025f9..9da81487bd320 100644 --- a/test/lineedit.jl +++ b/test/lineedit.jl @@ -16,8 +16,8 @@ function new_state() LineEdit.init_state(term, ModalInterface([Prompt("test> ")])) end -charseek(buf, i) = seek(buf, Base.unsafe_chr2ind(content(buf), i+1)-1) -charpos(buf, pos=position(buf)) = Base.unsafe_ind2chr(content(buf), pos+1)-1 +charseek(buf, i) = seek(buf, chr2ind(content(buf), i+1)-1) +charpos(buf, pos=position(buf)) = ind2chr(content(buf), pos+1)-1 function transform!(f, s, i = -1) # i is char-based (not bytes) buffer position buf = buffer(s) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 384da5d8a70f3..cbd26b89df3ce 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -99,14 +99,14 @@ end end @testset "issue #7248" begin - @test_throws BoundsError ind2chr("hello", -1) - @test_throws BoundsError chr2ind("hello", -1) - @test_throws BoundsError ind2chr("hellø", -1) - @test_throws BoundsError chr2ind("hellø", -1) - @test_throws BoundsError ind2chr("hello", 10) - @test_throws BoundsError chr2ind("hello", 10) - @test_throws BoundsError ind2chr("hellø", 10) - @test_throws BoundsError chr2ind("hellø", 10) + @test ind2chr("hello", -1) == -1 + @test chr2ind("hello", -1) == -1 + @test ind2chr("hellø", -1) == -1 + @test chr2ind("hellø", -1) == -1 + @test ind2chr("hello", 10) == 10 + @test chr2ind("hello", 10) == 10 + @test ind2chr("hellø", 10) == 9 + @test chr2ind("hellø", 10) == 11 @test_throws BoundsError checkbounds("hello", 0) @test_throws BoundsError checkbounds("hello", 6) @test_throws BoundsError checkbounds("hello", 0:3) @@ -127,7 +127,6 @@ end @test SubString("hellø", 1, 5)[10:9] == "" @test SubString("hellø", 
1, 0)[10:9] == "" @test SubString("", 1, 0)[10:9] == "" - @test_throws BoundsError SubString("", 1, 6) @test_throws BoundsError SubString("", 1, 1) end @@ -143,8 +142,8 @@ end @test get(utf8_str, -1, 'X') == 'X' @test get(utf8_str, 1000, 'X') == 'X' - # Test that indexing into the middle of a character returns the default - @test get(utf8_str, 2, 'X') == 'X' + # Test that indexing into the middle of a character throws + @test_throws UnicodeError get(utf8_str, 2, 'X') end #= @@ -172,8 +171,10 @@ end # make sure substrings do not accept code unit if it is not start of codepoint let s = "x\u0302" + @test s[1:2] == s + @test_throws BoundsError s[0:3] + @test_throws BoundsError s[1:4] @test_throws UnicodeError s[1:3] - @test s[1:2]==s end @testset "issue #9781" begin @@ -204,8 +205,15 @@ struct tstStringType <: AbstractString end @testset "AbstractString functions" begin tstr = tstStringType(Vector{UInt8}("12")) - @test_throws ErrorException endof(tstr) - @test_throws ErrorException next(tstr, Bool(1)) + @test_throws MethodError ncodeunits(tstr) + @test_throws MethodError codeunit(tstr) + @test_throws MethodError codeunit(tstr, 1) + @test_throws MethodError codeunit(tstr, true) + @test_throws MethodError isvalid(tstr, 1) + @test_throws MethodError isvalid(tstr, true) + @test_throws MethodError next(tstr, 1) + @test_throws MethodError next(tstr, true) + @test_throws MethodError endof(tstr) gstr = GenericString("12") @test string(gstr) isa GenericString @@ -224,18 +232,19 @@ end @test done(eachindex("foobar"),7) @test eltype(Base.EachStringIndex) == Int @test map(uppercase, "foó") == "FOÓ" - @test chr2ind("fóobar",3) == 4 - - @test Symbol(gstr)==Symbol("12") + @test chr2ind("fóobar", 3) == 4 - @test_throws ErrorException sizeof(gstr) + @test Symbol(gstr) == Symbol("12") - @test length(GenericString(""))==0 + @test sizeof(gstr) == 2 + @test ncodeunits(gstr) == 2 + @test length(gstr) == 2 + @test length(GenericString("")) == 0 @test nextind(1:1, 1) == 2 @test nextind([1], 
1) == 2 - @test ind2chr(gstr,2)==2 + @test ind2chr(gstr, 2) == 2 # tests promote_rule let svec = [s"12", GenericString("12"), SubString("123", 1, 2)] @@ -463,8 +472,8 @@ end foobar(ch) = Char(0xd800) foobaz(ch) = reinterpret(Char, typemax(UInt32)) @test_throws ArgumentError map(foomap, GenericString(str)) - @test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[17])) - @test map(foobaz, GenericString(str)) == String(repeat(b"\ufffd", outer=[17])) + @test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[length(str)])) + @test map(foobaz, GenericString(str)) == String(repeat([0xff], outer=[4*length(str)])) @test "a".*["b","c"] == ["ab","ac"] @test ["b","c"].*"a" == ["ba","ca"] @@ -488,7 +497,7 @@ end @test_throws ArgumentError ascii(GenericString("Hello, ∀")) end @testset "issue #17271: endof() doesn't throw an error even with invalid strings" begin - @test endof(String(b"\x90")) == 0 + @test endof(String(b"\x90")) == 1 @test endof(String(b"\xce")) == 1 end # issue #17624, missing getindex method for String @@ -570,7 +579,7 @@ end SubString("123∀α>β:α+1>β123", 4, 18), SubString(s"123∀α>β:α+1>β123", 4, 18)] for s in strs - @test thisind(s, -2) == 0 + @test thisind(s, -2) == -2 @test thisind(s, 0) == 0 @test thisind(s, 1) == 1 @test thisind(s, 2) == 1 @@ -581,13 +590,13 @@ end @test thisind(s, 15) == 15 @test thisind(s, 16) == 15 @test thisind(s, 17) == 17 - @test thisind(s, 30) == 17 + @test thisind(s, 30) == 30 end end let strs = Any["", s"", SubString("123", 2, 1), SubString(s"123", 2, 1)] for s in strs, i in -2:2 - @test thisind(s, i) == (i > 0) + @test thisind(s, i) == i end end end @@ -612,17 +621,18 @@ end @test prevind(strs[i], 15, 4) == 10 @test prevind(strs[i], 15, 10) == 0 @test prevind(strs[i], 15, 9) == 1 - @test prevind(strs[i], 15, 10) == 0 @test prevind(strs[i], 16) == 15 @test prevind(strs[i], 16, 1) == 15 @test prevind(strs[i], 16, 2) == 14 - @test prevind(strs[i], 20) == 15 - @test prevind(strs[i], 20, 1) == 15 - 
@test prevind(strs[i], 20, 10) == 1 - @test_throws ArgumentError prevind(strs[i], 20, 0) - - @test nextind(strs[i], -1) == 1 - @test nextind(strs[i], -1, 1) == 1 + @test prevind(strs[i], 20) == 19 + @test prevind(strs[i], 20, 1) == 19 + @test prevind(strs[i], 20, 10) == 7 + @test prevind(strs[i], 20, 0) == 20 + + @test nextind(strs[i], -1) == 0 + @test nextind(strs[i], -1, 1) == 0 + @test nextind(strs[i], -1, 2) == 1 + @test nextind(strs[i], -1, 3) == 4 @test nextind(strs[i], 0, 2) == 4 @test nextind(strs[i], 0, 20) == 26 @test nextind(strs[i], 0, 10) == 15 @@ -643,7 +653,7 @@ end @test nextind(strs[i], 15, 1) == 17 @test nextind(strs[i], 20) == 21 @test nextind(strs[i], 20, 1) == 21 - @test_throws ArgumentError nextind(strs[i], 20, 0) + @test nextind(strs[i], 20, 0) == 20 for x in -10:20 n = p = x @@ -658,8 +668,8 @@ end @test prevind(strs[1], -1) == -2 @test prevind(strs[1], -1, 1) == -2 - @test prevind(strs[2], -1) == 0 - @test prevind(strs[2], -1, 1) == 0 + @test prevind(strs[2], -1) == -2 + @test prevind(strs[2], -1, 1) == -2 end end @@ -672,7 +682,7 @@ end @test first(s, 3) == "∀ϵ≠" @test first(s, 4) == "∀ϵ≠0" @test first(s, length(s)) == s - @test_throws BoundsError first(s, length(s)+1) + @test first(s, length(s)+1) == s @test_throws ArgumentError last(s, -1) @test last(s, 0) == "" @test last(s, 1) == "0" @@ -680,21 +690,13 @@ end @test last(s, 3) == "²>0" @test last(s, 4) == "ϵ²>0" @test last(s, length(s)) == s - @test_throws BoundsError last(s, length(s)+1) + @test last(s, length(s)+1) == s end @testset "invalid code point" begin s = String([0x61, 0xba, 0x41]) @test !isvalid(s) - @test_throws UnicodeError s[2] - e = try - s[2] - catch e - e - end - b = IOBuffer() - show(b, e) - @test String(take!(b)) == "UnicodeError: invalid character index 2 (0xba is a continuation byte)" + @test s[2] == reinterpret(Char, UInt32(0xba) << 24) end @testset "ncodeunits" begin diff --git a/test/strings/io.jl b/test/strings/io.jl index 7ee325c252c11..c5bf0f3e2be36 100644 --- 
a/test/strings/io.jl +++ b/test/strings/io.jl @@ -172,8 +172,7 @@ myio = IOBuffer() join(myio, "", "", 1) @test isempty(take!(myio)) -@testset "unescape_chars" begin - @test Base.unescape_chars("\\t","t") == "t" +@testset "unescape_string ArgumentErrors" begin @test_throws ArgumentError unescape_string(IOBuffer(), string('\\',"xZ")) @test_throws ArgumentError unescape_string(IOBuffer(), string('\\',"777")) end diff --git a/test/strings/types.jl b/test/strings/types.jl index 12dd75a1bd421..00bac71f826b8 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -32,12 +32,21 @@ for idx in 0:1 end # Substring provided with invalid end index throws BoundsError -@test_throws BoundsError SubString("∀", 1, 2) -@test_throws BoundsError SubString("∀", 1, 3) +@test_throws UnicodeError SubString("∀", 1, 2) +@test_throws UnicodeError SubString("∀", 1, 3) @test_throws BoundsError SubString("∀", 1, 4) # Substring provided with invalid start index throws BoundsError -@test_throws BoundsError SubString("∀∀", 2:4) +@test SubString("∀∀", 1:1) == "∀" +@test SubString("∀∀", 1:4) == "∀∀" +@test SubString("∀∀", 4:4) == "∀" +@test_throws UnicodeError SubString("∀∀", 1:2) +@test_throws UnicodeError SubString("∀∀", 1:5) +@test_throws UnicodeError SubString("∀∀", 2:4) +@test_throws BoundsError SubString("∀∀", 0:1) +@test_throws BoundsError SubString("∀∀", 0:4) +@test_throws BoundsError SubString("∀∀", 1:7) +@test_throws BoundsError SubString("∀∀", 4:7) # tests for SubString of more than one multibyte `Char` string # we are consistent with `getindex` for `String` @@ -46,10 +55,12 @@ for idx in [0, 1, 4] @test SubString("∀∀", 4, idx) == "∀∀"[4:idx] end -# second index beyond endof("∀∀") -for idx in 5:8 +# index beyond endof("∀∀") +for idx in [2:3; 5:6] + @test_throws UnicodeError SubString("∀∀", 1, idx) +end +for idx in 7:8 @test_throws BoundsError SubString("∀∀", 1, idx) - @test_throws BoundsError SubString("∀∀", 4, idx) end let str="tempus fugit" #length(str)==12 @@ -65,13 +76,13 @@ 
let str="tempus fugit" #length(str)==12 ss=SubString(str,1:0) @test length(ss)==0 - @test_throws BoundsError SubString(str,14,20) #start indexing beyond source string length - @test_throws BoundsError SubString(str,10,16) #end indexing beyond source string length + @test_throws BoundsError SubString(str, 14, 20) #start indexing beyond source string length + @test_throws BoundsError SubString(str, 10, 16) #end indexing beyond source string length @test_throws BoundsError SubString("", 1, 4) #empty source string @test_throws BoundsError SubString("", 1, 1) #empty source string, identical start and end index @test_throws BoundsError SubString("", 10, 12) - @test SubString("",12,10) == "" + @test SubString("", 12, 10) == "" end @test SubString("foobar", big(1), big(3)) == "foo" @@ -83,7 +94,7 @@ let str = "aa\u2200\u2222bb" write(b, u) @test String(take!(b)) == "\u2200\u2222" - @test_throws BoundsError SubString(str, 4, 5) + @test_throws UnicodeError SubString(str, 4, 5) @test_throws BoundsError next(u, 0) @test_throws BoundsError next(u, 7) @test_throws BoundsError getindex(u, 0) @@ -147,64 +158,69 @@ end @test ismatch(Regex(""), SubString("",1,0)) # isvalid(), chr2ind() and ind2chr() for SubString{String} -let ss, s="lorem ipsum", - sdict=Dict(SubString(s,1,11)=>s, - SubString(s,1,6)=>"lorem ", - SubString(s,1,0)=>"", - SubString(s,2,4)=>"ore", - SubString(s,2,11)=>"orem ipsum", - SubString(s,15,14)=>"" - ) - for (ss,s) in sdict - local ss - for i in -1:12 - @test isvalid(ss,i)==isvalid(s,i) +let s = "lorem ipsum", sdict = Dict( + SubString(s, 1, 11) => "lorem ipsum", + SubString(s, 1, 6) => "lorem ", + SubString(s, 1, 0) => "", + SubString(s, 2, 4) => "ore", + SubString(s, 2, 11) => "orem ipsum", + SubString(s, 15, 14) => "", +) + for (ss, s) in sdict + @test ncodeunits(ss) == ncodeunits(s) + for i in -2:13 + if 1 ≤ i ≤ ncodeunits(ss) + @test isvalid(ss, i) == isvalid(s, i) + else + @test_throws BoundsError isvalid(ss, i) + @test_throws BoundsError isvalid(s, i) + 
end end - end - for (ss,s) in sdict - local ss - for i in 1:length(ss) - @test ind2chr(ss,i)==ind2chr(s,i) + for i in 1:ncodeunits(ss) + @test ind2chr(ss, i) == ind2chr(s, i) end end - for (ss,s) in sdict - local ss + for (ss, s) in sdict + @test length(ss) == length(s) for i in 1:length(ss) - @test chr2ind(ss,i)==chr2ind(s,i) + @test chr2ind(ss, i) == chr2ind(s, i) end end -end #let +end -#for isvalid(SubString{String}) +# for isvalid(SubString{String}) let s = "Σx + βz - 2" - for i in -1:(length(s)+2) - if isvalid(s, i) - ss=SubString(s,1,i) - # make sure isvalid gives equivalent results for SubString and String - @test isvalid(ss,i)==isvalid(s,i) - else - if i > 0 - @test_throws BoundsError SubString(s,1,i) + for i in -1:ncodeunits(s)+2 + if checkbounds(Bool, s, i) + if isvalid(s, i) + ss = SubString(s, 1, i) + for j = 1:ncodeunits(ss) + @test isvalid(ss, j) == isvalid(s, j) + end else - @test SubString(s,1,i) == "" + @test_throws UnicodeError SubString(s, 1, i) end + elseif i > 0 + @test_throws BoundsError SubString(s, 1, i) + else + @test SubString(s, 1, i) == "" end end end -let ss=SubString("hello",1,5) - @test_throws BoundsError ind2chr(ss, -1) - @test_throws BoundsError chr2ind(ss, -1) - @test_throws BoundsError chr2ind(ss, 10) - @test_throws BoundsError ind2chr(ss, 10) +let ss = SubString("hello", 1, 5) + @test ind2chr(ss, -1) == -1 + @test chr2ind(ss, -1) == -1 + @test chr2ind(ss, 10) == 10 + @test ind2chr(ss, 10) == 10 end # length(SubString{String}) performance specialization let s = "|η(α)-ϕ(κ)| < ε" - @test length(SubString(s,1,0))==length(s[1:0]) - @test length(SubString(s,4,4))==length(s[4:4]) - @test length(SubString(s,1,7))==length(s[1:7]) - @test length(SubString(s,4,11))==length(s[4:11]) + @test length(SubString(s, 1, 0)) == length(s[1:0]) + @test length(SubString(s, 4, 4)) == length(s[4:4]) + @test length(SubString(s, 1, 7)) == length(s[1:7]) + @test length(SubString(s, 4, 11)) == length(s[4:11]) end @testset "reverseind" for T in (String, 
SubString, GenericString) @@ -217,7 +233,8 @@ end @test c == s[reverseind(s, ri)] == r[ri] s = convert(T, string(prefix, prefix, c, suffix, suffix)) pre = convert(T, prefix) - sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix)))) + sb = SubString(s, nextind(pre, endof(pre)), + endof(convert(T, string(prefix, prefix, c, suffix)))) r = reverse(sb) ri = search(r, c) @test c == sb[reverseind(sb, ri)] == r[ri] diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl index a9db6316d2fa9..c65934217dfb9 100644 --- a/test/unicode/utf8.jl +++ b/test/unicode/utf8.jl @@ -1,24 +1,13 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -@testset "cesu8 input" begin - let ch = 0x10000 - for hi = 0xd800:0xdbff - for lo = 0xdc00:0xdfff - @test String(Vector{UInt8}(String(Char[hi, lo]))) == string(Char(ch)) - ch += 1 - end - end - end -end - @testset "string indexing" begin let str = String(b"this is a test\xed\x80") - @test next(str, 15) == ('\ufffd', 16) + @test next(str, 15) == (reinterpret(Char, 0xed800000), 17) @test_throws BoundsError getindex(str, 0:3) @test_throws BoundsError getindex(str, 17:18) @test_throws BoundsError getindex(str, 2:17) - @test_throws UnicodeError getindex(str, 16:17) - @test string(Char(0x110000)) == "\ufffd" + @test_throws BoundsError getindex(str, 16:17) + @test string(Char(0x110000)) == String(b"\xf4\x90\x80\x80") end end @@ -36,12 +25,12 @@ end b"xyz\xf0\x80" => b"\xf0\x80zyx", b"xyz\xf0\x80\x80" => b"\xf0\x80\x80zyx", ] - @test_broken reverse(String(s)) == String(r) + @test reverse(String(s)) == String(r) end end @testset "string convert" begin @test String(b"this is a test\xed\x80\x80") == "this is a test\ud000" - ## Specifically check UTF-8 string whose lead byte is same as a surrogate + # Specifically check UTF-8 string whose lead byte is same as a surrogate @test String(b"\xed\x9f\xbf") == "\ud7ff" end