diff --git a/base/char.jl b/base/char.jl
index ea7334eb0679e..600c3f6272d55 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -1,8 +1,58 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
-convert(::Type{Char}, x::UInt32) = reinterpret(Char, x)
+struct MalformedCharError <: Exception
+    char::Char # the offending character, kept so handlers can inspect its raw bits
+end
+struct CodePointError{T<:Integer} <: Exception
+    code::T # the rejected value, stored with its concrete integer type (no abstract field)
+end
+@noinline malformed_char(c::Char) = throw(MalformedCharError(c)) # out-of-line throw helper
+@noinline code_point_err(u::UInt32) = throw(CodePointError(u)) # out-of-line throw helper
+
+function ismalformed(c::Char)
+    u = reinterpret(UInt32, c) # raw 32-bit payload of the character
+    l1 = leading_ones(u) << 3 # number of leading 1-bits of `u`, times 8
+    t0 = trailing_zeros(u) & 56 # trailing zero bits, rounded down to a multiple of 8
+    (l1 == 8) | (l1 + t0 > 32) | # exactly one leading 1-bit (top byte is 10xxxxxx), or leading-ones bytes + zero bytes exceed 4 (presumably: sequence too short)
+    (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) # a lower byte above the `t0` zero bits does not have top bits `10`
+end
+
+function convert(::Type{UInt32}, c::Char)
+    # TODO: use optimized inline LLVM
+    u = reinterpret(UInt32, c) # raw bits; the lead byte sits in the top 8 bits
+    u < 0x80000000 && return reinterpret(UInt32, u >> 24) # high bit clear: the top byte is the code point
+    l1 = leading_ones(u) # number of leading 1-bits
+    t0 = trailing_zeros(u) & 56 # trailing zero bits, rounded down to whole bytes
+    (l1 == 1) | (8l1 + t0 > 32) |
+    (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) &&
+        malformed_char(c)::Union{} # same three checks as `ismalformed`; throws MalformedCharError
+    u &= 0xffffffff >> l1 # clear the leading 1-bits
+    u >>= t0 # drop the trailing zero bytes
+    ((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) | # mask first, then shift: `>>` binds tighter than `&`
+    ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6)
+end
+
+function convert(::Type{Char}, u::UInt32)
+    u ≥ 0x80 || return reinterpret(Char, u << 24) # below 0x80: the value itself goes in the top byte
+    u ≥ 0x00200000 && code_point_err(u)::Union{}
+    # spread the payload into 6-bit groups, one group per byte
+    bits = (u & 0x0000003f) | ((u << 2) & 0x00003f00) |
+           ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000)
+    u < 0x00000800 && return reinterpret(Char, (bits << 16) | 0xc0800000)
+    u < 0x00010000 && return reinterpret(Char, (bits << 8) | 0xe0808000)
+    return reinterpret(Char, bits | 0xf0808080)
+end
+
+function convert(::Type{T}, c::Char) where T <: Union{Int8,UInt8}
+    raw = reinterpret(Int32, c) # negative exactly when the top bit of the raw value is set
+    return raw < 0 ? T(UInt32(c)) : (raw >>> 24) % T
+end
+
+function convert(::Type{Char}, b::Union{Int8,UInt8})
+    return (b < 0) | (b > 0x7f) ? Char(UInt32(b)) : reinterpret(Char, (b % UInt32) << 24)
+end
+
 convert(::Type{Char}, x::Number) = Char(UInt32(x))
-convert(::Type{UInt32}, x::Char) = reinterpret(UInt32, x)
 convert(::Type{T}, x::Char) where {T<:Number} = convert(T, UInt32(x))
 
 rem(x::Char, ::Type{T}) where {T<:Number} = rem(UInt32(x), T)
@@ -29,11 +79,9 @@ done(c::Char, state) = state
 isempty(c::Char) = false
 in(x::Char, y::Char) = x == y
 
-==(x::Char, y::Char) = UInt32(x) == UInt32(y)
-isless(x::Char, y::Char) = UInt32(x) < UInt32(y)
-
-const hashchar_seed = 0xd4d64234
-hash(x::Char, h::UInt) = hash_uint64(((UInt64(x)+hashchar_seed)<<32) ⊻ UInt64(h))
+==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y) # compares the raw bits; no decoding
+isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y) # orders by the raw bits as unsigned integers
+hash(x::Char, h::UInt) = hash(reinterpret(UInt32, x), hash(Char, h)) # hashes the raw bits, seeded with `hash(Char, h)`
 
 -(x::Char, y::Char) = Int(x) - Int(y)
 -(x::Char, y::Integer) = Char(Int32(x) - Int32(y))
@@ -66,7 +114,7 @@ function show(io::IO, c::Char)
     end
     if isprint(c)
         write(io, 0x27, c, 0x27)
-    else
+    elseif !ismalformed(c)
         u = UInt32(c)
         write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
         d = max(2, 8 - (leading_zeros(u) >> 2))
@@ -74,13 +122,29 @@ function show(io::IO, c::Char)
             write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
         end
         write(io, 0x27)
+    else # malformed
+        write(io, 0x27)
+        u = reinterpret(UInt32, c)
+        while true
+            a = hex_chars[((u >> 28) & 0xf) + 1]
+            b = hex_chars[((u >> 24) & 0xf) + 1]
+            write(io, 0x5c, 'x', a, b)
+            (u <<= 8) == 0 && break
+        end
+        write(io, 0x27)
     end
     return
 end
 
 function show(io::IO, ::MIME"text/plain", c::Char)
     show(io, c)
-    u = UInt32(c)
-    print(io, ": ", isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
-    print(io, " (category ", UTF8proc.category_abbrev(c), ": ", UTF8proc.category_string(c), ")")
+    if !ismalformed(c)
+        u = UInt32(c)
+        print(io, ": ", isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
+    else
+        print(io, ": Malformed UTF-8")
+    end
+    abr = UTF8proc.category_abbrev(c)
+    str = UTF8proc.category_string(c)
+    print(io, " (category ", abr, ": ", str, ")")
 end
diff --git a/base/filesystem.jl b/base/filesystem.jl
index c5f8e4b10854d..6268d1d420752 100644
--- a/base/filesystem.jl
+++ b/base/filesystem.jl
@@ -149,6 +149,26 @@ function read(f::File, ::Type{UInt8})
     return ret % UInt8
 end
 
+function read(f::File, ::Type{Char})
+    b0 = read(f, UInt8) # lead byte
+    l = 8(4-leading_ones(b0)) # lowest shift a continuation byte may occupy; negative for 5+ leading ones
+    c = UInt32(b0) << 24 # lead byte goes in the top 8 bits
+    if 0 ≤ l < 24 # only lead bytes with 2–4 leading 1-bits take continuation bytes
+        s = 16
+        while s ≥ l && !eof(f)
+            p = position(f) # remember where this byte starts so it can be un-read
+            b = read(f, UInt8)
+            if b & 0xc0 != 0x80 # not of the form 10xxxxxx: leave it for the next read
+                seek(f, p)
+                break
+            end
+            c |= UInt32(b) << s
+            s -= 8
+        end
+    end
+    return reinterpret(Char, c) # bytes are stored as read, without validation
+end
+
 function unsafe_read(f::File, p::Ptr{UInt8}, nel::UInt)
     check_open(f)
     ret = ccall(:jl_fs_read, Int32, (Int32, Ptr{Void}, Csize_t),
diff --git a/base/intfuncs.jl b/base/intfuncs.jl
index abc1fd95b3e6a..76b45f90cf4e8 100644
--- a/base/intfuncs.jl
+++ b/base/intfuncs.jl
@@ -654,8 +654,8 @@ for sym in (:bin, :oct, :dec, :hex)
     @eval begin
         ($sym)(x::Unsigned, p::Int) = ($sym)(x,p,false)
         ($sym)(x::Unsigned)         = ($sym)(x,1,false)
-        ($sym)(x::Char, p::Int)     = ($sym)(unsigned(x),p,false)
-        ($sym)(x::Char)             = ($sym)(unsigned(x),1,false)
+        ($sym)(x::Char, p::Int)     = ($sym)(UInt32(x),p,false)
+        ($sym)(x::Char)             = ($sym)(UInt32(x),1,false)
         ($sym)(x::Integer, p::Int)  = ($sym)(unsigned(abs(x)),p,x<0)
         ($sym)(x::Integer)          = ($sym)(unsigned(abs(x)),1,x<0)
     end
diff --git a/base/io.jl b/base/io.jl
index 4d7f745b126e5..030cd9c5698e0 100644
--- a/base/io.jl
+++ b/base/io.jl
@@ -432,25 +432,13 @@ function write(s::IO, a::SubArray{T,N,<:Array}) where {T,N}
     end
 end
 
-
-function write(s::IO, ch::Char)
-    c = reinterpret(UInt32, ch)
-    if c < 0x80
-        return write(s, c%UInt8)
-    elseif c < 0x800
-        return (write(s, (( c >> 6          ) | 0xC0)%UInt8)) +
-               (write(s, (( c        & 0x3F ) | 0x80)%UInt8))
-    elseif c < 0x10000
-        return (write(s, (( c >> 12         ) | 0xE0)%UInt8)) +
-               (write(s, (((c >> 6)  & 0x3F ) | 0x80)%UInt8)) +
-               (write(s, (( c        & 0x3F ) | 0x80)%UInt8))
-    elseif c < 0x110000
-        return (write(s, (( c >> 18         ) | 0xF0)%UInt8)) +
-               (write(s, (((c >> 12) & 0x3F ) | 0x80)%UInt8)) +
-               (write(s, (((c >> 6)  & 0x3F ) | 0x80)%UInt8)) +
-               (write(s, (( c        & 0x3F ) | 0x80)%UInt8))
-    else
-        return write(s, '\ufffd')
+function write(io::IO, c::Char)
+    rest = bswap(reinterpret(UInt32, c)) # byte-swapped so the top byte comes out first
+    nbytes = 0
+    while true
+        write(io, rest % UInt8)
+        nbytes += 1
+        (rest >>= 8) == 0 && return nbytes # stop once only zero bytes remain
     end
 end
 
@@ -493,23 +481,20 @@ function read!(s::IO, a::Array{T}) where T
     return a
 end
 
-function read(s::IO, ::Type{Char})
-    ch = read(s, UInt8)
-    if ch < 0x80
-        return Char(ch)
-    end
-
-    # mimic utf8.next function
-    trailing = Base.utf8_trailing[ch+1]
-    c::UInt32 = 0
-    for j = 1:trailing
-        c += ch
-        c <<= 6
-        ch = read(s, UInt8)
+function read(io::IO, ::Type{Char})
+    b0 = read(io, UInt8) # lead byte
+    l = 8(4-leading_ones(b0)) # lowest shift a continuation byte may occupy; negative for 5+ leading ones
+    c = UInt32(b0) << 24 # lead byte goes in the top 8 bits
+    if 0 ≤ l < 24 # only lead bytes with 2–4 leading 1-bits take continuation bytes
+        s = 16
+        while s ≥ l && !eof(io)
+            peek(io) & 0xc0 == 0x80 || break # next byte is not 10xxxxxx: leave it unread
+            b = read(io, UInt8)
+            c |= UInt32(b) << s
+            s -= 8
+        end
     end
-    c += ch
-    c -= Base.utf8_offset[trailing+1]
-    return Char(c)
+    return reinterpret(Char, c) # bytes are stored as read, without validation
 end
 
 # readuntil_string is useful below since it has
@@ -517,7 +502,7 @@ end
 readuntil_string(s::IO, delim::UInt8) = String(readuntil(s, delim))
 
 function readuntil(s::IO, delim::Char)
-    if delim < Char(0x80)
+    if delim ≤ '\x7f'
         return readuntil_string(s, delim % UInt8)
     end
     out = IOBuffer()
@@ -598,7 +583,7 @@ function readuntil(io::IO, target::AbstractString)
     i = start(target)
     done(target, i) && return ""
     c, i = next(target, start(target))
-    if done(target, i) && c < Char(0x80)
+    if done(target, i) && c <= '\x7f'
         return readuntil_string(io, c % UInt8)
     end
     # decide how we can index target
@@ -625,14 +610,13 @@ function readuntil(io::IO, target::AbstractVector{T}) where T
     return out
 end
 
-
 """
     readchomp(x)
 
-Read the entirety of `x` as a string and remove a single trailing newline.
-Equivalent to `chomp!(read(x, String))`.
+Read the entirety of `x` as a string and remove a single trailing newline
+if there is one. Equivalent to `chomp(read(x, String))`.
 """
-readchomp(x) = chomp!(read(x, String))
+readchomp(x) = chomp(read(x, String))
 
 # read up to nb bytes into nb, returning # bytes read
 
diff --git a/base/iostream.jl b/base/iostream.jl
index 117bf77e7f8a6..347b86ca10f34 100644
--- a/base/iostream.jl
+++ b/base/iostream.jl
@@ -315,12 +315,13 @@ end
 
 ## low-level calls ##
 
-write(s::IOStream, b::UInt8) = Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios))
+function write(s::IOStream, b::UInt8)
+    !iswritable(s) && throw(ArgumentError("write failed, IOStream is not writeable"))
+    return Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios)) # result of `ios_putc`, as an Int
+end
 
 function unsafe_write(s::IOStream, p::Ptr{UInt8}, nb::UInt)
-    if !iswritable(s)
-        throw(ArgumentError("write failed, IOStream is not writeable"))
-    end
+    iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable"))
     return Int(ccall(:ios_write, Csize_t, (Ptr{Void}, Ptr{Void}, Csize_t), s.ios, p, nb))
 end
 
@@ -353,14 +354,6 @@ end
 
 ## text I/O ##
 
-function write(s::IOStream, c::Char)
-    if !iswritable(s)
-        throw(ArgumentError("write failed, IOStream is not writeable"))
-    end
-    Int(ccall(:ios_pututf8, Cint, (Ptr{Void}, UInt32), s.ios, c))
-end
-read(s::IOStream, ::Type{Char}) = Char(ccall(:jl_getutf8, UInt32, (Ptr{Void},), s.ios))
-
 take!(s::IOStream) =
     ccall(:jl_take_buffer, Vector{UInt8}, (Ptr{Void},), s.ios)
 
@@ -452,14 +445,23 @@ function read(s::IOStream, nb::Integer; all::Bool=true)
 end
 
 ## Character streams ##
-const _chtmp = Ref{Char}()
+
 function peekchar(s::IOStream)
-    if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{Char}), s, _chtmp) < 0
+    chref = Ref{UInt32}()
+    if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{UInt32}), s, chref) < 0
         return typemax(Char)
     end
-    return _chtmp[]
+    return Char(chref[])
 end
 
 function peek(s::IOStream)
     ccall(:ios_peekc, Cint, (Ptr{Void},), s)
 end
+
+function peek(s::IO)
+    mark(s) # remember the current position
+    try return read(s, UInt8)
+    finally
+        reset(s) # go back to the mark whether or not the read succeeded
+    end
+end
diff --git a/base/parse.jl b/base/parse.jl
index 87447ba0a0a90..086cf86e46515 100644
--- a/base/parse.jl
+++ b/base/parse.jl
@@ -224,12 +224,12 @@ end
 ## string to float functions ##
 
 tryparse(::Type{Float64}, s::String) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
-tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
+tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
 tryparse_internal(::Type{Float64}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
 tryparse_internal(::Type{Float64}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)
 
 tryparse(::Type{Float32}, s::String) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
-tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
+tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
 tryparse_internal(::Type{Float32}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
 tryparse_internal(::Type{Float32}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)
 
diff --git a/base/regex.jl b/base/regex.jl
index 344730007d7ec..0d2ecc935297c 100644
--- a/base/regex.jl
+++ b/base/regex.jl
@@ -303,8 +303,12 @@ struct SubstitutionString{T<:AbstractString} <: AbstractString
     string::T
 end
 
-endof(s::SubstitutionString) = endof(s.string)
-next(s::SubstitutionString, idx::Int) = next(s.string, idx)
+ncodeunits(s::SubstitutionString) = ncodeunits(s.string)
+codeunit(s::SubstitutionString) = codeunit(s.string)
+codeunit(s::SubstitutionString, i::Integer) = codeunit(s.string, i)
+isvalid(s::SubstitutionString, i::Integer) = isvalid(s.string, i)
+next(s::SubstitutionString, i::Integer) = next(s.string, i)
+
 function show(io::IO, s::SubstitutionString)
     print(io, "s")
     show(io, s.string)
diff --git a/base/repl/REPLCompletions.jl b/base/repl/REPLCompletions.jl
index 3e5056d613f26..2c4ba328093fa 100644
--- a/base/repl/REPLCompletions.jl
+++ b/base/repl/REPLCompletions.jl
@@ -106,7 +106,7 @@ const sorted_keywords = [
     "primitive type", "quote", "return", "struct",
     "true", "try", "using", "while"]
 
-function complete_keyword(s::String)
+function complete_keyword(s::Union{String,SubString{String}})
     r = searchsorted(sorted_keywords, s)
     i = first(r)
     n = length(sorted_keywords)
diff --git a/base/stream.jl b/base/stream.jl
index 4cf2d753f67ef..ab06e16f64913 100644
--- a/base/stream.jl
+++ b/base/stream.jl
@@ -1148,6 +1148,14 @@ unmark(x::LibuvStream)   = unmark(x.buffer)
 reset(x::LibuvStream)    = reset(x.buffer)
 ismarked(x::LibuvStream) = ismarked(x.buffer)
 
+function peek(s::LibuvStream)
+    mark(s) # remember the current position
+    try return read(s, UInt8)
+    finally
+        reset(s) # go back to the mark whether or not the read succeeded
+    end
+end
+
 # BufferStream's are non-OS streams, backed by a regular IOBuffer
 mutable struct BufferStream <: LibuvStream
     buffer::IOBuffer
diff --git a/base/strings/basic.jl b/base/strings/basic.jl
index 734f1cc6f9041..2d21a7ad5d609 100644
--- a/base/strings/basic.jl
+++ b/base/strings/basic.jl
@@ -1,57 +1,188 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
-## core string functions ##
+"""
+The `AbstractString` type is the supertype of all string implementations in
+Julia. Strings are encodings of sequences of [Unicode](https://unicode.org/)
+code points as represented by the `Char` type. Julia makes a few assumptions
+about strings:
 
-endof(s::AbstractString) = error("you must implement endof(", typeof(s), ")")
-next(s::AbstractString, i::Int) = error("you must implement next(", typeof(s), ",Int)")
-next(s::AbstractString, i::Integer) = next(s,Int(i))
+* Strings are encoded in terms of fixed-size "code units"
+  * Code units can be extracted with `codeunit(s, i)`
+  * The first code unit has index `1`
+  * The last code unit has index `ncodeunits(s)`
+  * Any index `i` such that `1 ≤ i ≤ ncodeunits(s)` is in bounds
+* String indexing is done in terms of these code units:
+  * Characters are extracted by `s[i]` with a valid string index `i`
+  * Each `Char` in a string is encoded by one or more code units
+  * Only the index of the first code unit of a `Char` is a valid index
+  * The encoding of a `Char` is independent of what precedes or follows it
+  * String encodings are "self-synchronizing" – i.e. `isvalid(s,i)` is O(1)
 
-string() = ""
-string(s::AbstractString) = s
+Some string functions error if you use an out-of-bounds or invalid string index,
+including code unit extraction `codeunit(s,i)`, string indexing `s[i]`, and
+string iteration `next(s,i)`. Other string functions take a more relaxed
+approach to indexing and give you the closest valid string index when in-bounds,
+or when out-of-bounds, behave as if there were an infinite number of characters
+padding each side of the string. Usually these imaginary padding characters have
+code unit length `1`, but string types may choose different sizes. Relaxed
+indexing functions include those intended for index arithmetic: `thisind`,
+`nextind` and `prevind`. This model allows index arithmetic to work with
+out-of-bounds indices as intermediate values so long as one never uses them to
+retrieve a character, which often helps avoid needing to code around edge cases.
 
-(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s))
-(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s)
-(::Type{Vector{Char}})(s::AbstractString) = collect(s)
+See also: `codeunit`, `ncodeunits`, `thisind`, `nextind`, `prevind`
+"""
+AbstractString
 
-Symbol(s::AbstractString) = Symbol(String(s))
+## required string functions ##
 
-# string types are convertible
-convert(::Type{T}, s::T) where {T<:AbstractString} = s
-convert(::Type{T}, s::AbstractString) where {T<:AbstractString} = T(s)
+"""
+    ncodeunits(s::AbstractString) -> Int
 
-## generic supplied functions ##
+Return the number of code units in a string. Indices that are in bounds to
+access this string must satisfy `1 ≤ i ≤ ncodeunits(s)`. Not all such indices
+are valid – they may not be the start of a character, but they will return a
+code unit value when calling `codeunit(s,i)`.
 
-start(s::AbstractString) = 1
-done(s::AbstractString,i) = (i > endof(s))
-getindex(s::AbstractString, i::Int) = next(s,i)[1]
-getindex(s::AbstractString, i::Integer) = s[Int(i)]
-getindex(s::AbstractString, i::Colon) = s
-getindex(s::AbstractString, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
-# TODO: handle other ranges with stride ±1 specially?
-getindex(s::AbstractString, v::AbstractVector{<:Integer}) =
-    sprint(length(v), io->(for i in v; write(io,s[i]) end))
-getindex(s::AbstractString, v::AbstractVector{Bool}) =
-    throw(ArgumentError("logical indexing not supported for strings"))
+See also: `codeunit`, `checkbounds`, `sizeof`, `length`, `endof`
+"""
+ncodeunits(s::AbstractString)
 
-get(s::AbstractString, i::Integer, default) = isvalid(s,i) ? s[i] : default
+"""
+    codeunit(s::AbstractString) -> Type{<:Union{UInt8, UInt16, UInt32}}
+
+Return the code unit type of the given string object. For ASCII, Latin-1, or
+UTF-8 encoded strings, this would be `UInt8`; for UCS-2 and UTF-16 it would be
+`UInt16`; for UTF-32 it would be `UInt32`. The code unit type need not be
+limited to these three types, but it's hard to think of widely used string
+encodings that don't use one of these units. `codeunit(s)` is the same as
+`typeof(codeunit(s,1))` when `s` is a non-empty string.
 
+See also: `ncodeunits`
 """
-    sizeof(s::AbstractString)
+codeunit(s::AbstractString)
+
+"""
+    codeunit(s::AbstractString, i::Integer) -> Union{UInt8, UInt16, UInt32}
+
+Return the code unit value in the string `s` at index `i`. Note that
 
-The number of bytes in string `s`.
+    codeunit(s, i) :: codeunit(s)
+
+I.e. the value returned by `codeunit(s, i)` is of the type returned by
+`codeunit(s)`.
+
+See also: `ncodeunits`, `checkbounds`
+"""
+codeunit(s::AbstractString, i::Integer) = typeof(i) === Int ?
+    throw(MethodError(codeunit, Tuple{typeof(s),Int})) :
+        codeunit(s, Int(i))
+
+"""
+    isvalid(s::AbstractString, i::Integer) -> Bool
+
+Predicate indicating whether the given index is the start of the encoding of
+a character in `s` or not. If `isvalid(s, i)` is true then `s[i]` will return
+the character whose encoding starts at that index, if it's false, then `s[i]`
+will raise an invalid index error. Behavior of `next(s, i)` is similar except
+that the character is returned along with the index of the following character.
+In order for `isvalid(s, i)` to be an O(1) function, the encoding of `s` must
+be [self-synchronizing](https://en.wikipedia.org/wiki/Self-synchronizing_code);
+this is a basic assumption of Julia's generic string support.
+
+See also: `getindex`, `next`, `thisind`, `nextind`, `prevind`, `length`
 
 # Examples
+
 ```jldoctest
-julia> sizeof("❤")
-3
+julia> str = "αβγdef";
+
+julia> isvalid(str, 1)
+true
+
+julia> str[1]
+'α': Unicode U+03b1 (category Ll: Letter, lowercase)
+
+julia> isvalid(str, 2)
+false
+
+julia> str[2]
+ERROR: UnicodeError: invalid character index
+Stacktrace:
+[...]
 ```
 """
-sizeof(s::AbstractString) = error("type $(typeof(s)) has no canonical binary representation")
+isvalid(s::AbstractString, i::Integer) = typeof(i) === Int ?
+    throw(MethodError(isvalid, Tuple{typeof(s),Int})) :
+        isvalid(s, Int(i))
+
+"""
+    next(s::AbstractString, i::Integer) -> Tuple{Char, Int}
+
+Return a tuple of the character in `s` at index `i` with the index of the start
+of the following character in `s`. This is the key method that allows strings to
+be iterated, yielding a sequence of characters. If `i` is out of bounds in `s`
+then a bounds error is raised; if `i` is not a valid character index in `s` then
+a Unicode index error is raised.
+
+See also: `getindex`, `start`, `done`, `checkbounds`
+"""
+next(s::AbstractString, i::Integer) = typeof(i) === Int ?
+    throw(MethodError(next, Tuple{typeof(s),Int})) :
+        next(s, Int(i))
+
+## basic generic definitions ##
 
+start(s::AbstractString) = 1
+done(s::AbstractString, i::Integer) = i > ncodeunits(s)
 eltype(::Type{<:AbstractString}) = Char
+sizeof(s::AbstractString) = ncodeunits(s) * sizeof(codeunit(s))
+endof(s::AbstractString) = thisind(s, ncodeunits(s))
+
+getindex(s::AbstractString, i::Integer) = next(s, i)[1]
+getindex(s::AbstractString, i::Colon) = s
+# TODO: handle other ranges with stride ±1 specially?
+getindex(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, r)
+getindex(s::AbstractString, v::AbstractVector{<:Integer}) =
+    sprint(length(v), io->(for i in v; write(io, s[i]) end))
+getindex(s::AbstractString, v::AbstractVector{Bool}) =
+    throw(ArgumentError("logical indexing not supported for strings"))
+
+get(s::AbstractString, i::Integer, default) = checkbounds(Bool, s, i) ? s[i] : default
+
+## bounds checking ##
+
+checkbounds(::Type{Bool}, s::AbstractString, i::Integer) =
+    1 ≤ i ≤ ncodeunits(s) # in bounds means: addresses an existing code unit
+checkbounds(::Type{Bool}, s::AbstractString, r::AbstractRange{<:Integer}) =
+    isempty(r) || (1 ≤ minimum(r) && maximum(r) ≤ ncodeunits(s)) # an empty range is always in bounds
+checkbounds(::Type{Bool}, s::AbstractString, I::AbstractArray{<:Real}) =
+    all(i -> checkbounds(Bool, s, i), I) # Bool form per element: the predicate must return false, not throw
+checkbounds(::Type{Bool}, s::AbstractString, I::AbstractArray{<:Integer}) =
+    all(i -> checkbounds(Bool, s, i), I) # Bool form per element: the predicate must return false, not throw
+checkbounds(s::AbstractString, I::Union{Integer,AbstractArray}) =
+    checkbounds(Bool, s, I) || throw(BoundsError(s, I)) # throwing form; the error reports the whole index `I`
+
+## construction, conversion, promotion ##
+
+string() = ""
+string(s::AbstractString) = s
+
+(::Type{Vector{UInt8}})(s::AbstractString) = Vector{UInt8}(String(s))
+(::Type{Array{UInt8}})(s::AbstractString) = Vector{UInt8}(s)
+(::Type{Vector{Char}})(s::AbstractString) = collect(s)
+
+Symbol(s::AbstractString) = Symbol(String(s))
+
+convert(::Type{T}, s::T) where {T<:AbstractString} = s
+convert(::Type{T}, s::AbstractString) where {T<:AbstractString} = T(s)
+
+promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String
+
+## string & character concatenation ##
 
 """
-    *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...)
+    *(s::Union{AbstractString, Char}, t::Union{AbstractString, Char}...) -> String
 
 Concatenate strings and/or characters, producing a [`String`](@ref). This is equivalent
 to calling the [`string`](@ref) function on the arguments.
@@ -69,49 +200,16 @@ julia> 'j' * "ulia"
 
 one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "")
 
-# generic number of code units; implementations generally know how long a string
-# is though and should override this with a more efficient method
-ncodeunits(s::AbstractString) = nextind(s, endof(s)) - 1
+## generic string comparison ##
 
 """
-    length(s::AbstractString)
+    cmp(a::AbstractString, b::AbstractString) -> Int
 
-The number of characters in string `s`.
-
-# Examples
-```jldoctest
-julia> length("jμΛIα")
-5
-```
-"""
-function length(s::AbstractString)
-    i = start(s)
-    if done(s,i)
-        return 0
-    end
-    n = 1
-    while true
-        c, j = next(s,i)
-        if done(s,j)
-            return n
-        end
-        n += 1
-        i = j
-    end
-end
-
-## string comparison functions ##
-"""
-    cmp(a::AbstractString, b::AbstractString)
-
-Compare two strings for equality.
-
-Return `0` if both strings have the same length and the character
-at each index is the same in both strings.
-Return `-1` if `a` is a substring of `b`, or if `a` comes before
-`b` in alphabetical order.
-Return `1` if `b` is a substring of `a`, or if `b` comes before
-`a` in alphabetical order.
+Compare two strings for equality. Return `0` if both strings have the same
+length and the character at each index is the same in both strings. Return `-1`
+if `a` is a substring of `b`, or if `a` comes before `b` in alphabetical order.
+Return `1` if `b` is a substring of `a`, or if `b` comes before `a` in
+alphabetical order (technically, lexicographical order by Unicode code points).
 
 # Examples
 ```jldoctest
@@ -138,28 +236,23 @@ julia> cmp("b", "β")
 ```
 """
 function cmp(a::AbstractString, b::AbstractString)
-    if a === b
-        return 0
-    end
+    a === b && return 0
     i = start(a)
     j = start(b)
-    while !done(a,i)
-        if done(b,j)
-            return +1
-        end
-        c, i = next(a,i)
-        d, j = next(b,j)
-        if c != d
-            return c < d ? -1 : +1
-        end
+    while !done(a, i)
+        done(b, j) && return 1
+        c, i = next(a, i)
+        d, j = next(b, j)
+        c ≠ d && return ifelse(c < d, -1, 1)
     end
-    done(b,j) ? 0 : -1
+    return ifelse(done(b, j), 0, -1)
 end
 
 """
-    ==(a::AbstractString, b::AbstractString)
+    ==(a::AbstractString, b::AbstractString) -> Bool
 
-Test whether two strings are equal character by character.
+Test whether two strings are equal character by character (technically, Unicode
+code point by code point).
 
 # Examples
 ```jldoctest
@@ -170,12 +263,13 @@ julia> "abc" == "αβγ"
 false
 ```
 """
-==(a::AbstractString, b::AbstractString) = cmp(a,b) == 0
+==(a::AbstractString, b::AbstractString) = cmp(a, b) == 0
 
 """
-    isless(a::AbstractString, b::AbstractString)
+    isless(a::AbstractString, b::AbstractString) -> Bool
 
-Test whether string `a` comes before string `b` in alphabetical order.
+Test whether string `a` comes before string `b` in alphabetical order
+(technically, in lexicographical order by Unicode code points).
 
 # Examples
 ```jldoctest
@@ -189,64 +283,58 @@ julia> isless("a", "a")
 false
 ```
 """
-isless(a::AbstractString, b::AbstractString) = cmp(a,b) < 0
+isless(a::AbstractString, b::AbstractString) = cmp(a, b) < 0
 
 # faster comparisons for symbols
 
 cmp(a::Symbol, b::Symbol) = Int(sign(ccall(:strcmp, Int32, (Cstring, Cstring), a, b)))
 
-isless(a::Symbol, b::Symbol) = cmp(a,b) < 0
+isless(a::Symbol, b::Symbol) = cmp(a, b) < 0
 
-## Generic validation functions ##
+## character index arithmetic ##
 
 """
-    isvalid(str::AbstractString, i::Integer)
+    length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s)) -> Integer
+
+The number of characters in string `s` from indices `lo` through `hi`. This is
+computed as the number of code unit indices from `lo` to `hi` which are valid
+character indices. With only a single string argument, this computes the
+number of characters in the entire string. If `lo` or `hi` are out of range,
+each out of range code unit is considered to be one character. This matches the
+"loose" indexing model of `thisind`, `nextind` and `prevind`.
 
-Tell whether index `i` is valid for the given string.
+See also: `isvalid`, `ncodeunits`, `endof`, `thisind`, `nextind`, `prevind`
 
 # Examples
 ```jldoctest
-julia> str = "αβγdef";
-
-julia> isvalid(str, 1)
-true
-
-julia> str[1]
-'α': Unicode U+03b1 (category Ll: Letter, lowercase)
-
-julia> isvalid(str, 2)
-false
-
-julia> str[2]
-ERROR: UnicodeError: invalid character index
-Stacktrace:
-[...]
+julia> length("jμΛIα")
+5
 ```
 """
-function isvalid(s::AbstractString, i::Integer)
-    i < 1 && return false
-    done(s,i) && return false
-    try
-        next(s,i)
-        true
-    catch
-        false
+function length(s::AbstractString, lo::Integer=1, hi::Integer=ncodeunits(s))
+    ncu = ncodeunits(s)
+    from = Int(max(1, min(ncu, lo))) # both ends clamped to existing code units
+    to = Int(min(ncu, max(1, hi)))
+    starts = 0 # character starts found inside the clamped window
+    for k in from:to
+        isvalid(s, k) && (starts += 1)
+    end
+    return (from - to) + starts + hi - lo # the other terms add one per out-of-range index
+end
 
-## Generic indexing functions ##
-
 """
-    thisind(s::AbstractString, i::Integer)
+    thisind(s::AbstractString, i::Integer) -> Int
 
-If `i` is the index into a character in `s` then `thisind` returns the index of the
-start of that character. If `i < start(s)` then it returns `start(s) - 1`.
-If `i > ncodeunits(s)` then it returns `ncodeunits(s) + 1`.
+If `i` is in bounds in `s` return the index of the start of the character whose
+encoding code unit `i` is part of. In other words, if `i` is the start of a
+character, return `i`; if `i` is not the start of a character, rewind until the
+start of a character and return that index. If `i` is out of bounds in `s`
+return `i`.
 
 # Examples
 ```jldoctest
 julia> thisind("αβγdef", -5)
-0
+-5
 
 julia> thisind("αβγdef", 1)
 1
@@ -264,23 +352,24 @@ julia> thisind("αβγdef", 10)
 10
 
 julia> thisind("αβγdef", 20)
-10
+20
 """
 function thisind(s::AbstractString, i::Integer)
-    j = Int(i)
-    isvalid(s, j) && return j
-    j < start(s) && return 0
-    n = ncodeunits(s)
-    j > n && return n + 1
-    prevind(s, j)
+    i ≤ ncodeunits(s) || return i
+    @inbounds while 1 < i && !isvalid(s, i)
+        i -= 1
+    end
+    return i
 end
 
 """
-    prevind(str::AbstractString, i::Integer, nchar::Integer=1)
+    prevind(str::AbstractString, i::Integer, n::Integer=1) -> Int
 
-Get the previous valid string index before `i`.
-Returns a value less than `1` at the beginning of the string.
-If the `nchar` argument is given the function goes back `nchar` characters.
+If `i` is in bounds in `s` return the index of the start of the character whose
+encoding starts before index `i`. In other words, if `i` is the start of a
+character, return the start of the previous character; if `i` is not the start
+of a character, rewind until the start of a character and return that index.
+If `i` is out of bounds in `s` return `i - 1`. If `n == 0` return `i`.
 
 # Examples
 ```jldoctest
@@ -290,51 +379,32 @@ julia> prevind("αβγdef", 3)
 julia> prevind("αβγdef", 1)
 0
 
+julia> prevind("αβγdef", 0)
+-1
+
 julia> prevind("αβγdef", 3, 2)
 0
 ```
 """
-function prevind(s::AbstractString, i::Integer)
-    e = endof(s)
-    if i > e
-        return e
+function prevind(s::AbstractString, i::Integer, n::Integer=1)
+    n < 0 && throw(ArgumentError("n cannot be negative: $n"))
+    z = ncodeunits(s) + 1
+    if i > z
+        n -= i - z
+        i = z
     end
-    j = Int(i)-1
-    while j >= 1
-        if isvalid(s,j)
-            return j
-        end
-        j -= 1
-    end
-    return 0 # out of range
-end
-
-function prevind(s::AbstractString, i::Integer, nchar::Integer)
-    nchar > 0 || throw(ArgumentError("nchar must be greater than 0"))
-    e = endof(s)
-    j = Int(i)
-    j < 1 && return 0
-    while nchar > 0
-        if j > e
-            j = e
-        else
-            j -= 1
-            while j >= 1 && !isvalid(s,j)
-                j -= 1
-            end
-        end
-        j < 1 && return 0
-        nchar -= 1
+    while n > 0 && 1 < i
+        @inbounds n -= isvalid(s, i -= 1)
     end
-    j
+    return i - n
 end
 
 """
-    nextind(str::AbstractString, i::Integer, nchar::Integer=1)
+    nextind(str::AbstractString, i::Integer, n::Integer=1) -> Int
 
-Get the next valid string index after `i`.
-Returns a value greater than `endof(str)` at or after the end of the string.
-If the `nchar` argument is given the function goes forward `nchar` characters.
+If `i` is in bounds in `s` return the index of the start of the character whose
+encoding starts after index `i`. If `i` is out of bounds in `s` return `i + 1`.
+If `n == 0` return `i`.
 
 # Examples
 ```jldoctest
@@ -353,48 +423,19 @@ julia> nextind(str, 9)
 10
 ```
 """
-function nextind(s::AbstractString, i::Integer)
-    e = endof(s)
+function nextind(s::AbstractString, i::Integer, n::Integer=1)  # step forward n characters from i
+    n < 0 && throw(ArgumentError("n cannot be negative: $n"))
     if i < 1
-        return 1
+        n += i - 1  # each index from i up to 1 costs one step (n may become negative)
+        i = 1
     end
-    if i > e
-        return Int(i)+1
+    z = ncodeunits(s)
+    while n > 0 && i < z  # walk up towards the last code unit
+        @inbounds n -= isvalid(s, i += 1)  # only valid (character start) indices use up n
     end
-    for j = Int(i)+1:e
-        if isvalid(s,j)
-            return j
-        end
-    end
-    next(s,e)[2] # out of range
+    return i + n  # any n still left over is added as plain index steps
 end
 
-function nextind(s::AbstractString, i::Integer, nchar::Integer)
-    nchar > 0 || throw(ArgumentError("nchar must be greater than 0"))
-    e = endof(s)
-    j = Int(i)
-    while nchar > 0
-        if j < 1
-            j = 1
-        else
-            j > e && return j + nchar
-            j == e && return next(s,e)[2] + nchar - 1
-            for outer j = j+1:e
-                isvalid(s,j) && break
-            end
-        end
-        nchar -= 1
-    end
-    j
-end
-
-checkbounds(s::AbstractString, i::Integer) = start(s) <= i <= endof(s) || throw(BoundsError(s, i))
-checkbounds(s::AbstractString, r::AbstractRange{<:Integer}) = isempty(r) || (minimum(r) >= start(s) && maximum(r) <= endof(s)) || throw(BoundsError(s, r))
-# The following will end up using a deprecated checkbounds, when the covariant parameter is not Integer
-checkbounds(s::AbstractString, I::AbstractArray{<:Real}) = all(i -> checkbounds(s, i), I)
-checkbounds(s::AbstractString, I::AbstractArray{<:Integer}) = all(i -> checkbounds(s, i), I)
-
-
 """
     ind2chr(s::AbstractString, i::Integer)
 
@@ -414,10 +455,7 @@ julia> chr2ind(str, 2)
 3
 ```
 """
-function ind2chr(s::AbstractString, i::Integer)
-    s[i] # throws error if invalid
-    unsafe_ind2chr(s, i)
-end
+ind2chr(s::AbstractString, i::Integer) = length(s, 1, i)
 
 """
     chr2ind(s::AbstractString, i::Integer)
@@ -437,26 +475,10 @@ julia> ind2chr(str, 3)
 2
 ```
 """
-function chr2ind(s::AbstractString, i::Integer)
-    i < start(s) && throw(BoundsError(s, i))
-    k = unsafe_chr2ind(s, i)
-    s[k] # throws error if invalid
-    k
-end
-
-function map_chr_ind(s::AbstractString, i::Integer, stop, ret)
-    j = 1
-    k = start(s)
-    while true
-        i == stop((j, k)) && return ret((j, k)) # k could point after the last character
-        _, k = next(s, k)
-        j += 1
-    end
-end
-
-unsafe_ind2chr(s::AbstractString, i::Integer) = map_chr_ind(s, i, last, first)
-unsafe_chr2ind(s::AbstractString, i::Integer) = map_chr_ind(s, i, first, last)
+chr2ind(s::AbstractString, n::Integer) =
+    n < 0 ? prevind(s, 0, -n) : nextind(s, 0, n)
 
+## string index iteration type ##
 
 struct EachStringIndex{T<:AbstractString}
     s::T
@@ -490,13 +512,9 @@ julia> isascii("αβγ")
 false
 ```
 """
-isascii(c::Char) = c < Char(0x80)
+isascii(c::Char) = reinterpret(Int32, c) ≥ 0
 isascii(s::AbstractString) = all(isascii, s)
 
-## string promotion rules ##
-
-promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String
-
 """
     isxdigit(c::Char) -> Bool
 
@@ -512,12 +530,12 @@ julia> isxdigit('x')
 false
 ```
 """
-isxdigit(c::Char) = '0'<=c<='9' || 'a'<=c<='f' || 'A'<=c<='F'
+isxdigit(c::Char) = '0' ≤ c ≤ '9' || 'a' ≤ c ≤ 'f' || 'A' ≤ c ≤ 'F'
 
 ## uppercase, lowercase, and titlecase transformations ##
 
 """
-    uppercase(s::AbstractString)
+    uppercase(s::AbstractString) -> String
 
 Return `s` with all characters converted to uppercase.
 
@@ -530,7 +548,7 @@ julia> uppercase("Julia")
 uppercase(s::AbstractString) = map(uppercase, s)
 
 """
-    lowercase(s::AbstractString)
+    lowercase(s::AbstractString) -> String
 
 Return `s` with all characters converted to lowercase.
 
@@ -543,7 +561,7 @@ julia> lowercase("STRINGS AND THINGS")
 lowercase(s::AbstractString) = map(lowercase, s)
 
 """
-    titlecase(s::AbstractString)
+    titlecase(s::AbstractString) -> String
 
 Capitalize the first character of each word in `s`.
 See also [`ucfirst`](@ref) to capitalize only the first
@@ -551,7 +569,7 @@ character in `s`.
 
 # Examples
 ```jldoctest
-julia> titlecase("the julia programming language")
+julia> titlecase("the Julia programming language")
 "The Julia Programming Language"
 ```
 """
@@ -571,12 +589,13 @@ function titlecase(s::AbstractString)
 end
 
 """
-    ucfirst(s::AbstractString)
+    ucfirst(s::AbstractString) -> String
+
+Return `s` with the first character converted to uppercase (technically "title
+case" for Unicode). See also [`titlecase`](@ref) to capitalize the first
+character of every word in `s`.
 
-Return `string` with the first character converted to uppercase
-(technically "title case" for Unicode).
-See also [`titlecase`](@ref) to capitalize the first character of
-every word in `s`.
+See also: `lcfirst`, `uppercase`, `lowercase`, `titlecase`
 
 # Examples
 ```jldoctest
@@ -585,16 +604,19 @@ julia> ucfirst("python")
 ```
 """
 function ucfirst(s::AbstractString)
-    isempty(s) && return s
+    isempty(s) && return ""
     c = s[1]
-    tc = titlecase(c)
-    return c==tc ? s : string(tc,s[nextind(s,1):end])
+    head = titlecase(c)  # title-case form of the first character
+    c == head && return convert(String, s)  # nothing to change: just convert to String
+    return string(head, SubString(s, nextind(s, 1)))  # new first character + original tail
 end
 
 """
     lcfirst(s::AbstractString)
 
-Return `string` with the first character converted to lowercase.
+Return `s` with the first character converted to lowercase.
+
+See also: `ucfirst`, `uppercase`, `lowercase`, `titlecase`
 
 # Examples
 ```jldoctest
@@ -603,31 +625,33 @@ julia> lcfirst("Julia")
 ```
 """
 function lcfirst(s::AbstractString)
-    isempty(s) || islower(s[1]) ? s : string(lowercase(s[1]),s[nextind(s,1):end])
+    isempty(s) && return ""
+    c = s[1]
+    head = lowercase(c)  # lowercase form of the first character
+    c == head && return convert(String, s)  # nothing to change: just convert to String
+    return string(head, SubString(s, nextind(s, 1)))  # new first character + original tail
 end
 
 ## string map, filter, has ##
 
 function map(f, s::AbstractString)
-    out = IOBuffer(StringVector(endof(s)),true,true)
-    truncate(out,0)
+    out = IOBuffer(StringVector(endof(s)), true, true)
+    truncate(out, 0)
     for c in s
-        c2 = f(c)
-        if !isa(c2,Char)
-            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
-        end
-        write(out, c2::Char)
+        c′ = f(c)
+        isa(c′, Char) || throw(ArgumentError(
+            "map(f, s::AbstractString) requires f to return Char; " *
+            "try map(f, collect(s)) or a comprehension instead"))
+        write(out, c′::Char)
     end
     String(take!(out))
 end
 
 function filter(f, s::AbstractString)
-    out = IOBuffer(StringVector(endof(s)),true,true)
-    truncate(out,0)
+    out = IOBuffer(StringVector(endof(s)), true, true)
+    truncate(out, 0)
     for c in s
-        if f(c)
-            write(out, c)
-        end
+        f(c) && write(out, c)
     end
     String(take!(out))
 end
@@ -635,9 +659,9 @@ end
 ## string first and last ##
 
 """
-    first(str::AbstractString, nchar::Integer)
+    first(s::AbstractString, n::Integer)
 
-Get a string consisting of the first `nchar` characters of `str`.
+Get a string consisting of the first `n` characters of `s`.
 
 ```jldoctest
 julia> first("∀ϵ≠0: ϵ²>0", 0)
@@ -650,17 +674,12 @@ julia> first("∀ϵ≠0: ϵ²>0", 3)
 "∀ϵ≠"
 ```
 """
-function first(str::AbstractString, nchar::Integer)
-    if 0 <= nchar <= 1
-        return str[1:nchar]
-    end
-    str[1:nextind(str, 1, nchar-1)]
-end
+first(s::AbstractString, n::Integer) = s[1:min(end, nextind(s, 0, n))]
 
 """
-    last(str::AbstractString, nchar::Integer)
+    last(s::AbstractString, n::Integer)
 
-Get a string consisting of the last `nchar` characters of `str`.
+Get a string consisting of the last `n` characters of `s`.
 
 ```jldoctest
 julia> last("∀ϵ≠0: ϵ²>0", 0)
@@ -673,13 +692,54 @@ julia> last("∀ϵ≠0: ϵ²>0", 3)
 "²>0"
 ```
 """
-function last(str::AbstractString, nchar::Integer)
-    e = endof(str)
-    if 0 <= nchar <= 1
-        return str[(e-nchar+1):e]
-    end
-    str[prevind(str, e, nchar-1):e]
-end
+last(s::AbstractString, n::Integer) = s[max(1, prevind(s, ncodeunits(s)+1, n)):end]
+
+"""
+    reverseind(v, i)
+
+Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in `v` so that
+`v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in cases where `v` contains
+non-ASCII characters.)
+
+# Examples
+```jldoctest
+julia> r = reverse("Julia")
+"ailuJ"
+
+julia> for i in 1:length(r)
+           print(r[reverseind("Julia", i)])
+       end
+Julia
+```
+"""
+reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1)
+
+"""
+    repeat(s::AbstractString, r::Integer)
+
+Repeat a string `r` times. This can equivalently be accomplished by calling [`s^r`](@ref ^).
+
+# Examples
+```jldoctest
+julia> repeat("ha", 3)
+"hahaha"
+```
+"""
+repeat(s::AbstractString, r::Integer) = repeat(convert(String, s), r)
+
+"""
+    ^(s::Union{AbstractString,Char}, n::Integer)
+
+Repeat a string or character `n` times.
+The [`repeat`](@ref) function is an alias to this operator.
+
+# Examples
+```jldoctest
+julia> "Test "^3
+"Test Test Test "
+```
+"""
+(^)(s::Union{AbstractString,Char}, r::Integer) = repeat(s, r)
 
 # reverse-order iteration for strings and indices thereof
 start(r::Iterators.Reverse{<:AbstractString}) = endof(r.itr)
diff --git a/base/strings/io.jl b/base/strings/io.jl
index a346c3d10f400..c305a1328370f 100644
--- a/base/strings/io.jl
+++ b/base/strings/io.jl
@@ -140,7 +140,7 @@ write(io::IO, s::AbstractString) = (len = 0; for c in s; len += write(io, c); en
 show(io::IO, s::AbstractString) = print_quoted(io, s)
 
 write(to::GenericIOBuffer, s::SubString{String}) =
-    s.endof==0 ? 0 : unsafe_write(to, pointer(s.string, s.offset + 1), UInt(nextind(s, s.endof) - 1))
+    s.ncodeunits ≤ 0 ? 0 : unsafe_write(to, pointer(s.string, s.offset+1), UInt(s.ncodeunits))
 
 ## printing literal quoted string data ##
 
@@ -253,6 +253,8 @@ need_full_hex(s::AbstractString, i::Int) = !done(s,i) && isxdigit(next(s,i)[1])
 escape_nul(s::AbstractString, i::Int) =
     !done(s,i) && '0' <= next(s,i)[1] <= '7' ? "\\x00" : "\\0"
 
+# TODO: handle escaping invalid UTF-8
+
 """
     escape_string(str::AbstractString[, esc::AbstractString]) -> AbstractString
 
@@ -272,15 +274,23 @@ function escape_string(io, s::AbstractString, esc::AbstractString="")
     i = start(s)
     while !done(s,i)
         c, j = next(s,i)
-        c == '\0'       ? print(io, escape_nul(s,j)) :
-        c == '\e'       ? print(io, "\\e") :
-        c == '\\'       ? print(io, "\\\\") :
-        c in esc        ? print(io, '\\', c) :
-        '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
-        isprint(c)      ? print(io, c) :
-        c <= '\x7f'     ? print(io, "\\x", hex(c, 2)) :
-        c <= '\uffff'   ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) :
-                          print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4))
+        if !ismalformed(c)
+            c == '\0'       ? print(io, escape_nul(s,j)) :
+            c == '\e'       ? print(io, "\\e") :
+            c == '\\'       ? print(io, "\\\\") :
+            c in esc        ? print(io, '\\', c) :
+            '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
+            isprint(c)      ? print(io, c) :
+            c <= '\x7f'     ? print(io, "\\x", hex(c, 2)) :
+            c <= '\uffff'   ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) :
+                              print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4))
+        else # malformed
+            u = bswap(reinterpret(UInt32, c))
+            while true
+                print(io, "\\x", hex(u % UInt8, 2))
+                (u >>= 8) == 0 && break
+            end
+        end
         i = j
     end
 end
@@ -291,27 +301,10 @@ function print_quoted(io, s::AbstractString)
     print(io, '"')
 end
 
-# bare minimum unescaping function unescapes only given characters
-
-function print_unescaped_chars(io, s::AbstractString, esc::AbstractString)
-    if !('\\' in esc)
-        esc = string("\\", esc)
-    end
-    i = start(s)
-    while !done(s,i)
-        c, i = next(s,i)
-        if c == '\\' && !done(s,i) && s[i] in esc
-            c, i = next(s,i)
-        end
-        print(io, c)
-    end
-end
-
-unescape_chars(s::AbstractString, esc::AbstractString) =
-    sprint(endof(s), print_unescaped_chars, s, esc)
-
 # general unescaping of traditional C and Unicode escape sequences
 
+# TODO: handle unescaping invalid UTF-8 sequences
+
 """
     unescape_string(str::AbstractString) -> AbstractString
 
@@ -335,16 +328,16 @@ function unescape_string(io, s::AbstractString)
                 n = k = 0
                 m = c == 'x' ? 2 :
                     c == 'u' ? 4 : 8
-                while (k+=1) <= m && !done(s,i)
+                while (k += 1) <= m && !done(s,i)
                     c, j = next(s,i)
-                    n = '0' <= c <= '9' ? n<<4 + c-'0' :
-                        'a' <= c <= 'f' ? n<<4 + c-'a'+10 :
-                        'A' <= c <= 'F' ? n<<4 + c-'A'+10 : break
+                    n = '0' <= c <= '9' ? n<<4 + (c-'0') :
+                        'a' <= c <= 'f' ? n<<4 + (c-'a'+10) :
+                        'A' <= c <= 'F' ? n<<4 + (c-'A'+10) : break
                     i = j
                 end
                 if k == 1
                     throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" :
-                                            "unicode (\\u)") escape sequence used in $(repr(s))"))
+                                        "unicode (\\u)") escape sequence used in $(repr(s))"))
                 end
                 if m == 2 # \x escape sequence
                     write(io, UInt8(n))
@@ -354,7 +347,7 @@ function unescape_string(io, s::AbstractString)
             elseif '0' <= c <= '7'
                 k = 1
                 n = c-'0'
-                while (k+=1) <= 3 && !done(s,i)
+                while (k += 1) <= 3 && !done(s,i)
                     c, j = next(s,i)
                     n = ('0' <= c <= '7') ? n<<3 + c-'0' : break
                     i = j
@@ -504,18 +497,7 @@ end
 
 function convert(::Type{String}, chars::AbstractVector{Char})
     sprint(length(chars), io->begin
-        state = start(chars)
-        while !done(chars, state)
-            c, state = next(chars, state)
-            if '\ud7ff' < c && c + 1024 < '\ue000'
-                d, state = next(chars, state)
-                if '\ud7ff' < d - 1024 && d < '\ue000'
-                    c = Char(0x10000 + ((UInt32(c) & 0x03ff) << 10) | (UInt32(d) & 0x03ff))
-                else
-                    write(io, c)
-                    c = d
-                end
-            end
+        for c in chars
             write(io, c)
         end
     end)
diff --git a/base/strings/string.jl b/base/strings/string.jl
index e66f876a5f77d..4683b0b0f4393 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -2,6 +2,8 @@
 
 const ByteArray = Union{Vector{UInt8},Vector{Int8}}
 
+@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo ≤ b) & (b ≤ hi)
+
 ## constructors and conversions ##
 
 # String constructor docstring from boot.jl, workaround for #16730
@@ -49,7 +51,6 @@ Convert a string to a contiguous byte array representation encoded as UTF-8 byte
 This representation is often appropriate for passing strings to C.
 """
 String(s::AbstractString) = print_to_string(s)
-
 String(s::Symbol) = unsafe_string(Cstring(s))
 
 (::Type{Vector{UInt8}})(s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
@@ -59,48 +60,14 @@ String(s::Symbol) = unsafe_string(Cstring(s))
 pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
 pointer(s::String, i::Integer) = pointer(s)+(i-1)
 
-sizeof(s::String) = Core.sizeof(s)
-
-"""
-    codeunit(s::AbstractString, i::Integer)
-
-Get the `i`th code unit of an encoded string. For example,
-returns the `i`th byte of the representation of a UTF-8 string.
-
-# Examples
-```jldoctest
-julia> s = "δ=γ"; [codeunit(s, i) for i in 1:sizeof(s)]
-5-element Array{UInt8,1}:
- 0xce
- 0xb4
- 0x3d
- 0xce
- 0xb3
-```
-"""
-codeunit(s::AbstractString, i::Integer)
+ncodeunits(s::String) = Core.sizeof(s)
+codeunit(s::String) = UInt8
 
 @inline function codeunit(s::String, i::Integer)
-    @boundscheck if (i < 1) | (i > sizeof(s))
-        throw(BoundsError(s,i))
-    end
+    @boundscheck between(i, 1, ncodeunits(s)) || throw(BoundsError(s, i))
     @gc_preserve s unsafe_load(pointer(s, i))
 end
 
-"""
-    ncodeunits(s::AbstractString)
-
-The number of code units in a string. For example, for UTF-8-like data such as
-the default `String` type, the number of code units is the number of bytes in
-the string, a.k.a. `sizeof(s)`. For a UTF-16 encoded string type, however, the
-code unit is `UInt16` so the number of code units is the number of `UInt16`
-words in the representation of the string. The expression `codeunit(s, i)` is
-valid and safe for precisely the range of `i` values `1:ncodeunits(s)`.
-
-See also: [`codeunit`](@ref).
-"""
-ncodeunits(s::String) = sizeof(s)
-
 write(io::IO, s::String) =
     @gc_preserve s unsafe_write(io, pointer(s), reinterpret(UInt, sizeof(s)))
 
@@ -118,81 +85,45 @@ function ==(a::String, b::String)
     al == sizeof(b) && 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, al)
 end
 
-## thisind, prevind and nextind ##
+## thisind, nextind, prevind ##
 
-function thisind(s::String, i::Integer)
-    j = Int(i)
-    j < 1 && return 0
-    n = ncodeunits(s)
-    j > n && return n + 1
-    @inbounds while j > 0 && is_valid_continuation(codeunit(s,j))
-        j -= 1
-    end
-    j
-end
+thisind(s::String, i::Integer) = oftype(i, thisind(s, Int(i)))
+nextind(s::String, i::Integer) = oftype(i, nextind(s, Int(i)))
 
-function prevind(s::String, i::Integer)
-    j = Int(i)
-    e = sizeof(s)
-    if j > e
-        return endof(s)
-    end
-    j -= 1
-    @inbounds while j > 0 && is_valid_continuation(codeunit(s,j))
-        j -= 1
-    end
-    j
-end
-
-function prevind(s::String, i::Integer, nchar::Integer)
-    nchar > 0 || throw(ArgumentError("nchar must be greater than 0"))
-    j = Int(i)
-    e = sizeof(s)
-    while nchar > 0
-        if j > e
-            j = endof(s)
-        else
-            j -= 1
-            @inbounds while j > 0 && is_valid_continuation(codeunit(s,j))
-                j -= 1
-            end
-        end
-        nchar -= 1
-        j <= 0 && return j - nchar
-    end
-    j
-end
-
-function nextind(s::String, i::Integer)
-    j = Int(i)
-    if j < 1
-        return 1
-    end
-    e = sizeof(s)
-    j += 1
-    @inbounds while j <= e && is_valid_continuation(codeunit(s,j))
-        j += 1
-    end
-    j
+function thisind(s::String, i::Int)  # start index of the character that byte i belongs to
+    n = ncodeunits(s)
+    between(i, 2, n) || return i  # index 1 and out-of-range indices are returned unchanged
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || return i  # not a 10xxxxxx continuation byte: i is already a start
+    @inbounds b = codeunit(s, i-1)
+    between(b, 0b11000000, 0b11110111) && return i-1  # lead byte (0xc0-0xf7) right before i
+    (b & 0xc0 == 0x80) & (i-2 > 0) || return i  # give up unless i-1 continues and i-2 exists
+    @inbounds b = codeunit(s, i-2)
+    between(b, 0b11100000, 0b11110111) && return i-2  # only leads 0xe0-0xf7 reach 2 bytes ahead
+    (b & 0xc0 == 0x80) & (i-3 > 0) || return i  # give up unless i-2 continues and i-3 exists
+    @inbounds b = codeunit(s, i-3)
+    between(b, 0b11110000, 0b11110111) && return i-3  # only leads 0xf0-0xf7 reach 3 bytes ahead
+    return i  # no lead byte covers i: it counts as its own start
 end
 
-function nextind(s::String, i::Integer, nchar::Integer)
-    nchar > 0 || throw(ArgumentError("nchar must be greater than 0"))
-    j = Int(i)
-    e = sizeof(s)
-    while nchar > 0
-        if j < 1
-            j = 1
-        else
-            j += 1
-            @inbounds while j <= e && is_valid_continuation(codeunit(s,j))
-                j += 1
-            end
-        end
-        nchar -= 1
-        j > e && return j + nchar
-    end
-    j
+function nextind(s::String, i::Int)  # start index of the character after the one at i
+    n = ncodeunits(s)
+    between(i, 1, n-1) || return i+1  # last byte or out of bounds: next index is just i+1
+    @inbounds l = codeunit(s, i)
+    (l < 0x80) | (0xf8 ≤ l) && return i+1  # ASCII or non-lead byte ≥ 0xf8: single-byte char
+    if l < 0xc0  # i sits on a continuation byte
+        i′ = thisind(s, i)
+        return i′ < i ? nextind(s, i′) : i+1  # restart from the owning lead byte, if any
+    end
+    # first continuation byte; `||` must short-circuit so i stays put when b is not one
+    @inbounds b = codeunit(s, i += 1)
+    (b & 0xc0 != 0x80 || (i += 1) > n || l < 0xe0) && return i
+    # second continuation byte (same short-circuit rule: a non-continuation b starts a char)
+    @inbounds b = codeunit(s, i)
+    (b & 0xc0 != 0x80 || (i += 1) > n || l < 0xf0) && return i
+    # third continuation byte
+    @inbounds b = codeunit(s, i)
+    ifelse(b & 0xc0 != 0x80, i, i+1)
 end
 
 ## checking UTF-8 & ACSII validity ##
@@ -208,121 +139,146 @@ byte_string_classify(s::String) =
 isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) != 0
 isvalid(s::String) = isvalid(String, s)
 
-## basic UTF-8 decoding & iteration ##
-
-is_surrogate_lead(c::Unsigned) = ((c & ~0x003ff) == 0xd800)
-is_surrogate_trail(c::Unsigned) = ((c & ~0x003ff) == 0xdc00)
-is_surrogate_codeunit(c::Unsigned) = ((c & ~0x007ff) == 0xd800)
-is_valid_continuation(c) = ((c & 0xc0) == 0x80)
-
-const utf8_offset = [
-    0x00000000, 0x00003080,
-    0x000e2080, 0x03c82080,
-    0xfa082080, 0x82082080,
-]
-
-const utf8_trailing = [
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
-]
+is_valid_continuation(c) = c & 0xc0 == 0x80
 
 ## required core functionality ##
 
-function endof(s::String)
-    i = sizeof(s)
-    @inbounds while i > 0 && is_valid_continuation(codeunit(s, i))
-        i -= 1
-    end
-    i
+function next(s::String, i::Int)  # returns (character starting at byte i, following index)
+    @boundscheck 1 ≤ i ≤ ncodeunits(s) || throw(BoundsError(s, i))
+    @inbounds b = codeunit(s, i)
+    # TODO: check index validity
+    u = UInt32(b) << 24  # first byte occupies the top 8 bits of the Char bit pattern
+    (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u), i+1  # single-byte fast path
+    return next_continued(s, i, u)  # bytes 0x80-0xf7 are handled out of line
 end
 
-function length(s::String)
-    cnum = 0
-    @inbounds for i = 1:sizeof(s)
-        cnum += !is_valid_continuation(codeunit(s, i))
+@noinline function next_continued(s::String, i::Int, u::UInt32)  # u: first byte in top 8 bits
+    if u < 0xc0000000  # first byte is below 0xc0
+        isvalid(s, i) && (i += 1; @goto ret)  # valid index here: return this byte on its own
+        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8))
     end
-    cnum
+    n = ncodeunits(s)
+    # first continuation byte
+    (i += 1) > n && @goto ret  # string ends right after the first byte
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret  # not a continuation byte: stop, i stays on it
+    u |= UInt32(b) << 16
+    # second continuation byte
+    ((i += 1) > n) | (u < 0xe0000000) && @goto ret  # end of data, or first byte below 0xe0
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b) << 8
+    # third continuation byte
+    ((i += 1) > n) | (u < 0xf0000000) && @goto ret  # end of data, or first byte below 0xf0
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b); i += 1
+@label ret
+    return reinterpret(Char, u), i  # accumulated bytes as a Char, and the first unconsumed index
 end
 
-@noinline function slow_utf8_next(s::String, b::UInt8, i::Int, l::Int)
-    @inbounds if is_valid_continuation(b)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i)))
-    end
-    trailing = utf8_trailing[b + 1]
-    if l < i + trailing
-        return '\ufffd', i+1
-    end
-    c::UInt32 = 0
-    @inbounds for j = 1:(trailing + 1)
-        c <<= 6
-        c += codeunit(s, i)
-        i += 1
-    end
-    c -= utf8_offset[trailing + 1]
-    return Char(c), i
+function getindex(s::String, i::Int)
+    @boundscheck 1 ≤ i ≤ ncodeunits(s) || throw(BoundsError(s, i))
+    @inbounds b = codeunit(s, i)
+    # TODO: check index validity
+    u = UInt32(b) << 24
+    (b < 0x80) | (0xf8 ≤ b) && return reinterpret(Char, u)
+    return getindex_continued(s, i, u)
 end
 
-# This implementation relies on `next` returning a value past the end of the
-# String's underlying data, which is true for valid Strings
-done(s::String, state) = state > sizeof(s)
-
-@inline function next(s::String, i::Int)
-    # function is split into this critical fast-path
-    # for pure ascii data, such as parsing numbers,
-    # and a longer function that can handle any utf8 data
-    @boundscheck if (i < 1) | (i > sizeof(s))
-        throw(BoundsError(s,i))
+@noinline function getindex_continued(s::String, i::Int, u::UInt32)
+    if u < 0xc0000000
+        isvalid(s, i) && @goto ret
+        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, (u >> 24) % UInt8))
     end
+    n = ncodeunits(s)
+    # first continuation byte
+    (i += 1) > n && @goto ret
     @inbounds b = codeunit(s, i)
-    if b < 0x80
-        return Char(b), i + 1
-    end
-    return slow_utf8_next(s, b, i, sizeof(s))
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b) << 16
+    # second continuation byte
+    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b) << 8
+    # third continuation byte
+    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b)
+@label ret
+    return reinterpret(Char, u)
 end
 
-function first_utf8_byte(ch::Char)
-    c = UInt32(ch)
-    b = c < 0x80    ? c%UInt8 :
-        c < 0x800   ? ((c>>6)  | 0xc0)%UInt8 :
-        c < 0x10000 ? ((c>>12) | 0xe0)%UInt8 :
-                      ((c>>18) | 0xf0)%UInt8
-    return b
-end
-
-## overload methods for efficiency ##
-
-isvalid(s::String, i::Integer) =
-    (1 <= i <= sizeof(s)) && ((@inbounds b = codeunit(s, i)); !is_valid_continuation(b))
+getindex(s::String, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
 
 function getindex(s::String, r::UnitRange{Int})
     isempty(r) && return ""
-    l = sizeof(s)
-    i = first(r)
-    if i < 1 || i > l
-        throw(BoundsError(s, i))
-    end
-    @inbounds si = codeunit(s, i)
-    if is_valid_continuation(si)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, si))
-    end
-    j = last(r)
-    if j > l
-        throw(BoundsError(s, j))
-    end
-    @inbounds sj = codeunit(s, j)
-    if is_valid_continuation(sj)
-        throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, sj))
+    i, j = first(r), last(r)
+    @boundscheck begin
+        checkbounds(s, r)
+        @inbounds isvalid(s, i) ||
+            throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i)))
+        @inbounds isvalid(s, j) ||
+            throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j)))
+    end
+    j = nextind(s, j) - 1
+    n = j - i + 1
+    ss = _string_n(n)
+    p = pointer(ss)
+    for k = 1:n
+        unsafe_store!(p, codeunit(s, i + k - 1), k)
+    end
+    return ss
+end
+
+function length(s::String, lo::Int, hi::Int)  # character count for the index range lo..hi
+    z = ncodeunits(s)
+    i = Int(max(1, min(z, lo)))  # lo limited to at most z, then to at least 1
+    n = Int(min(z, max(1, hi)))  # hi limited to at least 1, then to at most z
+    c = i - n  # cancels against `hi - lo` in the return unless lo or hi was clamped
+    if i ≤ n
+        i, j = thisind(s, i), i  # j keeps the clamped start, i becomes thisind of it
+        c -= i < j  # if i moved back, offset the +1 the loop adds for that earlier byte
+        i -= 1  # the loop increments before reading
+        while true
+            (i += 1) ≤ n || break
+            @inbounds b = codeunit(s, i) # lead byte
+        @label L  # also reached by @goto when b turned out not to be a continuation byte
+            c += 1
+            (0xc0 ≤ b) & (b < 0xf8) || continue  # bytes outside 0xc0-0xf7 are counted alone
+            l = b
+
+            (i += 1) ≤ n || break
+            @inbounds b = codeunit(s, i) # cont byte 1
+            b & 0xc0 == 0x80 || @goto L
+            l ≥ 0xe0 || continue  # lead below 0xe0: do not look for a second continuation
+
+            (i += 1) ≤ n || break
+            @inbounds b = codeunit(s, i) # cont byte 2
+            b & 0xc0 == 0x80 || @goto L
+            l ≥ 0xf0 || continue  # lead below 0xf0: do not look for a third continuation
+
+            (i += 1) ≤ n || break
+            @inbounds b = codeunit(s, i) # cont byte 3
+            b & 0xc0 == 0x80 || @goto L
+        end
     end
-    j = nextind(s,j)
-    unsafe_string(pointer(s,i), j-i)
+    return c + hi - lo  # loop count plus the correction from c and the raw lo/hi
 end
 
+# TODO: delete or move to char.jl
+first_utf8_byte(c::Char) = (reinterpret(UInt32, c) >> 24) % UInt8
+
+## overload methods for efficiency ##
+
+function isvalid(s::String, i::Int)
+    @boundscheck checkbounds(s, i)
+    return thisind(s, i) == i
+end
+isvalid(s::String, i::Integer) = isvalid(s, Int(i))
+
 function search(s::String, c::Char, i::Integer = 1)
     if i < 1 || i > sizeof(s)
         i == sizeof(s) + 1 && return 0
@@ -331,11 +287,11 @@ function search(s::String, c::Char, i::Integer = 1)
     @inbounds if is_valid_continuation(codeunit(s,i))
         throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s,i)))
     end
-    c < Char(0x80) && return search(s, c%UInt8, i)
+    c ≤ '\x7f' && return search(s, c % UInt8, i)
     while true
         i = search(s, first_utf8_byte(c), i)
-        (i==0 || s[i] == c) && return i
-        i = next(s,i)[2]
+        (i == 0 || s[i] == c) && return i
+        i = next(s, i)[2]
     end
 end
 
@@ -361,12 +317,12 @@ function search(a::ByteArray, b::Char, i::Integer = 1)
 end
 
 function rsearch(s::String, c::Char, i::Integer = sizeof(s))
-    c < Char(0x80) && return rsearch(s, c%UInt8, i)
+    c ≤ '\x7f' && return rsearch(s, c % UInt8, i)
     b = first_utf8_byte(c)
     while true
         i = rsearch(s, b, i)
-        (i==0 || s[i] == c) && return i
-        i = prevind(s,i)
+        (i == 0 || s[i] == c) && return i
+        i = prevind(s, i)
     end
 end
 
@@ -411,62 +367,15 @@ function string(a::String...)
 end
 
 # UTF-8 encoding length of a character
-function codelen(d::Char)
-    c = UInt32(d)
-    if c < 0x80
-        return 1
-    elseif c < 0x800
-        return 2
-    elseif c < 0x10000
-        return 3
-    elseif c < 0x110000
-        return 4
-    end
-    return 3  # '\ufffd'
-end
+# TODO: delete or move to char.jl
+codelen(c::Char) = 4 - (trailing_zeros(0xff000000 | reinterpret(UInt32, c)) >> 3)
 
 function string(a::Union{String,Char}...)
-    n = 0
-    for d in a
-        if isa(d,Char)
-            n += codelen(d::Char)
-        else
-            n += sizeof(d::String)
+    sprint() do io
+        for x in a
+            write(io, x)
         end
     end
-    out = _string_n(n)
-    offs = 1
-    p = pointer(out)
-    for d in a
-        if isa(d,Char)
-            c = UInt32(d::Char)
-            if c < 0x80
-                unsafe_store!(p, c%UInt8, offs); offs += 1
-            elseif c < 0x800
-                unsafe_store!(p, (( c >> 6          ) | 0xC0)%UInt8, offs); offs += 1
-                unsafe_store!(p, (( c        & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-            elseif c < 0x10000
-                unsafe_store!(p, (( c >> 12         ) | 0xE0)%UInt8, offs); offs += 1
-                unsafe_store!(p, (((c >> 6)  & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-                unsafe_store!(p, (( c        & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-            elseif c < 0x110000
-                unsafe_store!(p, (( c >> 18         ) | 0xF0)%UInt8, offs); offs += 1
-                unsafe_store!(p, (((c >> 12) & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-                unsafe_store!(p, (((c >> 6)  & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-                unsafe_store!(p, (( c        & 0x3F ) | 0x80)%UInt8, offs); offs += 1
-            else
-                # '\ufffd'
-                unsafe_store!(p, 0xef, offs); offs += 1
-                unsafe_store!(p, 0xbf, offs); offs += 1
-                unsafe_store!(p, 0xbd, offs); offs += 1
-            end
-        else
-            l = sizeof(d::String)
-            unsafe_copy!(pointer(out,offs), pointer(d::String), l)
-            offs += l
-        end
-    end
-    return out
 end
 
 function repeat(s::String, r::Integer)
diff --git a/base/strings/substring.jl b/base/strings/substring.jl
index d1bf33e4123fb..2c75ed1c49444 100644
--- a/base/strings/substring.jl
+++ b/base/strings/substring.jl
@@ -22,13 +22,18 @@ julia> SubString("abc", 2)
 struct SubString{T<:AbstractString} <: AbstractString
     string::T
     offset::Int
-    endof::Int
+    ncodeunits::Int # set by the constructor below: nextind(s, j) - i, or 0 when i > j
 
     function SubString{T}(s::T, i::Int, j::Int) where T<:AbstractString
-        i > j && return new(s, i - 1, 0) # always allow i > j as it is consistent with getindex
-        isvalid(s, i) || throw(BoundsError(s, i))
-        isvalid(s, j) || throw(BoundsError(s, j))
-        new(s, i-1, j-i+1)
+        i ≤ j || return new(s, i-1, 0) # empty range: returned before any index check
+        @boundscheck begin
+            checkbounds(s, i:j) # range check first, then both endpoints must satisfy isvalid
+            @inbounds isvalid(s, i) ||
+                throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s, i)))
+            @inbounds isvalid(s, j) ||
+                throw(UnicodeError(UTF_ERR_INVALID_INDEX, j, codeunit(s, j)))
+        end
+        return new(s, i-1, nextind(s,j)-i) # offset is i-1; the length runs up to nextind(s, j)
     end
 end
 
@@ -37,11 +42,8 @@ SubString(s::AbstractString, i::Integer, j::Integer=endof(s)) = SubString(s, Int
 SubString(s::AbstractString, r::UnitRange{<:Integer}) = SubString(s, first(r), last(r))
 
 function SubString(s::SubString, i::Int, j::Int)
-    # always allow i > j as it is consistent with getindex
-    i > j && return SubString(s.string, s.offset + i, s.offset + j)
-    i >= 1 || throw(BoundsError(s, i))
-    j <= endof(s) || throw(BoundsError(s, j))
-    SubString(s.string, s.offset + i, s.offset + j)
+    @boundscheck i ≤ j && checkbounds(s, i:j)
+    SubString(s.string, s.offset+i, s.offset+j)
 end
 
 SubString(s::AbstractString) = SubString(s, 1, endof(s))
@@ -50,78 +52,56 @@ SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, endof(s))
 convert(::Type{SubString{S}}, s::AbstractString) where {S<:AbstractString} =
     SubString(convert(S, s))
 
-String(p::SubString{String}) =
-    unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1)
+String(s::SubString{String}) = unsafe_string(pointer(s.string, s.offset+1), s.ncodeunits)
 
-sizeof(s::SubString{String}) = s.endof == 0 ? 0 : nextind(s, s.endof) - 1
+ncodeunits(s::SubString) = s.ncodeunits
+codeunit(s::SubString) = codeunit(s.string)
+length(s::SubString) = length(s.string, s.offset+1, s.offset+s.ncodeunits)
 
-# TODO: length(s::SubString) = ??
-# default implementation will work but it's slow
-# can this be delegated efficiently somehow?
-# that may require additional string interfaces
-function length(s::SubString{String})
-    return s.endof==0 ? 0 : Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t),
-                                      pointer(s), nextind(s, s.endof) - 1))
+function codeunit(s::SubString, i::Integer)
+    @boundscheck checkbounds(s, i)
+    @inbounds return codeunit(s.string, s.offset + i)
 end
 
-function next(s::SubString, i::Int)
-    if i < 1 || i > s.endof
-        throw(BoundsError(s, i))
-    end
-    c, i = next(s.string, i+s.offset)
-    c, i-s.offset
+function next(s::SubString, i::Integer)
+    @boundscheck checkbounds(s, i)
+    @inbounds c, i = next(s.string, s.offset + i)
+    return c, i - s.offset
 end
 
-function getindex(s::SubString, i::Int)
-    if i < 1 || i > s.endof
-        throw(BoundsError(s, i))
-    end
-    getindex(s.string, i+s.offset)
+function getindex(s::SubString, i::Integer)
+    @boundscheck checkbounds(s, i)
+    @inbounds return getindex(s.string, s.offset + i)
 end
 
-endof(s::SubString) = s.endof
-
 function isvalid(s::SubString, i::Integer)
-    return (start(s) <= i <= endof(s)) && isvalid(s.string, s.offset+i)
+    @boundscheck checkbounds(s, i)
+    @inbounds return isvalid(s.string, s.offset + i)
 end
 
-function thisind(s::SubString{String}, i::Integer)
-    j = Int(i)
-    j < start(s) && return 0
-    n = ncodeunits(s)
-    j > n && return n + 1
-    offset = s.offset
-    str = s.string
-    j += offset
-    @inbounds while j > offset && is_valid_continuation(codeunit(str, j))
-        j -= 1
-    end
-    j - offset
-end
-
-nextind(s::SubString, i::Integer) = nextind(s.string, i+s.offset)-s.offset
-prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset
-
-function getindex(s::AbstractString, r::UnitRange{Int})
-    checkbounds(s, r) || throw(BoundsError(s, r))
-    SubString(s, first(r), last(r))
-end
+thisind(s::SubString, i::Integer) = 1 ≤ i ≤ s.ncodeunits ? thisind(s.string, s.offset+i) - s.offset : Int(i)
+nextind(s::SubString, i::Integer) = 0 ≤ i ≤ s.ncodeunits ? nextind(s.string, s.offset+i) - s.offset : Int(i)+1
+prevind(s::SubString, i::Integer) = 1 < i ≤ s.ncodeunits+1 ? prevind(s.string, s.offset+i) - s.offset : Int(i)-1
 
 function cmp(a::SubString{String}, b::SubString{String})
     na = sizeof(a)
     nb = sizeof(b)
     c = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt),
-              pointer(a), pointer(b), min(na,nb))
-    c < 0 ? -1 : c > 0 ? +1 : cmp(na,nb)
+              pointer(a), pointer(b), min(na, nb))
+    return c < 0 ? -1 : c > 0 ? +1 : cmp(na, nb)
 end
 
 # don't make unnecessary copies when passing substrings to C functions
 cconvert(::Type{Ptr{UInt8}}, s::SubString{String}) = s
 cconvert(::Type{Ptr{Int8}}, s::SubString{String}) = s
+
 function unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where R<:Union{Int8, UInt8}
     convert(Ptr{R}, pointer(s.string)) + s.offset
 end
 
+pointer(x::SubString{String}) = pointer(x.string) + x.offset
+pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1)
+
 """
     reverse(s::AbstractString) -> AbstractString
 
@@ -156,53 +136,3 @@ function reverse(s::Union{String,SubString{String}})::String
         end
     end
 end
-
-"""
-    reverseind(v, i)
-
-Given an index `i` in [`reverse(v)`](@ref), return the corresponding index in `v` so that
-`v[reverseind(v,i)] == reverse(v)[i]`. (This can be nontrivial in cases where `v` contains
-non-ASCII characters.)
-
-# Examples
-```jldoctest
-julia> r = reverse("Julia")
-"ailuJ"
-
-julia> for i in 1:length(r)
-           print(r[reverseind("Julia", i)])
-       end
-Julia
-```
-"""
-reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1)
-
-"""
-    repeat(s::AbstractString, r::Integer)
-
-Repeat a string `r` times. This can equivalently be accomplished by calling [`s^r`](@ref ^).
-
-# Examples
-```jldoctest
-julia> repeat("ha", 3)
-"hahaha"
-```
-"""
-repeat(s::AbstractString, r::Integer) = repeat(convert(String, s), r)
-
-"""
-    ^(s::Union{AbstractString,Char}, n::Integer)
-
-Repeat a string or character `n` times.
-The [`repeat`](@ref) function is an alias to this operator.
-
-# Examples
-```jldoctest
-julia> "Test "^3
-"Test Test Test "
-```
-"""
-(^)(s::Union{AbstractString,Char}, r::Integer) = repeat(s,r)
-
-pointer(x::SubString{String}) = pointer(x.string) + x.offset
-pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1)
diff --git a/base/strings/utf8proc.jl b/base/strings/utf8proc.jl
index cf30ec5b3aa6f..0c646b63c558d 100644
--- a/base/strings/utf8proc.jl
+++ b/base/strings/utf8proc.jl
@@ -3,7 +3,10 @@
 # Various Unicode functionality from the utf8proc library
 module UTF8proc
 
-import Base: show, ==, hash, string, Symbol, isless, length, eltype, start, next, done, convert, isvalid, lowercase, uppercase, titlecase
+import Base:
+    show, ==, hash, string, Symbol, isless, length, eltype, start, next,
+    done, convert, isvalid, lowercase, uppercase, titlecase,
+    MalformedCharError, ismalformed
 
 export isgraphemebreak, category_code, category_abbrev, category_string
 
@@ -118,7 +121,9 @@ const category_strings = [
     "Other, control",
     "Other, format",
     "Other, surrogate",
-    "Other, private use"
+    "Other, private use",
+    "Invalid, too high",
+    "Malformed, bad data",
 ]
 
 const UTF8PROC_STABLE    = (1<<1)
@@ -155,10 +160,26 @@ end
 
 utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
 
-function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
+function normalize_string(
+    s::AbstractString;
+    stable::Bool=false,
+    compat::Bool=false,
+    compose::Bool=true,
+    decompose::Bool=false,
+    stripignore::Bool=false,
+    rejectna::Bool=false,
+    newline2ls::Bool=false,
+    newline2ps::Bool=false,
+    newline2lf::Bool=false,
+    stripcc::Bool=false,
+    casefold::Bool=false,
+    lump::Bool=false,
+    stripmark::Bool=false,
+)
     flags = 0
     stable && (flags = flags | UTF8PROC_STABLE)
     compat && (flags = flags | UTF8PROC_COMPAT)
+    # TODO: error if compose & decompose?
     if decompose
         flags = flags | UTF8PROC_DECOMPOSE
     elseif compose
@@ -253,7 +274,10 @@ julia> textwidth('❤')
 2
 ```
 """
-textwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
+function textwidth(c::Char)
+    ismalformed(c) && (c = '\ufffd') # a malformed char is measured as U+FFFD instead
+    Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
+end
 
 """
     textwidth(s::AbstractString)
@@ -268,17 +292,29 @@ julia> textwidth("March")
 """
 textwidth(s::AbstractString) = mapreduce(textwidth, +, 0, s)
 
-lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) : Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
-uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
-titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))
+lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) :
+    Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
+uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
+    Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
+titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
+    Char(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))
 
 ############################################################################
 
 # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
-category_code(c) = ccall(:utf8proc_category, Cint, (UInt32,), c)
+function category_code(c::Char)
+    ismalformed(c) && return Cint(31) # malformed chars get code 31 without calling utf8proc
+    (u = UInt32(c)) ≤ 0x10ffff || return Cint(30) # code points above 0x10ffff get code 30
+    ccall(:utf8proc_category, Cint, (UInt32,), u) # everything else is classified by utf8proc
+end
 
 # more human-readable representations of the category code
-category_abbrev(c) = unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c))
+function category_abbrev(c)
+    ismalformed(c) && return "Ma"
+    (u = UInt32(c)) ≤ 0x10ffff || return "In"
+    unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), u))
+end
+
 category_string(c) = category_strings[category_code(c)+1]
 
 """
@@ -318,7 +354,7 @@ julia> islower('❤')
 false
 ```
 """
-islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL)
+islower(c::Char) = category_code(c) == UTF8PROC_CATEGORY_LL
 
 # true for Unicode upper and mixed case
 
@@ -342,8 +378,8 @@ false
 ```
 """
 function isupper(c::Char)
-    ccode = category_code(c)
-    return ccode == UTF8PROC_CATEGORY_LU || ccode == UTF8PROC_CATEGORY_LT
+    cat = category_code(c)
+    cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT
 end
 
 """
@@ -363,7 +399,7 @@ julia> isdigit('α')
 false
 ```
 """
-isdigit(c::Char)  = ('0' <= c <= '9')
+isdigit(c::Char) = '0' <= c <= '9'
 
 """
     isalpha(c::Char) -> Bool
@@ -384,7 +420,7 @@ julia> isalpha('9')
 false
 ```
 """
-isalpha(c::Char)  = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO)
+isalpha(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO
 
 """
     isnumber(c::Char) -> Bool
@@ -405,7 +441,7 @@ julia> isnumber('❤')
 false
 ```
 """
-isnumber(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO)
+isnumber(c::Char) = UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO
 
 """
     isalnum(c::Char) -> Bool
@@ -427,9 +463,9 @@ true
 ```
 """
 function isalnum(c::Char)
-    ccode = category_code(c)
-    return (UTF8PROC_CATEGORY_LU <= ccode <= UTF8PROC_CATEGORY_LO) ||
-           (UTF8PROC_CATEGORY_ND <= ccode <= UTF8PROC_CATEGORY_NO)
+    cat = category_code(c)
+    UTF8PROC_CATEGORY_LU <= cat <= UTF8PROC_CATEGORY_LO ||
+    UTF8PROC_CATEGORY_ND <= cat <= UTF8PROC_CATEGORY_NO
 end
 
 # following C++ only control characters from the Latin-1 subset return true
@@ -449,7 +485,7 @@ julia> iscntrl('a')
 false
 ```
 """
-iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f))
+iscntrl(c::Char) = c <= '\x1f' || '\x7f' <= c <= '\u9f'
 
 """
     ispunct(c::Char) -> Bool
@@ -469,7 +505,7 @@ julia> ispunct(';')
 true
 ```
 """
-ispunct(c::Char) = (UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO)
+ispunct(c::Char) = UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO
 
 # \u85 is the Unicode Next Line (NEL) character
 
@@ -495,7 +531,9 @@ julia> isspace('\\x20')
 true
 ```
 """
-@inline isspace(c::Char) = c == ' ' || '\t' <= c <='\r' || c == '\u85' || '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS
+@inline isspace(c::Char) =
+    c == ' ' || '\t' <= c <= '\r' || c == '\u85' ||
+    '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS
 
 """
     isprint(c::Char) -> Bool
@@ -511,7 +549,7 @@ julia> isprint('A')
 true
 ```
 """
-isprint(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS)
+isprint(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS
 
 # true in principal if a printer would use ink
 
@@ -531,19 +569,26 @@ julia> isgraph('A')
 true
 ```
 """
-isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO)
+isgraph(c::Char) = UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO
 
 ############################################################################
 # iterators for grapheme segmentation
 
 isgraphemebreak(c1::Char, c2::Char) =
+    ismalformed(c1) || ismalformed(c2) ||
     ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)
 
 # Stateful grapheme break required by Unicode-9 rules: the string
 # must be processed in sequence, with state initialized to Ref{Int32}(0).
 # Requires utf8proc v2.0 or later.
-isgraphemebreak!(state::Ref{Int32}, c1::Char, c2::Char) =
-    ccall(:utf8proc_grapheme_break_stateful, Bool, (UInt32, UInt32, Ref{Int32}), c1, c2, state)
+function isgraphemebreak!(state::Ref{Int32}, c1::Char, c2::Char)
+    if ismalformed(c1) || ismalformed(c2) # malformed chars are never passed to utf8proc
+        state[] = 0 # 0 is the value the state is initialized with
+        return true # a break is reported on either side of a malformed char
+    end
+    ccall(:utf8proc_grapheme_break_stateful, Bool,
+          (UInt32, UInt32, Ref{Int32}), c1, c2, state)
+end
 
 struct GraphemeIterator{S<:AbstractString}
     s::S # original string (for generation of SubStrings)
@@ -563,7 +608,7 @@ eltype(::Type{GraphemeIterator{S}}) where {S} = SubString{S}
 eltype(::Type{GraphemeIterator{SubString{S}}}) where {S} = SubString{S}
 
 function length(g::GraphemeIterator)
-    c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
+    c0 = typemax(Char)
     n = 0
     state = Ref{Int32}(0)
     for c in g.s
diff --git a/base/strings/util.jl b/base/strings/util.jl
index db230a16da0c6..8bf3c8e2aadc9 100644
--- a/base/strings/util.jl
+++ b/base/strings/util.jl
@@ -58,10 +58,12 @@ function endswith(a::AbstractString, b::AbstractString)
 end
 endswith(str::AbstractString, chars::Chars) = !isempty(str) && last(str) in chars
 
-startswith(a::String, b::String) =
-    (sizeof(a) >= sizeof(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, sizeof(b)) == 0)
-startswith(a::Vector{UInt8}, b::Vector{UInt8}) =
-    (length(a) >= length(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0)
+# FIXME: check that end of `b` doesn't match a partial character in `a`
+startswith(a::String, b::String) = sizeof(a) ≥ sizeof(b) &&
+    ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, sizeof(b)) == 0
+
+startswith(a::Vector{UInt8}, b::Vector{UInt8}) = length(a) ≥ length(b) &&
+    ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0
 
 # TODO: fast endswith
 
@@ -88,15 +90,9 @@ julia> chop(a, 5, 5)
 ""
 ```
 """
-function chop(s::AbstractString, head::Integer, tail::Integer)
-    # negative values of head/tail will throw error in nextind/prevind
-    headidx = head == 0 ? start(s) : nextind(s, start(s), head)
-    tailidx = tail == 0 ? endof(s) : prevind(s, endof(s), tail)
-    SubString(s, headidx, tailidx)
-end
-
-# no head/tail version left for performance reasons
 chop(s::AbstractString) = SubString(s, start(s), prevind(s, endof(s)))
+chop(s::AbstractString, head::Integer, tail::Integer) =
+    SubString(s, nextind(s, start(s), head), prevind(s, endof(s), tail))
 
 """
     chomp(s::AbstractString)
@@ -127,17 +123,6 @@ function chomp(s::String)
     end
 end
 
-# NOTE: use with caution -- breaks the immutable string convention!
-# TODO: this is hard to provide with the new representation
-#function chomp!(s::String)
-#    if !isempty(s) && codeunit(s,sizeof(s)) == 0x0a
-#        n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 1 : 2
-#        ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n)
-#    end
-#    return s
-#end
-chomp!(s::AbstractString) = chomp(s) # copying fallback for other string types
-
 const _default_delims = [' ','\t','\n','\v','\f','\r']
 
 """
@@ -449,6 +434,7 @@ replace(s::AbstractString, pat, f) = replace_new(String(s), pat, f, typemax(Int)
 # replace(s::AbstractString, pat, f, count::Integer=typemax(Int)) =
 #     replace(String(s), pat, f, count)
 
+# TODO: allow transform as the first argument to replace?
 
 # hex <-> bytes conversion
 
@@ -550,7 +536,8 @@ end
 # check for pure ASCII-ness
 
 function ascii(s::String)
-    for (i, b) in enumerate(Vector{UInt8}(s))
+    for i = 1:sizeof(s)
+        b = codeunit(s,i)
         b < 0x80 || throw(ArgumentError("invalid ASCII at index $i in $(repr(s))"))
     end
     return s
diff --git a/src/ast.c b/src/ast.c
index d54e5581fab89..fba225b231feb 100644
--- a/src/ast.c
+++ b/src/ast.c
@@ -557,7 +557,17 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m
         return (jl_value_t*)ex;
     }
     if (iscprim(e) && cp_class((cprim_t*)ptr(e)) == fl_ctx->wchartype) {
-        return jl_box32(jl_char_type, *(int32_t*)cp_data((cprim_t*)ptr(e)));
+        uint32_t c, u = *(uint32_t*)cp_data((cprim_t*)ptr(e));
+        if (u < 0x80) {
+            c = u << 24;
+        } else {
+            c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) |
+                ((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000);
+            c = u < 0x00000800 ? (c << 16) | 0xc0800000 :
+                u < 0x00010000 ? (c <<  8) | 0xe0808000 :
+                                 (c <<  0) | 0xf0808080 ;
+        }
+        return jl_box_char(c);
     }
     if (iscvalue(e) && cv_class((cvalue_t*)ptr(e)) == jl_ast_ctx(fl_ctx)->jvtype) {
         return *(jl_value_t**)cv_data((cvalue_t*)ptr(e));
diff --git a/src/datatype.c b/src/datatype.c
index 41f5cdb62ac70..edf94df39591c 100644
--- a/src/datatype.c
+++ b/src/datatype.c
@@ -640,7 +640,6 @@ SIBOX_FUNC(int16,  int16_t, 1)
 SIBOX_FUNC(int32,  int32_t, 1)
 UIBOX_FUNC(uint16, uint16_t, 1)
 UIBOX_FUNC(uint32, uint32_t, 1)
-UIBOX_FUNC(char,   uint32_t, 1)
 UIBOX_FUNC(ssavalue, size_t, 1)
 UIBOX_FUNC(slotnumber, size_t, 1)
 #ifdef _P64
@@ -651,6 +650,17 @@ SIBOX_FUNC(int64,  int64_t, 2)
 UIBOX_FUNC(uint64, uint64_t, 2)
 #endif
 
+static jl_value_t *boxed_char_cache[128];
+JL_DLLEXPORT jl_value_t *jl_box_char(uint32_t x) // x is the raw 32-bit Char representation
+{
+    jl_ptls_t ptls = jl_get_ptls_states();
+    if ((x & 0x80ffffff) == 0) // cache holds exactly i << 24 for i in 0..127; low 24 bits must be 0
+        return boxed_char_cache[x >> 24];
+    jl_value_t *v = jl_gc_alloc(ptls, sizeof(void*), jl_char_type); // any other value gets a fresh box
+    *(uint32_t*)jl_data_ptr(v) = x;
+    return v;
+}
+
 static jl_value_t *boxed_int8_cache[256];
 JL_DLLEXPORT jl_value_t *jl_box_int8(int8_t x)
 {
@@ -684,14 +694,16 @@ void jl_init_int32_int64_cache(void)
 void jl_init_box_caches(void)
 {
     int64_t i;
+    for(i=0; i < 128; i++) {
+        boxed_char_cache[i] = jl_permbox32(jl_char_type, i << 24);
+    }
     for(i=0; i < 256; i++) {
-        boxed_int8_cache[i]  = jl_permbox8(jl_int8_type, i);
+        boxed_int8_cache[i] = jl_permbox8(jl_int8_type, i);
     }
     for(i=0; i < NBOX_C; i++) {
         boxed_int16_cache[i]  = jl_permbox16(jl_int16_type, i-NBOX_C/2);
         boxed_uint16_cache[i] = jl_permbox16(jl_uint16_type, i);
         boxed_uint32_cache[i] = jl_permbox32(jl_uint32_type, i);
-        boxed_char_cache[i]   = jl_permbox32(jl_char_type, i);
         boxed_uint64_cache[i] = jl_permbox64(jl_uint64_type, i);
     }
 }
diff --git a/src/jl_uv.c b/src/jl_uv.c
index 77719693eb943..4753655bbdd9d 100644
--- a/src/jl_uv.c
+++ b/src/jl_uv.c
@@ -490,10 +490,21 @@ JL_DLLEXPORT void jl_uv_putb(uv_stream_t *stream, uint8_t b)
     jl_uv_puts(stream, (char*)&b, 1);
 }
 
-JL_DLLEXPORT void jl_uv_putc(uv_stream_t *stream, uint32_t wchar)
+JL_DLLEXPORT void jl_uv_putc(uv_stream_t *stream, uint32_t c)
 {
     char s[4];
-    jl_uv_puts(stream, s, u8_wc_toutf8(s, wchar));
+    int n = 1; // number of bytes to write; the top byte of c is always written
+    s[0] = c >> 24; // c is unpacked from its most to its least significant byte
+    if ((s[1] = c >> 16)) { // a further byte counts only if it and all earlier ones are non-zero
+        n++;
+        if ((s[2] = c >> 8)) {
+            n++;
+            if ((s[3] = c)) {
+                n++;
+            }
+        }
+    }
+    jl_uv_puts(stream, s, n); // write the first n bytes of s
 }
 
 extern int vasprintf(char **str, const char *fmt, va_list ap);
diff --git a/stdlib/Test/src/Test.jl b/stdlib/Test/src/Test.jl
index 916834c42bf3b..97d46e237343e 100644
--- a/stdlib/Test/src/Test.jl
+++ b/stdlib/Test/src/Test.jl
@@ -1396,8 +1396,11 @@ with string types besides the standard `String` type.
 struct GenericString <: AbstractString
     string::AbstractString
 end
-Base.endof(s::GenericString) = endof(s.string)
-Base.next(s::GenericString, i::Int) = next(s.string, i)
+Base.ncodeunits(s::GenericString) = ncodeunits(s.string)
+Base.codeunit(s::GenericString) = codeunit(s.string)
+Base.codeunit(s::GenericString, i::Integer) = codeunit(s.string, i)
+Base.isvalid(s::GenericString, i::Integer) = isvalid(s.string, i)
+Base.next(s::GenericString, i::Integer) = next(s.string, i)
 Base.reverse(s::GenericString) = GenericString(reverse(s.string))
 Base.reverse(s::SubString{GenericString}) =
     GenericString(typeof(s.string)(reverse(String(s))))
diff --git a/test/char.jl b/test/char.jl
index c40f60de3be23..85b2acf5385ef 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -198,3 +198,25 @@ end
 
 @test sprint(show, "text/plain", '$') == "'\$': ASCII/Unicode U+0024 (category Sc: Symbol, currency)"
 @test repr('$') == "'\$'"
+
+@testset "read incomplete character at end of stream or file" begin
+    local file = tempname()
+    local iob = IOBuffer([0xf0]) # a single byte: the character read from it is incomplete
+    local bytes(c::Char) = Vector{UInt8}(string(c)) # bytes produced when the char is written back
+    @test bytes(read(iob, Char)) == [0xf0]
+    @test eof(iob)
+    try
+        write(file, 0xf0)
+        open(file) do io
+            @test bytes(read(io, Char)) == [0xf0]
+            @test eof(io)
+        end
+        let io = Base.Filesystem.open(file, Base.Filesystem.JL_O_RDONLY)
+            @test bytes(read(io, Char)) == [0xf0]
+            @test eof(io)
+            close(io)
+        end
+    finally
+        rm(file, force=true) # remove the temp file whether or not the reads succeeded
+    end
+end
diff --git a/test/intfuncs.jl b/test/intfuncs.jl
index 779ce240add9a..062d1103c530f 100644
--- a/test/intfuncs.jl
+++ b/test/intfuncs.jl
@@ -134,7 +134,7 @@ end
     @test base(2, 5, 7) == "0000101"
 
     @test bitstring(Int16(3)) == "0000000000000011"
-    @test bitstring('3') == "00000000000000000000000000110011"
+    @test bitstring('3') == "00110011000000000000000000000000"
     @test bitstring(1035) == (Int == Int32 ? "00000000000000000000010000001011" :
         "0000000000000000000000000000000000000000000000000000010000001011")
     @test bitstring(Int128(3)) == "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011"
diff --git a/test/lineedit.jl b/test/lineedit.jl
index 51e2a692025f9..9da81487bd320 100644
--- a/test/lineedit.jl
+++ b/test/lineedit.jl
@@ -16,8 +16,8 @@ function new_state()
     LineEdit.init_state(term, ModalInterface([Prompt("test> ")]))
 end
 
-charseek(buf, i) = seek(buf, Base.unsafe_chr2ind(content(buf), i+1)-1)
-charpos(buf, pos=position(buf)) = Base.unsafe_ind2chr(content(buf), pos+1)-1
+charseek(buf, i) = seek(buf, chr2ind(content(buf), i+1)-1)
+charpos(buf, pos=position(buf)) = ind2chr(content(buf), pos+1)-1
 
 function transform!(f, s, i = -1) # i is char-based (not bytes) buffer position
     buf = buffer(s)
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index 384da5d8a70f3..cbd26b89df3ce 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -99,14 +99,14 @@ end
 end
 
 @testset "issue #7248" begin
-    @test_throws BoundsError ind2chr("hello", -1)
-    @test_throws BoundsError chr2ind("hello", -1)
-    @test_throws BoundsError ind2chr("hellø", -1)
-    @test_throws BoundsError chr2ind("hellø", -1)
-    @test_throws BoundsError ind2chr("hello", 10)
-    @test_throws BoundsError chr2ind("hello", 10)
-    @test_throws BoundsError ind2chr("hellø", 10)
-    @test_throws BoundsError chr2ind("hellø", 10)
+    @test ind2chr("hello", -1) == -1
+    @test chr2ind("hello", -1) == -1
+    @test ind2chr("hellø", -1) == -1
+    @test chr2ind("hellø", -1) == -1
+    @test ind2chr("hello", 10) == 10
+    @test chr2ind("hello", 10) == 10
+    @test ind2chr("hellø", 10) == 9
+    @test chr2ind("hellø", 10) == 11
     @test_throws BoundsError checkbounds("hello", 0)
     @test_throws BoundsError checkbounds("hello", 6)
     @test_throws BoundsError checkbounds("hello", 0:3)
@@ -127,7 +127,6 @@ end
     @test SubString("hellø", 1, 5)[10:9] == ""
     @test SubString("hellø", 1, 0)[10:9] == ""
     @test SubString("", 1, 0)[10:9] == ""
-
     @test_throws BoundsError SubString("", 1, 6)
     @test_throws BoundsError SubString("", 1, 1)
 end
@@ -143,8 +142,8 @@ end
     @test get(utf8_str, -1, 'X') == 'X'
     @test get(utf8_str, 1000, 'X') == 'X'
 
-    # Test that indexing into the middle of a character returns the default
-    @test get(utf8_str, 2, 'X') == 'X'
+    # Test that indexing into the middle of a character throws
+    @test_throws UnicodeError get(utf8_str, 2, 'X')
 end
 
 #=
@@ -172,8 +171,10 @@ end
 
 # make sure substrings do not accept code unit if it is not start of codepoint
 let s = "x\u0302"
+    @test s[1:2] == s
+    @test_throws BoundsError s[0:3]
+    @test_throws BoundsError s[1:4]
     @test_throws UnicodeError s[1:3]
-    @test s[1:2]==s
 end
 
 @testset "issue #9781" begin
@@ -204,8 +205,15 @@ struct tstStringType <: AbstractString
 end
 @testset "AbstractString functions" begin
     tstr = tstStringType(Vector{UInt8}("12"))
-    @test_throws ErrorException endof(tstr)
-    @test_throws ErrorException next(tstr, Bool(1))
+    @test_throws MethodError ncodeunits(tstr)
+    @test_throws MethodError codeunit(tstr)
+    @test_throws MethodError codeunit(tstr, 1)
+    @test_throws MethodError codeunit(tstr, true)
+    @test_throws MethodError isvalid(tstr, 1)
+    @test_throws MethodError isvalid(tstr, true)
+    @test_throws MethodError next(tstr, 1)
+    @test_throws MethodError next(tstr, true)
+    @test_throws MethodError endof(tstr)
 
     gstr = GenericString("12")
     @test string(gstr) isa GenericString
@@ -224,18 +232,19 @@ end
     @test done(eachindex("foobar"),7)
     @test eltype(Base.EachStringIndex) == Int
     @test map(uppercase, "foó") == "FOÓ"
-    @test chr2ind("fóobar",3) == 4
-
-    @test Symbol(gstr)==Symbol("12")
+    @test chr2ind("fóobar", 3) == 4
 
-    @test_throws ErrorException sizeof(gstr)
+    @test Symbol(gstr) == Symbol("12")
 
-    @test length(GenericString(""))==0
+    @test sizeof(gstr) == 2
+    @test ncodeunits(gstr) == 2
+    @test length(gstr) == 2
+    @test length(GenericString("")) == 0
 
     @test nextind(1:1, 1) == 2
     @test nextind([1], 1) == 2
 
-    @test ind2chr(gstr,2)==2
+    @test ind2chr(gstr, 2) == 2
 
     # tests promote_rule
     let svec = [s"12", GenericString("12"), SubString("123", 1, 2)]
@@ -463,8 +472,8 @@ end
     foobar(ch) = Char(0xd800)
     foobaz(ch) = reinterpret(Char, typemax(UInt32))
     @test_throws ArgumentError map(foomap, GenericString(str))
-    @test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[17]))
-    @test map(foobaz, GenericString(str)) == String(repeat(b"\ufffd", outer=[17]))
+    @test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[length(str)]))
+    @test map(foobaz, GenericString(str)) == String(repeat([0xff], outer=[4*length(str)]))
 
     @test "a".*["b","c"] == ["ab","ac"]
     @test ["b","c"].*"a" == ["ba","ca"]
@@ -488,7 +497,7 @@ end
     @test_throws ArgumentError ascii(GenericString("Hello, ∀"))
 end
 @testset "issue #17271: endof() doesn't throw an error even with invalid strings" begin
-    @test endof(String(b"\x90")) == 0
+    @test endof(String(b"\x90")) == 1
     @test endof(String(b"\xce")) == 1
 end
 # issue #17624, missing getindex method for String
@@ -570,7 +579,7 @@ end
                    SubString("123∀α>β:α+1>β123", 4, 18),
                    SubString(s"123∀α>β:α+1>β123", 4, 18)]
         for s in strs
-            @test thisind(s, -2) == 0
+            @test thisind(s, -2) == -2
             @test thisind(s, 0) == 0
             @test thisind(s, 1) == 1
             @test thisind(s, 2) == 1
@@ -581,13 +590,13 @@ end
             @test thisind(s, 15) == 15
             @test thisind(s, 16) == 15
             @test thisind(s, 17) == 17
-            @test thisind(s, 30) == 17
+            @test thisind(s, 30) == 30
         end
     end
 
     let strs = Any["", s"", SubString("123", 2, 1), SubString(s"123", 2, 1)]
         for s in strs, i in -2:2
-            @test thisind(s, i) == (i > 0)
+            @test thisind(s, i) == i
         end
     end
 end
@@ -612,17 +621,18 @@ end
             @test prevind(strs[i], 15, 4) == 10
             @test prevind(strs[i], 15, 10) == 0
             @test prevind(strs[i], 15, 9) == 1
-            @test prevind(strs[i], 15, 10) == 0
             @test prevind(strs[i], 16) == 15
             @test prevind(strs[i], 16, 1) == 15
             @test prevind(strs[i], 16, 2) == 14
-            @test prevind(strs[i], 20) == 15
-            @test prevind(strs[i], 20, 1) == 15
-            @test prevind(strs[i], 20, 10) == 1
-            @test_throws ArgumentError prevind(strs[i], 20, 0)
-
-            @test nextind(strs[i], -1) == 1
-            @test nextind(strs[i], -1, 1) == 1
+            @test prevind(strs[i], 20) == 19
+            @test prevind(strs[i], 20, 1) == 19
+            @test prevind(strs[i], 20, 10) == 7
+            @test prevind(strs[i], 20, 0) == 20
+
+            @test nextind(strs[i], -1) == 0
+            @test nextind(strs[i], -1, 1) == 0
+            @test nextind(strs[i], -1, 2) == 1
+            @test nextind(strs[i], -1, 3) == 4
             @test nextind(strs[i], 0, 2) == 4
             @test nextind(strs[i], 0, 20) == 26
             @test nextind(strs[i], 0, 10) == 15
@@ -643,7 +653,7 @@ end
             @test nextind(strs[i], 15, 1) == 17
             @test nextind(strs[i], 20) == 21
             @test nextind(strs[i], 20, 1) == 21
-            @test_throws ArgumentError nextind(strs[i], 20, 0)
+            @test nextind(strs[i], 20, 0) == 20
 
             for x in -10:20
                 n = p = x
@@ -658,8 +668,8 @@ end
         @test prevind(strs[1], -1) == -2
         @test prevind(strs[1], -1, 1) == -2
 
-        @test prevind(strs[2], -1) == 0
-        @test prevind(strs[2], -1, 1) == 0
+        @test prevind(strs[2], -1) == -2
+        @test prevind(strs[2], -1, 1) == -2
     end
 end
 
@@ -672,7 +682,7 @@ end
     @test first(s, 3) == "∀ϵ≠"
     @test first(s, 4) == "∀ϵ≠0"
     @test first(s, length(s)) == s
-    @test_throws BoundsError first(s, length(s)+1)
+    @test first(s, length(s)+1) == s
     @test_throws ArgumentError last(s, -1)
     @test last(s, 0) == ""
     @test last(s, 1) == "0"
@@ -680,21 +690,13 @@ end
     @test last(s, 3) == "²>0"
     @test last(s, 4) == "ϵ²>0"
     @test last(s, length(s)) == s
-    @test_throws BoundsError last(s, length(s)+1)
+    @test last(s, length(s)+1) == s
 end
 
 @testset "invalid code point" begin
     s = String([0x61, 0xba, 0x41])
     @test !isvalid(s)
-    @test_throws UnicodeError s[2]
-    e = try
-        s[2]
-    catch e
-        e
-    end
-    b = IOBuffer()
-    show(b, e)
-    @test String(take!(b)) == "UnicodeError: invalid character index 2 (0xba is a continuation byte)"
+    @test s[2] == reinterpret(Char, UInt32(0xba) << 24)
 end
 
 @testset "ncodeunits" begin
diff --git a/test/strings/io.jl b/test/strings/io.jl
index 7ee325c252c11..c5bf0f3e2be36 100644
--- a/test/strings/io.jl
+++ b/test/strings/io.jl
@@ -172,8 +172,7 @@ myio = IOBuffer()
 join(myio, "", "", 1)
 @test isempty(take!(myio))
 
-@testset "unescape_chars" begin
-    @test Base.unescape_chars("\\t","t") == "t"
+@testset "unescape_string ArgumentErrors" begin
     @test_throws ArgumentError unescape_string(IOBuffer(), string('\\',"xZ"))
     @test_throws ArgumentError unescape_string(IOBuffer(), string('\\',"777"))
 end
diff --git a/test/strings/types.jl b/test/strings/types.jl
index 12dd75a1bd421..00bac71f826b8 100644
--- a/test/strings/types.jl
+++ b/test/strings/types.jl
@@ -32,12 +32,21 @@ for idx in 0:1
 end
 
-# Substring provided with invalid end index throws BoundsError
+# Substring provided with invalid end index throws UnicodeError or BoundsError
-@test_throws BoundsError SubString("∀", 1, 2)
-@test_throws BoundsError SubString("∀", 1, 3)
+@test_throws UnicodeError SubString("∀", 1, 2)
+@test_throws UnicodeError SubString("∀", 1, 3)
 @test_throws BoundsError SubString("∀", 1, 4)
 
-# Substring provided with invalid start index throws BoundsError
+# SubString with a range: misaligned indices throw UnicodeError, out-of-bounds BoundsError
-@test_throws BoundsError SubString("∀∀", 2:4)
+@test SubString("∀∀", 1:1) == "∀"
+@test SubString("∀∀", 1:4) == "∀∀"
+@test SubString("∀∀", 4:4) == "∀"
+@test_throws UnicodeError SubString("∀∀", 1:2)
+@test_throws UnicodeError SubString("∀∀", 1:5)
+@test_throws UnicodeError SubString("∀∀", 2:4)
+@test_throws BoundsError SubString("∀∀", 0:1)
+@test_throws BoundsError SubString("∀∀", 0:4)
+@test_throws BoundsError SubString("∀∀", 1:7)
+@test_throws BoundsError SubString("∀∀", 4:7)
 
 # tests for SubString of more than one multibyte `Char` string
 # we are consistent with `getindex` for `String`
@@ -46,10 +55,12 @@ for idx in [0, 1, 4]
     @test SubString("∀∀", 4, idx) == "∀∀"[4:idx]
 end
 
-# second index beyond endof("∀∀")
-for idx in 5:8
+# end index inside a character (UnicodeError) or out of bounds (BoundsError)
+for idx in [2:3; 5:6]
+    @test_throws UnicodeError SubString("∀∀", 1, idx)
+end
+for idx in 7:8
     @test_throws BoundsError SubString("∀∀", 1, idx)
-    @test_throws BoundsError SubString("∀∀", 4, idx)
 end
 
 let str="tempus fugit"              #length(str)==12
@@ -65,13 +76,13 @@ let str="tempus fugit"              #length(str)==12
     ss=SubString(str,1:0)
     @test length(ss)==0
 
-    @test_throws BoundsError SubString(str,14,20)  #start indexing beyond source string length
-    @test_throws BoundsError SubString(str,10,16)  #end indexing beyond source string length
+    @test_throws BoundsError SubString(str, 14, 20)  #start indexing beyond source string length
+    @test_throws BoundsError SubString(str, 10, 16)  #end indexing beyond source string length
 
     @test_throws BoundsError SubString("", 1, 4)  #empty source string
     @test_throws BoundsError SubString("", 1, 1)  #empty source string, identical start and end index
     @test_throws BoundsError SubString("", 10, 12)
-    @test SubString("",12,10) == ""
+    @test SubString("", 12, 10) == ""
 end
 
 @test SubString("foobar", big(1), big(3)) == "foo"
@@ -83,7 +94,7 @@ let str = "aa\u2200\u2222bb"
     write(b, u)
     @test String(take!(b)) == "\u2200\u2222"
 
-    @test_throws BoundsError SubString(str, 4, 5)
+    @test_throws UnicodeError SubString(str, 4, 5)
     @test_throws BoundsError next(u, 0)
     @test_throws BoundsError next(u, 7)
     @test_throws BoundsError getindex(u, 0)
@@ -147,64 +158,69 @@ end
 @test ismatch(Regex(""), SubString("",1,0))
 
 # isvalid(), chr2ind() and ind2chr() for SubString{String}
-let ss, s="lorem ipsum",
-    sdict=Dict(SubString(s,1,11)=>s,
-               SubString(s,1,6)=>"lorem ",
-               SubString(s,1,0)=>"",
-               SubString(s,2,4)=>"ore",
-               SubString(s,2,11)=>"orem ipsum",
-               SubString(s,15,14)=>""
-               )
-    for (ss,s) in sdict
-        local ss
-        for i in -1:12
-            @test isvalid(ss,i)==isvalid(s,i)
+let s = "lorem ipsum", sdict = Dict(
+    SubString(s, 1, 11)  => "lorem ipsum",
+    SubString(s, 1, 6)   => "lorem ",
+    SubString(s, 1, 0)   => "",
+    SubString(s, 2, 4)   => "ore",
+    SubString(s, 2, 11)  => "orem ipsum",
+    SubString(s, 15, 14) => "",
+)
+    for (ss, s) in sdict
+        @test ncodeunits(ss) == ncodeunits(s)
+        for i in -2:13
+            if 1 ≤ i ≤ ncodeunits(ss)
+                @test isvalid(ss, i) == isvalid(s, i)
+            else
+                @test_throws BoundsError isvalid(ss, i)
+                @test_throws BoundsError isvalid(s, i)
+            end
         end
-    end
-    for (ss,s) in sdict
-        local ss
-        for i in 1:length(ss)
-            @test ind2chr(ss,i)==ind2chr(s,i)
+        for i in 1:ncodeunits(ss)
+            @test ind2chr(ss, i) == ind2chr(s, i)
         end
     end
-    for (ss,s) in sdict
-        local ss
+    for (ss, s) in sdict
+        @test length(ss) == length(s)
         for i in 1:length(ss)
-            @test chr2ind(ss,i)==chr2ind(s,i)
+            @test chr2ind(ss, i) == chr2ind(s, i)
         end
     end
-end #let
+end
 
-#for isvalid(SubString{String})
+# for isvalid(SubString{String})
 let s = "Σx + βz - 2"
-    for i in -1:(length(s)+2)
-        if isvalid(s, i)
-            ss=SubString(s,1,i)
-            # make sure isvalid gives equivalent results for SubString and String
-            @test isvalid(ss,i)==isvalid(s,i)
-        else
-            if i > 0
-                @test_throws BoundsError SubString(s,1,i)
+    for i in -1:ncodeunits(s)+2
+        if checkbounds(Bool, s, i)
+            if isvalid(s, i)
+                ss = SubString(s, 1, i)
+                for j = 1:ncodeunits(ss)
+                    @test isvalid(ss, j) == isvalid(s, j)
+                end
             else
-                @test SubString(s,1,i) == ""
+                @test_throws UnicodeError SubString(s, 1, i)
             end
+        elseif i > 0
+            @test_throws BoundsError SubString(s, 1, i)
+        else
+            @test SubString(s, 1, i) == ""
         end
     end
 end
 
-let ss=SubString("hello",1,5)
-    @test_throws BoundsError ind2chr(ss, -1)
-    @test_throws BoundsError chr2ind(ss, -1)
-    @test_throws BoundsError chr2ind(ss, 10)
-    @test_throws BoundsError ind2chr(ss, 10)
+let ss = SubString("hello", 1, 5)
+    @test ind2chr(ss, -1) == -1
+    @test chr2ind(ss, -1) == -1
+    @test chr2ind(ss, 10) == 10
+    @test ind2chr(ss, 10) == 10
 end
 
 # length(SubString{String}) performance specialization
 let s = "|η(α)-ϕ(κ)| < ε"
-    @test length(SubString(s,1,0))==length(s[1:0])
-    @test length(SubString(s,4,4))==length(s[4:4])
-    @test length(SubString(s,1,7))==length(s[1:7])
-    @test length(SubString(s,4,11))==length(s[4:11])
+    @test length(SubString(s, 1, 0)) == length(s[1:0])
+    @test length(SubString(s, 4, 4)) == length(s[4:4])
+    @test length(SubString(s, 1, 7)) == length(s[1:7])
+    @test length(SubString(s, 4, 11)) == length(s[4:11])
 end
 
 @testset "reverseind" for T in (String, SubString, GenericString)
@@ -217,7 +233,8 @@ end
                 @test c == s[reverseind(s, ri)] == r[ri]
                 s = convert(T, string(prefix, prefix, c, suffix, suffix))
                 pre = convert(T, prefix)
-                sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix))))
+                sb = SubString(s, nextind(pre, endof(pre)),
+                               endof(convert(T, string(prefix, prefix, c, suffix))))
                 r = reverse(sb)
                 ri = search(r, c)
                 @test c == sb[reverseind(sb, ri)] == r[ri]
diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl
index a9db6316d2fa9..c65934217dfb9 100644
--- a/test/unicode/utf8.jl
+++ b/test/unicode/utf8.jl
@@ -1,24 +1,13 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
-@testset "cesu8 input" begin
-    let ch = 0x10000
-        for hi = 0xd800:0xdbff
-            for lo = 0xdc00:0xdfff
-                @test String(Vector{UInt8}(String(Char[hi, lo]))) == string(Char(ch))
-                ch += 1
-            end
-        end
-    end
-end
-
 @testset "string indexing" begin
     let str = String(b"this is a test\xed\x80")
-        @test next(str, 15) == ('\ufffd', 16)
+        @test next(str, 15) == (reinterpret(Char, 0xed800000), 17)
         @test_throws BoundsError getindex(str, 0:3)
         @test_throws BoundsError getindex(str, 17:18)
         @test_throws BoundsError getindex(str, 2:17)
-        @test_throws UnicodeError getindex(str, 16:17)
-        @test string(Char(0x110000)) == "\ufffd"
+        @test_throws BoundsError getindex(str, 16:17)
+        @test string(Char(0x110000)) == String(b"\xf4\x90\x80\x80")
     end
 end
 
@@ -36,12 +25,12 @@ end
         b"xyz\xf0\x80"      => b"\xf0\x80zyx",
         b"xyz\xf0\x80\x80"  => b"\xf0\x80\x80zyx",
     ]
-        @test_broken reverse(String(s)) == String(r)
+        @test reverse(String(s)) == String(r)
     end
 end
 
 @testset "string convert" begin
     @test String(b"this is a test\xed\x80\x80") == "this is a test\ud000"
-    ## Specifically check UTF-8 string whose lead byte is same as a surrogate
+    # Specifically check UTF-8 string whose lead byte is same as a surrogate
     @test String(b"\xed\x9f\xbf") == "\ud7ff"
 end