Skip to content

Commit

Permalink
define ncodeunits(c::Char) as fast equivalent of ncodeunits(string(c))
Browse files Browse the repository at this point in the history
There was a non-public `codelen(c::Char)` method which previously did
this. This commit also replaces internal uses of `codelen` with `ncodeunits(c)`.
  • Loading branch information
StefanKarpinski committed Sep 14, 2018
1 parent fc04d73 commit 3b02991
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 6 deletions.
10 changes: 9 additions & 1 deletion base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ Char
(::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}} = T(codepoint(x))
(::Type{T})(x::T) where {T<:AbstractChar} = x

"""
ncodeunits(c::Char) -> Int
Return the number of code units required to encode a character as UTF-8.
This is the number of bytes which will be printed if the character is written
to an output stream, or `ncodeunits(string(c))` but computed efficiently.
"""
function ncodeunits(c::Char)
    # `write` reports how many bytes it emitted for `c`; sending them to
    # `devnull` throws the bytes away and keeps only that count.
    # This is surprisingly efficient.
    return write(devnull, c)
end

"""
codepoint(c::AbstractChar) -> Integer
Expand Down Expand Up @@ -197,7 +206,6 @@ hash(x::Char, h::UInt) =
hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))

# Extract the most significant byte of the 32-bit representation of `c`,
# returned as a `UInt8`.
function first_utf8_byte(c::Char)
    bits = reinterpret(UInt32, c)
    lead = bits >> 24
    return lead % UInt8
end
# Count the bytes occupied by `c` in its 32-bit representation: four minus the
# number of whole zero bytes at the low end. OR-ing in 0xff000000 forces the
# top byte to be non-zero, so the result is always at least 1.
function codelen(c::Char)
    bits = reinterpret(UInt32, c)
    trailing_zero_bytes = trailing_zeros(0xff000000 | bits) >> 3
    return 4 - trailing_zero_bytes
end

# fallbacks:
# Order two arbitrary `AbstractChar`s by converting both to `Char` and
# comparing the converted values.
function isless(x::AbstractChar, y::AbstractChar)
    return isless(Char(x), Char(y))
end
Expand Down
2 changes: 1 addition & 1 deletion base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -974,7 +974,7 @@ function skipchars(predicate, io::IO; linecomment=nothing)
if c === linecomment
readline(io)
elseif !predicate(c)
skip(io, -codelen(c))
skip(io, -ncodeunits(c))
break
end
end
Expand Down
4 changes: 2 additions & 2 deletions base/strings/substring.jl
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ function string(a::Union{Char, String, SubString{String}}...)
n = 0
for v in a
if v isa Char
n += codelen(v)
n += ncodeunits(v)
else
n += sizeof(v)
end
Expand All @@ -159,7 +159,7 @@ function string(a::Union{Char, String, SubString{String}}...)
for v in a
if v isa Char
x = bswap(reinterpret(UInt32, v))
for j in 1:codelen(v)
for j in 1:ncodeunits(v)
unsafe_store!(pointer(out, offs), x % UInt8)
offs += 1
x >>= 8
Expand Down
23 changes: 23 additions & 0 deletions test/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -256,3 +256,26 @@ Base.codepoint(c::ASCIIChar) = reinterpret(UInt8, c)
@test_throws MethodError write(IOBuffer(), ASCIIChar('x'))
@test_throws MethodError read(IOBuffer('x'), ASCIIChar)
end

@testset "ncodeunits(::Char)" begin
    # valid encodings: character => expected number of code units
    valid_cases = (
        '\0' => 1,
        '\x1' => 1,
        '\x7f' => 1,
        '\u80' => 2,
        '\uff' => 2,
        '\u7ff' => 2,
        '\u800' => 3,
        '\uffff' => 3,
        '\U10000' => 4,
        '\U10ffff' => 4,
    )
    for (ch, expected) in valid_cases
        @test ncodeunits(ch) == expected
    end

    # invalid encodings: raw 32-bit pattern => expected number of code units
    invalid_cases = (
        0x80_00_00_00 => 1,
        0x81_00_00_00 => 1,
        0x80_80_00_00 => 2,
        0x80_01_00_00 => 2,
        0x80_00_80_00 => 3,
        0x80_00_01_00 => 3,
        0x80_00_00_80 => 4,
        0x80_00_00_01 => 4,
    )
    for (bits, expected) in invalid_cases
        @test ncodeunits(reinterpret(Char, bits)) == expected
    end
end
4 changes: 2 additions & 2 deletions test/iostream.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@
@test read(file, Char) == 'n'

# test it correctly handles unicode
for (byte,char) in zip(1:4, ('@','߷','࿊','𐋺'))
for (byte, char) in zip(1:4, ('@','߷','࿊','𐋺'))
append_to_file("abcdef$char")
@test Base.codelen(char) == byte
@test ncodeunits(char) == byte
@test !eof(skipchars(isletter, file))
@test read(file, Char) == char
end
Expand Down

0 comments on commit 3b02991

Please sign in to comment.