From 9d5dc2bf15c14a787770ae50c2638d460b716caf Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 6 Dec 2014 21:30:21 -0500 Subject: [PATCH] add graphemes(s) function to iterate over graphemes (represented by substrings) of a string s --- NEWS.md | 4 ++ base/c.jl | 1 + base/exports.jl | 2 + base/string.jl | 1 - base/utf8.jl | 2 +- base/utf8proc.jl | 62 +++++++++++++++++++++-- deps/libmojibake | 2 +- doc/manual/calling-c-and-fortran-code.rst | 2 + doc/stdlib/base.rst | 8 +++ test/strings.jl | 7 ++- test/unicode.jl | 28 +++++++++- 11 files changed, 110 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7b29596268ca78..926d4f4815a19c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -100,6 +100,8 @@ Library improvements * Efficient `mean` and `median` for ranges ([#8089]). + * `graphemes(s)` returns an iterator over grapheme substrings of `s` ([#9261]). + * Character predicates such as `islower()`, `isspace()`, etc. use utf8proc/libmojibake to provide uniform cross-platform behavior and up-to-date, locale-independent support for Unicode standards ([#5939]). @@ -1132,4 +1134,6 @@ Too numerous to mention. 
[#9133]: https://github.com/JuliaLang/julia/issues/9133 [#9144]: https://github.com/JuliaLang/julia/issues/9144 [#9249]: https://github.com/JuliaLang/julia/issues/9249 +[#9261]: https://github.com/JuliaLang/julia/issues/9261 [#9271]: https://github.com/JuliaLang/julia/issues/9271 +[#9294]: https://github.com/JuliaLang/julia/issues/9294 diff --git a/base/c.jl b/base/c.jl index 872414be19bc09..2d863ea6a18df4 100644 --- a/base/c.jl +++ b/base/c.jl @@ -39,6 +39,7 @@ dlclose(p::Ptr) = if p!=C_NULL; ccall(:uv_dlclose,Void,(Ptr{Void},),p); end cfunction(f::Function, r, a) = ccall(:jl_function_ptr, Ptr{Void}, (Any, Any, Any), f, r, a) +typealias Cbool UInt8 if ccall(:jl_is_char_signed, Any, ()) typealias Cchar Int8 else diff --git a/base/exports.jl b/base/exports.jl index f9cad968ff2b0b..73ea2fe8fb96a2 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -119,6 +119,7 @@ export Zip, # Ccall types + Cbool, Cchar, Cdouble, Cfloat, @@ -822,6 +823,7 @@ export escape_string, float32_isvalid, float64_isvalid, + graphemes, hex, hex2bytes, ind2chr, diff --git a/base/string.jl b/base/string.jl index b16d7ccffeba23..80883e6fb53266 100644 --- a/base/string.jl +++ b/base/string.jl @@ -1729,4 +1729,3 @@ pointer{T<:ByteString}(x::SubString{T}, i::Integer) = pointer(x.string.data) + x pointer(x::Union(UTF16String,UTF32String), i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data)) pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.data)) pointer{T<:Union(UTF16String,UTF32String)}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.data)) - diff --git a/base/utf8.jl b/base/utf8.jl index b8d68aa4d3643c..73c08cc4c0ff6d 100644 --- a/base/utf8.jl +++ b/base/utf8.jl @@ -114,7 +114,7 @@ function getindex(s::UTF8String, r::UnitRange{Int}) if !is_utf8_start(d[i]) i = nextind(s,i) end - if j > endof(s) + if j > length(d) throw(BoundsError()) end j = nextind(s,j)-1 diff --git a/base/utf8proc.jl 
b/base/utf8proc.jl index 4d177c0441b72c..87763871f4c7fe 100644 --- a/base/utf8proc.jl +++ b/base/utf8proc.jl @@ -1,10 +1,12 @@ # Various Unicode functionality from the utf8proc library module UTF8proc -import Base: show, showcompact, ==, string, symbol, isless +import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype, start, next, done, convert + +export isgraphemebreak # also exported by Base: -export normalize_string, is_valid_char, is_assigned_char, +export normalize_string, graphemes, is_valid_char, is_assigned_char, islower, isupper, isalpha, isdigit, isnumber, isalnum, iscntrl, ispunct, isspace, isprint, isgraph, isblank @@ -60,6 +62,8 @@ const UTF8PROC_CHARBOUND = (1<<11) const UTF8PROC_LUMP = (1<<12) const UTF8PROC_STRIPMARK = (1<<13) +############################################################################ + let const p = Array(Ptr{UInt8}, 1) global utf8proc_map @@ -110,6 +114,8 @@ function normalize_string(s::AbstractString, nf::Symbol) throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))) end +############################################################################ + # returns UTF8PROC_CATEGORY code in 1:30 giving Unicode category function category_code(c) uint32(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs @@ -118,8 +124,6 @@ end is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN -# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes? 
-
 ## libc character class predicates ##
 
 islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL)
@@ -168,4 +172,64 @@ for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph",
     end
 end
 
+############################################################################
+# iterators for grapheme segmentation
+
+# Return true if a grapheme break is allowed between the adjacent characters
+# c1 and c2, as reported by the library routine utf8proc_grapheme_break.
+isgraphemebreak(c1::Char, c2::Char) = Bool(ccall(:utf8proc_grapheme_break, Cbool, (Char, Char),
+                                                 c1, c2))
+
+# Lazy iterator over the graphemes of a string: it stores only the string, and
+# the iteration state is an index into that string.
+immutable GraphemeIterator{S<:AbstractString}
+    s::S # original string (for generation of SubStrings)
+end
+graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
+
+eltype{S}(::GraphemeIterator{S}) = SubString{S}
+
+# Count the graphemes: one for every character that has a break before it.
+function length(g::GraphemeIterator)
+    c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
+    n = 0
+    for c in g.s
+        n += isgraphemebreak(c0, c)
+        c0 = c
+    end
+    return n
+end
+
+# the state is a string index, so start/done simply defer to the string
+start(g::GraphemeIterator) = start(g.s)
+done(g::GraphemeIterator, i) = done(g.s, i)
+
+# Return (grapheme, nextstate) for the grapheme that starts at index i of g.s.
+function next(g::GraphemeIterator, i)
+    s = g.s
+    j = i
+    c0, k = next(s, i)
+    while !done(s, k) # loop until next grapheme is s[i:j]
+        c, ℓ = next(s, k)
+        isgraphemebreak(c0, c) && break
+        j = k
+        k = ℓ
+        c0 = c
+    end
+    # build a SubString, as declared by eltype, rather than the copy s[i:j]
+    return (SubString(s, i, j), k)
+end
+
+# equality, hashing and ordering are those of the underlying strings
+==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s
+hash(g::GraphemeIterator, h::UInt) = hash(g.s, h)
+isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s)
+
+# converting an iterator to a string type converts the string it wraps
+convert{S<:AbstractString}(::Type{S}, g::GraphemeIterator) = convert(S, g.s)
+
+show{S}(io::IO, g::GraphemeIterator{S}) = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
+
+############################################################################
+
 end # module
diff --git a/deps/libmojibake b/deps/libmojibake
index df71da45dfbdf6..86447ad060d6f4 160000
--- a/deps/libmojibake
+++ b/deps/libmojibake
@@ -1 +1 @@
-Subproject commit df71da45dfbdf68bcc6fd656d1260d609c728ad7
+Subproject commit 86447ad060d6f4edf01f2a64b9598dfeeb6e6f7d
diff --git a/doc/manual/calling-c-and-fortran-code.rst
b/doc/manual/calling-c-and-fortran-code.rst index b676638aa97633..9c4ea97c31218f 100644 --- a/doc/manual/calling-c-and-fortran-code.rst +++ b/doc/manual/calling-c-and-fortran-code.rst @@ -223,6 +223,8 @@ Julia type with the same name, prefixed by C. This can help for writing portable **System-independent:** ++------------------------+-------------------+--------------------------------+ +| ``bool`` | ``Cbool`` | ``UInt8`` | +------------------------+-------------------+--------------------------------+ | ``unsigned char`` | ``Cuchar`` | ``UInt8`` | +------------------------+-------------------+--------------------------------+ diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst index 0288bc8bffb00b..6a21d954db0df7 100644 --- a/doc/stdlib/base.rst +++ b/doc/stdlib/base.rst @@ -1415,6 +1415,14 @@ Strings For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``. +.. function:: graphemes(s) -> iterator over substrings of s + + Returns an iterator over substrings of ``s`` that correspond to + the graphemes in the string, as defined by Unicode UAX #29. + (Roughly, these are what users would perceive as single characters, + even though they may contain more than one codepoint; for example + a letter combined with an accent mark.) + .. function:: is_valid_ascii(s) -> Bool Returns true if the argument (``ASCIIString``, ``UTF8String``, or byte vector) is valid ASCII, false otherwise. 
diff --git a/test/strings.jl b/test/strings.jl
index 22f7a85210eae6..c33b1e16420035 100644
--- a/test/strings.jl
+++ b/test/strings.jl
@@ -1267,6 +1267,11 @@ Base.done(jt::i9178, n) = (jt.ndone += 1 ; n > 3)
 Base.next(jt::i9178, n) = (jt.nnext += 1 ; ("$(jt.nnext),$(jt.ndone)", n+1))
 @test join(i9178(0,0), ";") == "1,1;2,2;3,3;4,4"
 
+# make sure substrings handle last code unit even if not start of codepoint
+let s = "x\u0302"
+    @test s[1:3] == s
+end
+
 # reverseind
 for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
     for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
@@ -1288,4 +1293,4 @@ for T in (ASCIIString, UTF8String, UTF16String, UTF32String)
             end
         end
     end
-end
\ No newline at end of file
+end
diff --git a/test/unicode.jl b/test/unicode.jl
index 4152800f331a03..72658ed349d41e 100644
--- a/test/unicode.jl
+++ b/test/unicode.jl
@@ -93,9 +93,37 @@ else
 end
 
 # check utf8proc handling of CN category constants
-
 let c_ll = 'β', c_cn = '\u038B'
     @test Base.UTF8proc.category_code(c_ll) == Base.UTF8proc.UTF8PROC_CATEGORY_LL
     # check codepoint with category code CN
     @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN
 end
+
+# graphemes
+let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h",
+                                              "β","l","a","h",
+                                              "b\u0302","l","á","h"]),
+                ("", UTF8String[]),
+                ("x\u0302", ["x\u0302"]),
+                ("\U1d4c1\u0302", ["\U1d4c1\u0302"]),
+                ("\U1d4c1\u0302\U1d4c1\u0300", ["\U1d4c1\u0302",
+                                                "\U1d4c1\u0300"]),
+                ("x",["x"]),
+                ("abc",["a","b","c"]))
+    for T in (utf8,utf16,utf32)
+        for nf in (:NFC, :NFD)
+            # each (string, expected graphemes) pair, in encoding T and form nf
+            for (s, g) in grphtest
+                s_ = T(normalize_string(s, nf))
+                g_ = map(s -> normalize_string(s, nf), g)
+                grph = collect(graphemes(s_))
+                @test grph == g_
+                @test length(graphemes(s_)) == length(grph)
+            end
+            # sorting the strings and sorting their iterators must agree, for this nf
+            S = [T(normalize_string(s, nf)) for (s,g) in grphtest]
+            G = map(graphemes, S)
+            @test map(graphemes, sort!(S)) == sort!(G)
+        end
+    end
+end