diff --git a/NEWS.md b/NEWS.md index 9d3e22d9e84332..a84407b0953a64 100644 --- a/NEWS.md +++ b/NEWS.md @@ -477,6 +477,11 @@ This section lists changes that do not have deprecation warnings. a called function `f`, have `f` return the task and put `@async wait(f(...))` within the `@sync` block ([#27164]). + * Regular expressions now default to UCP mode. Escape sequences such as `\w` + will now match based on Unicode character properties, e.g. `r"\w+"` will + match `café` (not just `caf`). Add the `a` modifier (e.g. `r"\w+"a`) to + restore the previous behavior ([#27189]). + Library improvements -------------------- diff --git a/base/io.jl b/base/io.jl index ada7cb90599a9f..f2e6ba30cc22cb 100644 --- a/base/io.jl +++ b/base/io.jl @@ -448,28 +448,28 @@ ENDIAN_BOM """ ntoh(x) -Converts the endianness of a value from Network byte order (big-endian) to that used by the Host. +Convert the endianness of a value from Network byte order (big-endian) to that used by the Host. """ ntoh(x) """ hton(x) -Converts the endianness of a value from that used by the Host to Network byte order (big-endian). +Convert the endianness of a value from that used by the Host to Network byte order (big-endian). """ hton(x) """ ltoh(x) -Converts the endianness of a value from Little-endian to that used by the Host. +Convert the endianness of a value from Little-endian to that used by the Host. """ ltoh(x) """ htol(x) -Converts the endianness of a value from that used by the Host to Little-endian. +Convert the endianness of a value from that used by the Host to Little-endian. 
""" htol(x) diff --git a/base/path.jl b/base/path.jl index 5ed7d99957afb6..f8d490cfd75d84 100644 --- a/base/path.jl +++ b/base/path.jl @@ -27,7 +27,7 @@ if Sys.isunix() elseif Sys.iswindows() const path_separator = "\\" const path_separator_re = r"[/\\]+" - const path_absolute_re = r"^(?:\w+:)?[/\\]" + const path_absolute_re = r"^(?:[A-Za-z]+:)?[/\\]" const path_directory_re = r"(?:^|[/\\])\.{0,2}$" const path_dir_splitter = r"^(.*?)([/\\]+)([^/\\]*)$" const path_ext_splitter = r"^((?:.*[/\\])?(?:\.|[^/\\\.])[^/\\]*?)(\.[^/\\\.]*|)$" diff --git a/base/pcre.jl b/base/pcre.jl index 0809e68199fad3..5955c3880f6d59 100644 --- a/base/pcre.jl +++ b/base/pcre.jl @@ -49,7 +49,8 @@ const COMPILE_MASK = NO_START_OPTIMIZE | NO_UTF_CHECK | UNGREEDY | - UTF + UTF | + UCP const EXECUTE_MASK = NEWLINE_ANY | diff --git a/base/regex.jl b/base/regex.jl index 32d93275997c84..8ea7efae92347b 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -4,7 +4,7 @@ include("pcre.jl") -const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX +const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX | PCRE.UCP const DEFAULT_MATCH_OPTS = zero(UInt32) mutable struct Regex @@ -40,11 +40,15 @@ end function Regex(pattern::AbstractString, flags::AbstractString) options = DEFAULT_COMPILER_OPTS for f in flags - options |= f=='i' ? PCRE.CASELESS : - f=='m' ? PCRE.MULTILINE : - f=='s' ? PCRE.DOTALL : - f=='x' ? PCRE.EXTENDED : - throw(ArgumentError("unknown regex flag: $f")) + if f == 'a' + options &= ~PCRE.UCP + else + options |= f=='i' ? PCRE.CASELESS : + f=='m' ? PCRE.MULTILINE : + f=='s' ? PCRE.DOTALL : + f=='x' ? PCRE.EXTENDED : + throw(ArgumentError("unknown regex flag: $f")) + end end Regex(pattern, options, DEFAULT_MATCH_OPTS) end @@ -72,8 +76,12 @@ after the ending quote, to change its behaviour: - `s` allows the `.` modifier to match newlines. - `x` enables "comment mode": whitespace is enabled except when escaped with `\\`, and `#` is treated as starting a comment. 
+- `a` disables `UCP` mode (enables ASCII mode). By default `\\B`, `\\b`, `\\D`, `\\d`, `\\S`, + `\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With this option, + these sequences only match ASCII characters. + -For example, this regex has all three flags enabled: +For example, this regex has the first three flags enabled: ```jldoctest julia> match(r"a+.*b+.*?d\$"ism, "Goodbye,\\nOh, angry,\\nBad world\\n") @@ -83,15 +91,16 @@ RegexMatch("angry,\\nBad world") macro r_str(pattern, flags...) Regex(pattern, flags...) end function show(io::IO, re::Regex) - imsx = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED + imsxa = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED|PCRE.UCP opts = re.compile_options - if (opts & ~imsx) == DEFAULT_COMPILER_OPTS + if (opts & ~imsxa) == (DEFAULT_COMPILER_OPTS & ~imsxa) print(io, 'r') print_quoted_literal(io, re.pattern) if (opts & PCRE.CASELESS ) != 0; print(io, 'i'); end if (opts & PCRE.MULTILINE) != 0; print(io, 'm'); end if (opts & PCRE.DOTALL ) != 0; print(io, 's'); end if (opts & PCRE.EXTENDED ) != 0; print(io, 'x'); end + if (opts & PCRE.UCP ) == 0; print(io, 'a'); end else print(io, "Regex(") show(io, re.pattern) diff --git a/doc/REQUIRE b/doc/REQUIRE index b24ae448a9bd2b..74a6339f50a266 100644 --- a/doc/REQUIRE +++ b/doc/REQUIRE @@ -1,3 +1,3 @@ Compat 0.62.1 0.62.1+ DocStringExtensions 0.4.4 0.4.4+ -Documenter 0.17.0 0.17.0+ +Documenter 0.18.0 0.18.0+ diff --git a/doc/make.jl b/doc/make.jl index ac2f9f79fb389a..3e60e99ed9f0c7 100644 --- a/doc/make.jl +++ b/doc/make.jl @@ -177,6 +177,7 @@ makedocs( ENV["TRAVIS_JULIA_VERSION"] = "nightly" deploydocs( + julia = "nightly", repo = "github.com/JuliaLang/julia.git", target = "_build/html/en", dirname = "en", diff --git a/doc/src/manual/interfaces.md b/doc/src/manual/interfaces.md index 9441ae1e9303bf..5f418cda1fa6c8 100644 --- a/doc/src/manual/interfaces.md +++ b/doc/src/manual/interfaces.md @@ -525,7 +525,7 @@ list can — and often does — include 
other nested `Broadcasted` wrappers. For a complete example, let's say you have created a type, `ArrayAndChar`, that stores an array and a single character: -```jldoctest ArrayAndChar +```jldoctest ArrayAndChar; output = false struct ArrayAndChar{T,N} <: AbstractArray{T,N} data::Array{T,N} char::Char @@ -540,14 +540,14 @@ Base.showarg(io::IO, A::ArrayAndChar, toplevel) = print(io, typeof(A), " with ch You might want broadcasting to preserve the `char` "metadata." First we define -```jldoctest ArrayAndChar +```jldoctest ArrayAndChar; output = false Base.BroadcastStyle(::Type{<:ArrayAndChar}) = Broadcast.ArrayStyle{ArrayAndChar}() # output ``` This means we must also define a corresponding `similar` method: -```jldoctest ArrayAndChar; filter = r"(^find_aac \(generic function with 5 methods\)$|^$)" +```jldoctest ArrayAndChar; output = false function Base.similar(bc::Broadcast.Broadcasted{Broadcast.ArrayStyle{ArrayAndChar}}, ::Type{ElType}) where ElType # Scan the inputs for the ArrayAndChar: A = find_aac(bc) @@ -562,7 +562,7 @@ find_aac(x) = x find_aac(a::ArrayAndChar, rest) = a find_aac(::Any, rest) = find_aac(rest) # output - +find_aac (generic function with 5 methods) ``` From these definitions, one obtains the following behavior: diff --git a/test/regex.jl b/test/regex.jl index fe5ce3c7f58bda..8882fa6985bf35 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -73,3 +73,7 @@ end @test_throws ErrorException Regex("\Udfff") # code points 0xd800-0xdfff are not defined @test_throws ErrorException Regex("\xc0\x80") # overlong 2-byte sequence @test_throws ErrorException Regex("\xff") # illegal byte (0xfe or 0xff) + +# 'a' flag to disable UCP +@test match(r"\w+", "Düsseldorf").match == "Düsseldorf" +@test match(r"\w+"a, "Düsseldorf").match == "D"