Merge branch 'master' into jb/async

JuliaLang · May 22, 2018 · 816d742 · 816d742
2 parents 09f9d63 + 2f728b8
commit 816d742
Show file tree

Hide file tree

Showing 9 changed files with 40 additions and 20 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -477,6 +477,11 @@ This section lists changes that do not have deprecation warnings.
     a called function `f`, have `f` return the task and put `@async wait(f(...))` within
     the `@sync` block ([#27164]).
 
+  * Regular expressions now default to UCP mode. Escape sequences such as `\w`
+    will now match based on Unicode character properties, e.g. `r"\w+"` will
+    match `café` (not just `caf`). Add the `a` modifier (e.g. `r"\w+"a`) to
+    restore the previous behavior ([#27189]).
+
 Library improvements
 --------------------
 

diff --git a/base/io.jl b/base/io.jl
@@ -448,28 +448,28 @@ ENDIAN_BOM
 """
     ntoh(x)
 
-Converts the endianness of a value from Network byte order (big-endian) to that used by the Host.
+Convert the endianness of a value from Network byte order (big-endian) to that used by the Host.
 """
 ntoh(x)
 
 """
     hton(x)
 
-Converts the endianness of a value from that used by the Host to Network byte order (big-endian).
+Convert the endianness of a value from that used by the Host to Network byte order (big-endian).
 """
 hton(x)
 
 """
     ltoh(x)
 
-Converts the endianness of a value from Little-endian to that used by the Host.
+Convert the endianness of a value from Little-endian to that used by the Host.
 """
 ltoh(x)
 
 """
     htol(x)
 
-Converts the endianness of a value from that used by the Host to Little-endian.
+Convert the endianness of a value from that used by the Host to Little-endian.
 """
 htol(x)
 

diff --git a/base/path.jl b/base/path.jl
@@ -27,7 +27,7 @@ if Sys.isunix()
 elseif Sys.iswindows()
     const path_separator    = "\\"
     const path_separator_re = r"[/\\]+"
-    const path_absolute_re  = r"^(?:\w+:)?[/\\]"
+    const path_absolute_re  = r"^(?:[A-Za-z]+:)?[/\\]"
     const path_directory_re = r"(?:^|[/\\])\.{0,2}$"
     const path_dir_splitter = r"^(.*?)([/\\]+)([^/\\]*)$"
     const path_ext_splitter = r"^((?:.*[/\\])?(?:\.|[^/\\\.])[^/\\]*?)(\.[^/\\\.]*|)$"

diff --git a/base/pcre.jl b/base/pcre.jl
@@ -49,7 +49,8 @@ const COMPILE_MASK      =
       NO_START_OPTIMIZE |
       NO_UTF_CHECK      |
       UNGREEDY          |
-      UTF
+      UTF               |
+      UCP
 
 const EXECUTE_MASK      =
       NEWLINE_ANY       |

diff --git a/base/regex.jl b/base/regex.jl
@@ -4,7 +4,7 @@
 
 include("pcre.jl")
 
-const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX
+const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX | PCRE.UCP
 const DEFAULT_MATCH_OPTS = zero(UInt32)
 
 mutable struct Regex
@@ -40,11 +40,15 @@ end
 function Regex(pattern::AbstractString, flags::AbstractString)
     options = DEFAULT_COMPILER_OPTS
     for f in flags
-        options |= f=='i' ? PCRE.CASELESS  :
-                   f=='m' ? PCRE.MULTILINE :
-                   f=='s' ? PCRE.DOTALL    :
-                   f=='x' ? PCRE.EXTENDED  :
-                   throw(ArgumentError("unknown regex flag: $f"))
+        if f == 'a'
+            options &= ~PCRE.UCP
+        else
+            options |= f=='i' ? PCRE.CASELESS  :
+                       f=='m' ? PCRE.MULTILINE :
+                       f=='s' ? PCRE.DOTALL    :
+                       f=='x' ? PCRE.EXTENDED  :
+                       throw(ArgumentError("unknown regex flag: $f"))
+        end
     end
     Regex(pattern, options, DEFAULT_MATCH_OPTS)
 end
@@ -72,8 +76,12 @@ after the ending quote, to change its behaviour:
 - `s` allows the `.` modifier to match newlines.
 - `x` enables "comment mode": whitespace is ignored except when escaped with `\\`, and `#`
   is treated as starting a comment.
+- `a` disables `UCP` mode (enables ASCII mode). By default `\\B`, `\\b`, `\\D`, `\\d`, `\\S`,
+  `\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With this option,
+  these sequences only match ASCII characters.
+
 
-For example, this regex has all three flags enabled:
+For example, this regex has the first three flags enabled:
 
 ```jldoctest
 julia> match(r"a+.*b+.*?d\$"ism, "Goodbye,\\nOh, angry,\\nBad world\\n")
@@ -83,15 +91,16 @@ RegexMatch("angry,\\nBad world")
 macro r_str(pattern, flags...) Regex(pattern, flags...) end
 
 function show(io::IO, re::Regex)
-    imsx = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED
+    imsxa = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED|PCRE.UCP
     opts = re.compile_options
-    if (opts & ~imsx) == DEFAULT_COMPILER_OPTS
+    if (opts & ~imsxa) == (DEFAULT_COMPILER_OPTS & ~imsxa)
         print(io, 'r')
         print_quoted_literal(io, re.pattern)
         if (opts & PCRE.CASELESS ) != 0; print(io, 'i'); end
         if (opts & PCRE.MULTILINE) != 0; print(io, 'm'); end
         if (opts & PCRE.DOTALL   ) != 0; print(io, 's'); end
         if (opts & PCRE.EXTENDED ) != 0; print(io, 'x'); end
+        if (opts & PCRE.UCP      ) == 0; print(io, 'a'); end
     else
         print(io, "Regex(")
         show(io, re.pattern)

diff --git a/doc/REQUIRE b/doc/REQUIRE
@@ -1,3 +1,3 @@
 Compat 0.62.1 0.62.1+
 DocStringExtensions 0.4.4 0.4.4+
-Documenter 0.17.0 0.17.0+
+Documenter 0.18.0 0.18.0+
diff --git a/doc/make.jl b/doc/make.jl
@@ -177,6 +177,7 @@ makedocs(
 ENV["TRAVIS_JULIA_VERSION"] = "nightly"
 
 deploydocs(
+    julia = "nightly",
     repo = "github.com/JuliaLang/julia.git",
     target = "_build/html/en",
     dirname = "en",

diff --git a/doc/src/manual/interfaces.md b/doc/src/manual/interfaces.md
@@ -525,7 +525,7 @@ list can — and often does — include other nested `Broadcasted` wrappers.
 For a complete example, let's say you have created a type, `ArrayAndChar`, that stores an
 array and a single character:
 
-```jldoctest ArrayAndChar
+```jldoctest ArrayAndChar; output = false
 struct ArrayAndChar{T,N} <: AbstractArray{T,N}
     data::Array{T,N}
     char::Char
@@ -540,14 +540,14 @@ Base.showarg(io::IO, A::ArrayAndChar, toplevel) = print(io, typeof(A), " with ch
 
 You might want broadcasting to preserve the `char` "metadata." First we define
 
-```jldoctest ArrayAndChar
+```jldoctest ArrayAndChar; output = false
 Base.BroadcastStyle(::Type{<:ArrayAndChar}) = Broadcast.ArrayStyle{ArrayAndChar}()
 # output
 
 ```
 
 This means we must also define a corresponding `similar` method:
-```jldoctest ArrayAndChar; filter = r"(^find_aac \(generic function with 5 methods\)$|^$)"
+```jldoctest ArrayAndChar; output = false
 function Base.similar(bc::Broadcast.Broadcasted{Broadcast.ArrayStyle{ArrayAndChar}}, ::Type{ElType}) where ElType
     # Scan the inputs for the ArrayAndChar:
     A = find_aac(bc)
@@ -562,7 +562,7 @@ find_aac(x) = x
 find_aac(a::ArrayAndChar, rest) = a
 find_aac(::Any, rest) = find_aac(rest)
 # output
-
+find_aac (generic function with 5 methods)
 ```
 
 From these definitions, one obtains the following behavior:

diff --git a/test/regex.jl b/test/regex.jl
@@ -73,3 +73,7 @@ end
 @test_throws ErrorException Regex("\Udfff") # code points 0xd800-0xdfff are not defined
 @test_throws ErrorException Regex("\xc0\x80") #  overlong 2-byte sequence
 @test_throws ErrorException Regex("\xff") # illegal byte (0xfe or 0xff)
+
+# 'a' flag to disable UCP
+@test match(r"\w+", "Düsseldorf").match == "Düsseldorf"
+@test match(r"\w+"a, "Düsseldorf").match == "D"