Make UCP option the default for regex matching

Fixes JuliaLang#27084. Regexes now match based on Unicode character properties, rather than just ASCII character properties, e.g. `match(r"\w+", "café")` will now match the entire word (and not just `caf`). This behavior can be disabled with the `a` flag to the regex string macro (e.g. `r"\w+"a`).
Liozou · May 24, 2018 · fd57df3 · fd57df3
1 parent 2b3ab09
commit fd57df3
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 10 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -472,6 +472,11 @@ This section lists changes that do not have deprecation warnings.
   * `mv`,`cp`, `touch`, `mkdir`, `mkpath` now return the path that was created/modified
     rather than `nothing` ([#27071]).
 
+  * Regular expressions now default to UCP mode. Escape sequences such as `\w`
+    will now match based on Unicode character properties, e.g. `r"\w+"` will
+    match `café` (not just `caf`). Add the `a` modifier (e.g. `r"\w+"a`) to
+    restore the previous behavior ([#27189]).
+
 Library improvements
 --------------------
 

diff --git a/base/pcre.jl b/base/pcre.jl
@@ -49,7 +49,8 @@ const COMPILE_MASK      =
       NO_START_OPTIMIZE |
       NO_UTF_CHECK      |
       UNGREEDY          |
-      UTF
+      UTF               |
+      UCP
 
 const EXECUTE_MASK      =
       NEWLINE_ANY       |

diff --git a/base/regex.jl b/base/regex.jl
@@ -4,7 +4,7 @@
 
 include("pcre.jl")
 
-const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX
+const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX | PCRE.UCP
 const DEFAULT_MATCH_OPTS = zero(UInt32)
 
 mutable struct Regex
@@ -40,11 +40,15 @@ end
 function Regex(pattern::AbstractString, flags::AbstractString)
     options = DEFAULT_COMPILER_OPTS
     for f in flags
-        options |= f=='i' ? PCRE.CASELESS  :
-                   f=='m' ? PCRE.MULTILINE :
-                   f=='s' ? PCRE.DOTALL    :
-                   f=='x' ? PCRE.EXTENDED  :
-                   throw(ArgumentError("unknown regex flag: $f"))
+        if f == 'a'
+            options &= ~PCRE.UCP
+        else
+            options |= f=='i' ? PCRE.CASELESS  :
+                       f=='m' ? PCRE.MULTILINE :
+                       f=='s' ? PCRE.DOTALL    :
+                       f=='x' ? PCRE.EXTENDED  :
+                       throw(ArgumentError("unknown regex flag: $f"))
+        end
     end
     Regex(pattern, options, DEFAULT_MATCH_OPTS)
 end
@@ -72,8 +76,12 @@ after the ending quote, to change its behaviour:
 - `s` allows the `.` modifier to match newlines.
 - `x` enables "comment mode": whitespace is ignored except when escaped with `\\`, and `#`
   is treated as starting a comment.
+- `a` disables `UCP` mode (enables ASCII mode). By default `\\B`, `\\b`, `\\D`, `\\d`, `\\S`,
+  `\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With this option,
+  these sequences use ASCII character classes only (e.g. `\\w` no longer matches `é`).
+
 
-For example, this regex has all three flags enabled:
+For example, this regex has the first three flags enabled:
 
 ```jldoctest
 julia> match(r"a+.*b+.*?d\$"ism, "Goodbye,\\nOh, angry,\\nBad world\\n")
@@ -83,15 +91,16 @@ RegexMatch("angry,\\nBad world")
 macro r_str(pattern, flags...) Regex(pattern, flags...) end
 
 function show(io::IO, re::Regex)
-    imsx = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED
+    imsxa = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED|PCRE.UCP
     opts = re.compile_options
-    if (opts & ~imsx) == DEFAULT_COMPILER_OPTS
+    if (opts & ~imsxa) == (DEFAULT_COMPILER_OPTS & ~imsxa)
         print(io, 'r')
         print_quoted_literal(io, re.pattern)
         if (opts & PCRE.CASELESS ) != 0; print(io, 'i'); end
         if (opts & PCRE.MULTILINE) != 0; print(io, 'm'); end
         if (opts & PCRE.DOTALL   ) != 0; print(io, 's'); end
         if (opts & PCRE.EXTENDED ) != 0; print(io, 'x'); end
+        if (opts & PCRE.UCP      ) == 0; print(io, 'a'); end
     else
         print(io, "Regex(")
         show(io, re.pattern)

diff --git a/test/regex.jl b/test/regex.jl
@@ -73,3 +73,7 @@ end
 @test_throws ErrorException Regex("\Udfff") # code points 0xd800-0xdfff are not defined
 @test_throws ErrorException Regex("\xc0\x80") #  overlong 2-byte sequence
 @test_throws ErrorException Regex("\xff") # illegal byte (0xfe or 0xff)
+
+# 'a' flag to disable UCP
+@test match(r"\w+", "Düsseldorf").match == "Düsseldorf"
+@test match(r"\w+"a, "Düsseldorf").match == "D"