Skip to content

Commit

Permalink
Make UCP option the default for regex matching
Browse files Browse the repository at this point in the history
Fixes JuliaLang#27084. Regexes now match based on Unicode character properties,
rather than just ASCII character properties, e.g. `match(r"\w+", "café")`
will now match the entire word (and not just `caf`). This behavior can
be disabled with the `a` flag to the regex string macro (e.g. `r"\w+"a`).
  • Loading branch information
Keno authored and Liozou committed May 24, 2018
1 parent 2b3ab09 commit fd57df3
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 10 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,11 @@ This section lists changes that do not have deprecation warnings.
* `mv`,`cp`, `touch`, `mkdir`, `mkpath` now return the path that was created/modified
rather than `nothing` ([#27071]).

* Regular expressions now default to UCP mode. Escape sequences such as `\w`
will now match based on Unicode character properties, e.g. `r"\w+"` will
match all of `café` (not just `caf`). Add the `a` modifier (e.g. `r"\w+"a`) to
restore the previous behavior ([#27189]).

Library improvements
--------------------

Expand Down
3 changes: 2 additions & 1 deletion base/pcre.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ const COMPILE_MASK =
NO_START_OPTIMIZE |
NO_UTF_CHECK |
UNGREEDY |
UTF
UTF |
UCP

const EXECUTE_MASK =
NEWLINE_ANY |
Expand Down
27 changes: 18 additions & 9 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

include("pcre.jl")

const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX
const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX | PCRE.UCP
const DEFAULT_MATCH_OPTS = zero(UInt32)

mutable struct Regex
Expand Down Expand Up @@ -40,11 +40,15 @@ end
function Regex(pattern::AbstractString, flags::AbstractString)
options = DEFAULT_COMPILER_OPTS
for f in flags
options |= f=='i' ? PCRE.CASELESS :
f=='m' ? PCRE.MULTILINE :
f=='s' ? PCRE.DOTALL :
f=='x' ? PCRE.EXTENDED :
throw(ArgumentError("unknown regex flag: $f"))
if f == 'a'
options &= ~PCRE.UCP
else
options |= f=='i' ? PCRE.CASELESS :
f=='m' ? PCRE.MULTILINE :
f=='s' ? PCRE.DOTALL :
f=='x' ? PCRE.EXTENDED :
throw(ArgumentError("unknown regex flag: $f"))
end
end
Regex(pattern, options, DEFAULT_MATCH_OPTS)
end
Expand Down Expand Up @@ -72,8 +76,12 @@ after the ending quote, to change its behaviour:
- `s` allows the `.` metacharacter to match newlines.
- `x` enables "comment mode": whitespace in the pattern is ignored except when escaped with
`\\`, and `#` is treated as starting a comment.
- `a` disables `UCP` mode (enables ASCII mode). By default `\\B`, `\\b`, `\\D`, `\\d`, `\\S`,
`\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With this option,
these sequences only match ASCII characters.
For example, this regex has all three flags enabled:
For example, this regex has the first three flags enabled:
```jldoctest
julia> match(r"a+.*b+.*?d\$"ism, "Goodbye,\\nOh, angry,\\nBad world\\n")
Expand All @@ -83,15 +91,16 @@ RegexMatch("angry,\\nBad world")
macro r_str(pattern, flags...) Regex(pattern, flags...) end

function show(io::IO, re::Regex)
imsx = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED
imsxa = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED|PCRE.UCP
opts = re.compile_options
if (opts & ~imsx) == DEFAULT_COMPILER_OPTS
if (opts & ~imsxa) == (DEFAULT_COMPILER_OPTS & ~imsxa)
print(io, 'r')
print_quoted_literal(io, re.pattern)
if (opts & PCRE.CASELESS ) != 0; print(io, 'i'); end
if (opts & PCRE.MULTILINE) != 0; print(io, 'm'); end
if (opts & PCRE.DOTALL ) != 0; print(io, 's'); end
if (opts & PCRE.EXTENDED ) != 0; print(io, 'x'); end
if (opts & PCRE.UCP ) == 0; print(io, 'a'); end
else
print(io, "Regex(")
show(io, re.pattern)
Expand Down
4 changes: 4 additions & 0 deletions test/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,7 @@ end
@test_throws ErrorException Regex("\Udfff") # code points 0xd800-0xdfff are not defined
@test_throws ErrorException Regex("\xc0\x80") # overlong 2-byte sequence
@test_throws ErrorException Regex("\xff") # illegal byte (0xfe or 0xff)

# 'a' flag to disable UCP
@test match(r"\w+", "Düsseldorf").match == "Düsseldorf"
@test match(r"\w+"a, "Düsseldorf").match == "D"

0 comments on commit fd57df3

Please sign in to comment.