Skip to content

Commit

Permalink
Merge branch 'master' into jb/async
Browse files Browse the repository at this point in the history
  • Loading branch information
JeffBezanson authored May 22, 2018
2 parents 09f9d63 + 2f728b8 commit 816d742
Show file tree
Hide file tree
Showing 9 changed files with 40 additions and 20 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,11 @@ This section lists changes that do not have deprecation warnings.
a called function `f`, have `f` return the task and put `@async wait(f(...))` within
the `@sync` block ([#27164]).

* Regular expressions now default to UCP mode. Escape sequences such as `\w`
will now match based on Unicode character properties, e.g. `r"\w+"` will
match `café` (not just `caf`). Add the `a` modifier (e.g. `r"\w+"a`) to
restore the previous behavior ([#27189]).

Library improvements
--------------------

Expand Down
8 changes: 4 additions & 4 deletions base/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -448,28 +448,28 @@ ENDIAN_BOM
"""
ntoh(x)
Converts the endianness of a value from Network byte order (big-endian) to that used by the Host.
Convert the endianness of a value from Network byte order (big-endian) to that used by the Host.
"""
ntoh(x)

"""
hton(x)
Converts the endianness of a value from that used by the Host to Network byte order (big-endian).
Convert the endianness of a value from that used by the Host to Network byte order (big-endian).
"""
hton(x)

"""
ltoh(x)
Converts the endianness of a value from Little-endian to that used by the Host.
Convert the endianness of a value from Little-endian to that used by the Host.
"""
ltoh(x)

"""
htol(x)
Converts the endianness of a value from that used by the Host to Little-endian.
Convert the endianness of a value from that used by the Host to Little-endian.
"""
htol(x)

Expand Down
2 changes: 1 addition & 1 deletion base/path.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ if Sys.isunix()
elseif Sys.iswindows()
const path_separator = "\\"
const path_separator_re = r"[/\\]+"
const path_absolute_re = r"^(?:\w+:)?[/\\]"
const path_absolute_re = r"^(?:[A-Za-z]+:)?[/\\]"
const path_directory_re = r"(?:^|[/\\])\.{0,2}$"
const path_dir_splitter = r"^(.*?)([/\\]+)([^/\\]*)$"
const path_ext_splitter = r"^((?:.*[/\\])?(?:\.|[^/\\\.])[^/\\]*?)(\.[^/\\\.]*|)$"
Expand Down
3 changes: 2 additions & 1 deletion base/pcre.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ const COMPILE_MASK =
NO_START_OPTIMIZE |
NO_UTF_CHECK |
UNGREEDY |
UTF
UTF |
UCP

const EXECUTE_MASK =
NEWLINE_ANY |
Expand Down
27 changes: 18 additions & 9 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

include("pcre.jl")

const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX
const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX | PCRE.UCP
const DEFAULT_MATCH_OPTS = zero(UInt32)

mutable struct Regex
Expand Down Expand Up @@ -40,11 +40,15 @@ end
function Regex(pattern::AbstractString, flags::AbstractString)
options = DEFAULT_COMPILER_OPTS
for f in flags
options |= f=='i' ? PCRE.CASELESS :
f=='m' ? PCRE.MULTILINE :
f=='s' ? PCRE.DOTALL :
f=='x' ? PCRE.EXTENDED :
throw(ArgumentError("unknown regex flag: $f"))
if f == 'a'
options &= ~PCRE.UCP
else
options |= f=='i' ? PCRE.CASELESS :
f=='m' ? PCRE.MULTILINE :
f=='s' ? PCRE.DOTALL :
f=='x' ? PCRE.EXTENDED :
throw(ArgumentError("unknown regex flag: $f"))
end
end
Regex(pattern, options, DEFAULT_MATCH_OPTS)
end
Expand Down Expand Up @@ -72,8 +76,12 @@ after the ending quote, to change its behaviour:
- `s` allows the `.` metacharacter to match newlines.
- `x` enables "comment mode": whitespace is ignored except when escaped with `\\`, and `#`
is treated as starting a comment.
- `a` disables `UCP` mode (enables ASCII mode). By default `\\B`, `\\b`, `\\D`, `\\d`, `\\S`,
`\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With this option,
these sequences only match ASCII characters.
For example, this regex has all three flags enabled:
For example, this regex has the first three flags enabled:
```jldoctest
julia> match(r"a+.*b+.*?d\$"ism, "Goodbye,\\nOh, angry,\\nBad world\\n")
Expand All @@ -83,15 +91,16 @@ RegexMatch("angry,\\nBad world")
macro r_str(pattern, flags...) Regex(pattern, flags...) end

function show(io::IO, re::Regex)
imsx = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED
imsxa = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED|PCRE.UCP
opts = re.compile_options
if (opts & ~imsx) == DEFAULT_COMPILER_OPTS
if (opts & ~imsxa) == (DEFAULT_COMPILER_OPTS & ~imsxa)
print(io, 'r')
print_quoted_literal(io, re.pattern)
if (opts & PCRE.CASELESS ) != 0; print(io, 'i'); end
if (opts & PCRE.MULTILINE) != 0; print(io, 'm'); end
if (opts & PCRE.DOTALL ) != 0; print(io, 's'); end
if (opts & PCRE.EXTENDED ) != 0; print(io, 'x'); end
if (opts & PCRE.UCP ) == 0; print(io, 'a'); end
else
print(io, "Regex(")
show(io, re.pattern)
Expand Down
2 changes: 1 addition & 1 deletion doc/REQUIRE
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Compat 0.62.1 0.62.1+
DocStringExtensions 0.4.4 0.4.4+
Documenter 0.17.0 0.17.0+
Documenter 0.18.0 0.18.0+
1 change: 1 addition & 0 deletions doc/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ makedocs(
ENV["TRAVIS_JULIA_VERSION"] = "nightly"

deploydocs(
julia = "nightly",
repo = "github.com/JuliaLang/julia.git",
target = "_build/html/en",
dirname = "en",
Expand Down
8 changes: 4 additions & 4 deletions doc/src/manual/interfaces.md
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ list can — and often does — include other nested `Broadcasted` wrappers.
For a complete example, let's say you have created a type, `ArrayAndChar`, that stores an
array and a single character:

```jldoctest ArrayAndChar
```jldoctest ArrayAndChar; output = false
struct ArrayAndChar{T,N} <: AbstractArray{T,N}
data::Array{T,N}
char::Char
Expand All @@ -540,14 +540,14 @@ Base.showarg(io::IO, A::ArrayAndChar, toplevel) = print(io, typeof(A), " with ch

You might want broadcasting to preserve the `char` "metadata." First we define

```jldoctest ArrayAndChar
```jldoctest ArrayAndChar; output = false
Base.BroadcastStyle(::Type{<:ArrayAndChar}) = Broadcast.ArrayStyle{ArrayAndChar}()
# output
```

This means we must also define a corresponding `similar` method:
```jldoctest ArrayAndChar; filter = r"(^find_aac \(generic function with 5 methods\)$|^$)"
```jldoctest ArrayAndChar; output = false
function Base.similar(bc::Broadcast.Broadcasted{Broadcast.ArrayStyle{ArrayAndChar}}, ::Type{ElType}) where ElType
# Scan the inputs for the ArrayAndChar:
A = find_aac(bc)
Expand All @@ -562,7 +562,7 @@ find_aac(x) = x
find_aac(a::ArrayAndChar, rest) = a
find_aac(::Any, rest) = find_aac(rest)
# output
find_aac (generic function with 5 methods)
```

From these definitions, one obtains the following behavior:
Expand Down
4 changes: 4 additions & 0 deletions test/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,7 @@ end
@test_throws ErrorException Regex("\Udfff") # code points 0xd800-0xdfff are not defined
@test_throws ErrorException Regex("\xc0\x80") # overlong 2-byte sequence
@test_throws ErrorException Regex("\xff") # illegal byte (0xfe or 0xff)

# 'a' flag to disable UCP
@test match(r"\w+", "Düsseldorf").match == "Düsseldorf"
@test match(r"\w+"a, "Düsseldorf").match == "D"

0 comments on commit 816d742

Please sign in to comment.