Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement capture group references in substitution strings #11849

Merged
merged 1 commit into from
Jul 23, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1340,6 +1340,7 @@ export
# notation for certain types
@b_str, # byte vector
@r_str, # regex
@s_str, # regex substitution string
@v_str, # version number

# documentation
Expand Down
17 changes: 17 additions & 0 deletions base/pcre.jl
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,23 @@ function substring_number_from_name(re, name)
(Ptr{Void}, Cstring), re, name)
end

# Ask PCRE2 (`pcre2_substring_length_bynumber_8`) for the length of capture
# group `number` in `match_data` and return it as an `Int`.
# A negative return code from the library is raised as an error carrying the
# PCRE error message.
function substring_length_bynumber(match_data, number)
    len_ref = Ref{Csize_t}()
    status = ccall((:pcre2_substring_length_bynumber_8, PCRE_LIB), Cint,
                   (Ptr{Void}, UInt32, Ref{Csize_t}),
                   match_data, number, len_ref)
    if status < 0
        error("PCRE error: $(err_message(status))")
    end
    return convert(Int, len_ref[])
end

# Copy capture group `number` of `match_data` into the memory at `buf` using
# `pcre2_substring_copy_bynumber_8`. `buf_size` is passed to the library as the
# in/out size argument; the value the library leaves in it is returned as an `Int`.
# A negative return code from the library is raised as an error carrying the
# PCRE error message.
function substring_copy_bynumber(match_data, number, buf, buf_size)
    size_ref = Ref{Csize_t}(buf_size)
    status = ccall((:pcre2_substring_copy_bynumber_8, PCRE_LIB), Cint,
                   (Ptr{Void}, UInt32, Ptr{UInt8}, Ref{Csize_t}),
                   match_data, number, buf, size_ref)
    if status < 0
        error("PCRE error: $(err_message(status))")
    end
    return convert(Int, size_ref[])
end

function capture_names(re)
name_count = info(re, INFO_NAMECOUNT, UInt32)
name_entry_size = info(re, INFO_NAMEENTRYSIZE, UInt32)
Expand Down
83 changes: 83 additions & 0 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,89 @@ search(s::AbstractString, r::Regex, idx::Integer) =
throw(ArgumentError("regex search is only available for bytestrings; use bytestring(s) to convert"))
search(s::AbstractString, r::Regex) = search(s,r,start(s))

# Wrapper around a string that marks it as a regex substitution template.
# Instances are produced by the `s"..."` string macro, and `_replace` dispatches
# on this type to expand capture-group references instead of writing the text
# verbatim. The wrapped template text is stored in the `string` field.
immutable SubstitutionString{T<:AbstractString} <: AbstractString
string::T
end

# Indexing and iteration are forwarded unchanged to the wrapped string.
function endof(s::SubstitutionString)
    return endof(s.string)
end

function next(s::SubstitutionString, idx::Int)
    return next(s.string, idx)
end

# Display in literal form: an `s` prefix followed by the wrapped string as
# `show` renders it.
function show(io::IO, s::SubstitutionString)
    print(io, "s")
    return show(io, s.string)
end

# String macro behind the `s"..."` literal: wraps the literal text in a
# SubstitutionString.
macro s_str(string)
    SubstitutionString(string)
end

# Abort with an error that names the offending replacement text `repl`.
function replace_err(repl)
    error("Bad replacement string: $repl")
end

# Append the text of capture group `group` from `re.match_data` to `io`,
# copying it directly into the buffer's backing array `io.data`.
function _write_capture(io, re, group)
    nbytes = PCRE.substring_length_bynumber(re.match_data, group)
    # One byte more than the capture length is reserved and offered to the copy.
    capacity = nbytes + 1
    ensureroom(io, capacity)
    # Take the destination pointer only after room has been ensured.
    dest = pointer(io.data, io.ptr)
    PCRE.substring_copy_bynumber(re.match_data, group, dest, capacity)
    # Advance by the capture length only, then grow the logical size if needed.
    io.ptr = io.ptr + nbytes
    io.size = max(io.size, io.ptr - 1)
end

# Expand the substitution template `repl_s` for the current match and write the
# result to `io`.
#
# Template syntax (every reference starts with a backslash):
#   \\          a single literal backslash
#   \N          capture group number N (one or more decimal digits)
#   \g<N>       capture group number N, delimited form
#   \g<name>    named capture group
# Every other character is copied to `io` unchanged.
#
# `re` is the Regex whose `match_data` holds the current match. `str` and `r`
# are accepted so the signature lines up with the other `_replace` methods; they
# are not used here.
#
# Raises an ErrorException (via `replace_err`) when the template ends right
# after a backslash, uses an unknown escape, has a malformed or empty `\g<...>`
# reference, or names a group that the regex does not define.
function _replace(io, repl_s::SubstitutionString, str, r, re)
    SUB_CHAR = '\\'
    GROUP_CHAR = 'g'
    LBRACKET = '<'
    RBRACKET = '>'
    repl = repl_s.string
    i = start(repl)
    e = endof(repl)
    while i <= e
        if repl[i] != SUB_CHAR
            # Ordinary character: copy through.
            write(io, repl[i])
            i = nextind(repl, i)
            continue
        end
        next_i = nextind(repl, i)
        next_i > e && replace_err(repl)
        c = repl[next_i]
        if c == SUB_CHAR
            # An escaped backslash stands for exactly one literal backslash;
            # emitting both would make a single backslash impossible to produce.
            write(io, SUB_CHAR)
            i = nextind(repl, next_i)
        elseif isdigit(c)
            # Only decimal digits are accepted: other Unicode numerals cannot
            # be handled by `parse(Int, ...)`.
            group = parse(Int, c)
            i = nextind(repl, next_i)
            while i <= e && isdigit(repl[i])
                group = 10group + parse(Int, repl[i])
                i = nextind(repl, i)
            end
            _write_capture(io, re, group)
        elseif c == GROUP_CHAR
            i = nextind(repl, next_i)
            (i > e || repl[i] != LBRACKET) && replace_err(repl)
            i = nextind(repl, i)
            i > e && replace_err(repl)
            groupstart = i
            while repl[i] != RBRACKET
                i = nextind(repl, i)
                i > e && replace_err(repl)
            end
            # TODO: avoid this allocation
            groupname = SubString(repl, groupstart, prevind(repl, i))
            # `\g<>` carries no group reference at all.
            isempty(groupname) && replace_err(repl)
            if all(isdigit, groupname)
                group = parse(Int, groupname)
            else
                group = PCRE.substring_number_from_name(re.regex, groupname)
                group < 0 && replace_err("Group $groupname not found in regex $re")
            end
            _write_capture(io, re, group)
            i = nextind(repl, i)
        else
            replace_err(repl)
        end
    end
end

immutable RegexMatchIterator
regex::Regex
string::UTF8String
Expand Down
8 changes: 5 additions & 3 deletions base/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,9 @@ function _rsplit{T<:AbstractString,U<:Array}(str::T, splitter, limit::Integer, k
end
#rsplit(str::AbstractString) = rsplit(str, _default_delims, 0, false)

_replacement(repl, str, j, k) = repl
_replacement(repl::Function, str, j, k) = repl(SubString(str, j, k))
# Default replacement: write `repl` to `io` as-is.
function _replace(io, repl, str, r, pattern)
    return write(io, repl)
end

# Function replacement: call `repl` on the matched substring of `str`
# (the index range `r`) and write whatever it returns.
function _replace(io, repl::Function, str, r, pattern)
    matched = SubString(str, first(r), last(r))
    return write(io, repl(matched))
end

function replace(str::ByteString, pattern, repl, limit::Integer)
n = 1
Expand All @@ -183,10 +184,11 @@ function replace(str::ByteString, pattern, repl, limit::Integer)
r = search(str,pattern,i)
j, k = first(r), last(r)
out = IOBuffer()
ensureroom(out, floor(Int, 1.2sizeof(str)))
while j != 0
if i == a || i <= k
write_sub(out, str.data, i, j-i)
write(out, _replacement(repl, str, j, k))
_replace(out, repl, str, r, pattern)
end
if k<j
i = j
Expand Down
14 changes: 14 additions & 0 deletions doc/manual/strings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -707,6 +707,20 @@ with the number or name of the capture group::
julia> m[2]
"45"

Captures can be referenced in a substitution string when using :func:`replace`
by using ``\n`` to refer to the `n`th capture group and prefixing the
substitution string with ``s``. Capture group 0 refers to the entire match object.
Named capture groups can be referenced in the substitution with ``\g<groupname>``.
For example::

julia> replace("first second", r"(\w+) (?P<agroup>\w+)", s"\g<agroup> \1")
"second first"

Numbered capture groups can also be referenced as ``\g<n>`` for disambiguation,
as in::

julia> replace("a", r".", s"\g<0>1")
"a1"

You can modify the behavior of regular expressions by some combination
of the flags ``i``, ``m``, ``s``, and ``x`` after the closing double
quote mark. These flags have the same meaning as they do in Perl, as
Expand Down
2 changes: 1 addition & 1 deletion doc/stdlib/strings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@

.. function:: replace(string, pat, r[, n])

Search for the given pattern ``pat``, and replace each occurrence with ``r``. If ``n`` is provided, replace at most ``n`` occurrences. As with search, the second argument may be a single character, a vector or a set of characters, a string, or a regular expression. If ``r`` is a function, each occurrence is replaced with ``r(s)`` where ``s`` is the matched substring.
Search for the given pattern ``pat``, and replace each occurrence with ``r``. If ``n`` is provided, replace at most ``n`` occurrences. As with search, the second argument may be a single character, a vector or a set of characters, a string, or a regular expression. If ``r`` is a function, each occurrence is replaced with ``r(s)`` where ``s`` is the matched substring. If ``pat`` is a regular expression and ``r`` is a ``SubstitutionString``, then capture group references in ``r`` are replaced with the corresponding matched text.

.. function:: split(string, [chars]; limit=0, keep=true)

Expand Down
11 changes: 8 additions & 3 deletions test/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ show(buf, r"")
@test_throws ArgumentError search(utf32("this is a test"), r"test")

# Named subpatterns
m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
@test (m[:a], m[2], m["b"]) == ("x", "y", "z")
@test sprint(show, m) == "RegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")"
let m = match(r"(?<a>.)(.)(?<b>.)", "xyz")
@test (m[:a], m[2], m["b"]) == ("x", "y", "z")
@test sprint(show, m) == "RegexMatch(\"xyz\", a=\"x\", 2=\"y\", b=\"z\")"
end

# Backcapture reference in substitution string
@test replace("abcde", r"(..)(?P<byname>d)", s"\g<byname>xy\1") == "adxybce"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be good to add a test for when the groupname doesn't exist in the regex.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

@test_throws ErrorException replace("a", r"(?P<x>)", s"\g<y>")