Skip to content
13 changes: 7 additions & 6 deletions spec/std/regex_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -201,17 +201,18 @@ describe "Regex" do
end

it "doesn't crash with a large single line string" do
# Load the fixture that both engine branches below match against.
str = File.read(datapath("large_single_line_string.txt"))

# Compile-time macro branch on the regex engine this build resolves to.
{% if Regex::Engine.resolve.name == "Regex::PCRE" %}
# Ask libpcre whether JIT is available; the spec is marked pending when
# `jit_enabled` is not 1.
LibPCRE.config LibPCRE::CONFIG_JIT, out jit_enabled
pending! "PCRE JIT mode not available." unless 1 == jit_enabled

str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
{% else %}
# This spec requires a fairly large depth limit. Some package builds
# have a more restrictive value which would make this test fail.
pending! "PCRE2 depth limit too low" unless Regex::PCRE2.config(LibPCRE2::CONFIG_DEPTHLIMIT, UInt32) > 8192
# Can't use regex literal because the *LIMIT_DEPTH verb is not supported in libpcre (only libpcre2)
# and thus the compiler doesn't recognize it.
str.matches?(Regex.new("(*LIMIT_DEPTH=8192)^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$"))
{% end %}

# NOTE(review): the fixture is read and matched a second time here, repeating
# what the branches above already do. Presumably these are the pre-change
# lines of the diff; confirm whether they are meant to remain.
str = File.read(datapath("large_single_line_string.txt"))
str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
# We don't care whether this actually matches or not, it's just to make
# sure the engine does not stack overflow with a large string.
end
Expand Down
16 changes: 15 additions & 1 deletion src/regex/lib_pcre2.cr
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,23 @@ lib LibPCRE2
# Pattern compilation and release of the compiled code.
fun compile = pcre2_compile_8(pattern : UInt8*, length : LibC::SizeT, options : UInt32, errorcode : LibC::SizeT*, erroroffset : Int*, ccontext : CompileContext*) : Code*
fun code_free = pcre2_code_free_8(code : Code*) : Void

# Opaque handle returned by `match_context_create`; accepted by
# `jit_stack_assign` and by the `MatchContext` variant of `match` below.
type MatchContext = Void*
# NOTE(review): `gcontext` is typed `Void*` here, while the other bindings in
# this section take `GeneralContext` — confirm this difference is intended.
fun match_context_create = pcre2_match_context_create_8(gcontext : Void*) : MatchContext

# Option flags for the `options` argument of `jit_compile`.
JIT_COMPLETE = 0x00000001_u32 # For full matching
JIT_PARTIAL_SOFT = 0x00000002_u32
JIT_PARTIAL_HARD = 0x00000004_u32
JIT_INVALID_UTF = 0x00000100_u32
fun jit_compile = pcre2_jit_compile_8(code : Code*, options : UInt32) : Int

# Opaque handle returned by `jit_stack_create`.
type JITStack = Void*

# Creation of a JIT stack (start size and maximum size) and assignment to a
# match context.
fun jit_stack_create = pcre2_jit_stack_create_8(startsize : LibC::SizeT, maxsize : LibC::SizeT, gcontext : GeneralContext) : JITStack
fun jit_stack_assign = pcre2_jit_stack_assign_8(mcontext : MatchContext, callable_function : Void*, callable_data : Void*) : Void

fun pattern_info = pcre2_pattern_info_8(code : Code*, what : UInt32, where : Void*) : Int

# NOTE(review): `match` is declared twice below; the two declarations differ
# only in the type of `mcontext` (`Void*` vs `MatchContext`). Presumably these
# are the old and new sides of the diff — confirm only one is meant to remain.
fun match = pcre2_match_8(code : Code*, subject : UInt8*, length : LibC::SizeT, startoffset : LibC::SizeT, options : UInt32, match_data : MatchData*, mcontext : Void*) : Int
fun match = pcre2_match_8(code : Code*, subject : UInt8*, length : LibC::SizeT, startoffset : LibC::SizeT, options : UInt32, match_data : MatchData*, mcontext : MatchContext) : Int
fun match_data_create_from_pattern = pcre2_match_data_create_from_pattern_8(code : Code*, gcontext : GeneralContext) : MatchData*
fun match_data_free = pcre2_match_data_free_8(match_data : MatchData*) : Void

Expand Down
31 changes: 30 additions & 1 deletion src/regex/pcre2.cr
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,20 @@ module Regex::PCRE2
@re = PCRE2.compile(source, pcre2_options(options) | LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
raise ArgumentError.new(error_message)
end

jit_compile
end

# Requests JIT compilation of the compiled pattern `@re` with the
# `JIT_COMPLETE` option.
#
# A negative return value is converted to a `LibPCRE2::Error`. The
# `jit_badoption` error is ignored; any other error raises `ArgumentError`.
private def jit_compile : Nil
ret = LibPCRE2.jit_compile(@re, LibPCRE2::JIT_COMPLETE)
if ret < 0
case error = LibPCRE2::Error.new(ret)
when .jit_badoption?
# Deliberately ignored: no JIT-compiled code is required to continue.
# NOTE(review): presumably this is what signals that JIT is unsupported — confirm.
else
raise ArgumentError.new("Regex JIT compile error: #{error}")

@beta-ziliani beta-ziliani Jan 3, 2023

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this a bit too harsh? Without JIT compilation the application might still work, even if slowly. That said, I'm not sure how we could let the user know that something went wrong.

@straight-shoota straight-shoota Jan 3, 2023

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what errors can reasonably appear here. When JIT compilation is not supported, that's indicated by ERROR_JIT_BADOPTION and handled in the previous branch (we ignore that).
Anything else, I don't know. But I would expect those to be more fatal errors, where you may not be able to just continue on.
A more plausible point for ignoring errors might be when JIT stack allocation fails (jit_stack_create).

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PCRE2_ERROR_NOMEMORY is another one we might not want to raise on. I wonder if it makes sense to just ignore any error and have a `@compiled : Bool` to know whether it was compiled or not.

end
end
end

protected def self.compile(source, options)
Expand Down Expand Up @@ -123,9 +137,24 @@ module Regex::PCRE2
LibPCRE2.general_context_create(->(size : LibC::Int, data : Void*) { GC.malloc(size) }.pointer, ->(pointer : Void*, data : Void*) { GC.free(pointer) }.pointer, nil)
end

# Returns a JIT stack that's shared in the current thread.
#
# Only a single `match` function can run per thread at any given time, so there
# can't be any concurrent access to the JIT stack.
#
# Raises if `LibPCRE2.jit_stack_create` returns a null pointer.
@[ThreadLocal]
class_getter jit_stack : LibPCRE2::JITStack do
# Arguments are the stack's start size (32_768) and maximum size (1_048_576),
# followed by the shared general context.
jit_stack = LibPCRE2.jit_stack_create(32_768, 1_048_576, Regex::PCRE2.general_context)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe for a follow-up, but these magic numbers could be or'ed with values from ENV in order to give some control. I'm thinking in particular of memory-restricted apps.

if jit_stack.null?
raise "Error allocating JIT stack"
end
jit_stack
end

private def match_data(str, byte_index, options)
match_data = LibPCRE2.match_data_create_from_pattern(@re, Regex::PCRE2.general_context)
match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_options(options) | LibPCRE2::NO_UTF_CHECK, match_data, nil)
match_context = LibPCRE2.match_context_create(nil)
LibPCRE2.jit_stack_assign(match_context, nil, Regex::PCRE2.jit_stack.as(Void*))
match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_options(options) | LibPCRE2::NO_UTF_CHECK, match_data, match_context)

if match_count < 0
case error = LibPCRE2::Error.new(match_count)
Expand Down