Skip to content

Commit

Permalink
Preserve regex captures across stack frames (#1447)
Browse files Browse the repository at this point in the history
* privatize state.RegexCaptures

* stack frame for regex captures

* merge

* unit-test case

* docs re stack frames for regex captures

* more
  • Loading branch information
johnkerl authored Dec 18, 2023
1 parent 1ae670f commit 4053d76
Show file tree
Hide file tree
Showing 20 changed files with 183 additions and 77 deletions.
46 changes: 23 additions & 23 deletions docs/src/data-diving-examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,11 +160,11 @@ CITRUS COUNTY 1332.9 79974.9 483785.1
<b> stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012</b>
</pre>
<pre class="pre-non-highlight-in-pair">
tiv_2011_tiv_2012_corr 0.9730497632351692
tiv_2011_tiv_2012_ols_m 0.9835583980337723
tiv_2011_tiv_2012_ols_b 433854.6428968317
tiv_2011_tiv_2012_corr 0.9730497632351701
tiv_2011_tiv_2012_ols_m 0.9835583980337732
tiv_2011_tiv_2012_ols_b 433854.6428968301
tiv_2011_tiv_2012_ols_n 36634
tiv_2011_tiv_2012_r2 0.9468258417320189
tiv_2011_tiv_2012_r2 0.9468258417320204
</pre>

<pre class="pre-highlight-in-pair">
Expand Down Expand Up @@ -322,7 +322,7 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
</pre>
<pre class="pre-non-highlight-in-pair">
u_v_corr w_x_corr
0.1334180491027861 -0.011319841199866178
0.1334180491027861 -0.011319841199852926
</pre>

<pre class="pre-highlight-in-pair">
Expand All @@ -332,22 +332,22 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
</pre>
<pre class="pre-non-highlight-in-pair">
color shape u_v_corr w_x_corr
red circle 0.9807984401887236 -0.01856553658708754
orange square 0.17685855992752927 -0.07104431573806054
green circle 0.05764419437577255 0.01179572988801509
red square 0.05574477124893523 -0.0006801456507510942
yellow triangle 0.04457273771962798 0.024604310103081825
yellow square 0.04379172927296089 -0.04462197201631237
purple circle 0.03587354936895086 0.1341133954140899
blue square 0.03241153095761164 -0.053507648119643196
blue triangle 0.015356427073158766 -0.0006089997461435399
orange circle 0.010518953877704048 -0.16279397329279383
red triangle 0.00809782571528034 0.012486621357942596
purple triangle 0.005155190909099334 -0.045057909256220656
purple square -0.025680276963377404 0.05769429647930396
green square -0.0257760734502851 -0.003265173252087127
orange triangle -0.030456661186085785 -0.1318699981926352
yellow circle -0.06477331572781474 0.07369449819706045
blue circle -0.10234761901929677 -0.030528539069837757
green triangle -0.10901825107358765 -0.04848782060162929
red circle 0.9807984401887242 -0.018565536587084836
orange square 0.17685855992752933 -0.07104431573805543
green circle 0.05764419437577257 0.011795729888018455
red square 0.0557447712489348 -0.0006801456507506415
yellow triangle 0.0445727377196281 0.024604310103079844
yellow square 0.0437917292729612 -0.044621972016306265
purple circle 0.03587354936895115 0.13411339541407613
blue square 0.03241153095761152 -0.05350764811965621
blue triangle 0.015356427073158612 -0.0006089997461408209
orange circle 0.010518953877704181 -0.1627939732927932
red triangle 0.00809782571528054 0.01248662135795501
purple triangle 0.005155190909099739 -0.04505790925621933
purple square -0.02568027696337717 0.057694296479293694
green square -0.025776073450284875 -0.0032651732520739014
orange triangle -0.030456661186085584 -0.13186999819263814
yellow circle -0.06477331572781515 0.0736944981970553
blue circle -0.1023476190192966 -0.030528539069839333
green triangle -0.10901825107358747 -0.04848782060162855
</pre>
18 changes: 1 addition & 17 deletions docs/src/reference-dsl-builtin-functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ is 2. Unary operators such as `!` and `~` show argument-count of 1; the ternary
* [**Higher-order-functions functions**](#higher-order-functions-functions): [any](#any), [apply](#apply), [every](#every), [fold](#fold), [reduce](#reduce), [select](#select), [sort](#sort).
* [**Math functions**](#math-functions): [abs](#abs), [acos](#acos), [acosh](#acosh), [asin](#asin), [asinh](#asinh), [atan](#atan), [atan2](#atan2), [atanh](#atanh), [cbrt](#cbrt), [ceil](#ceil), [cos](#cos), [cosh](#cosh), [erf](#erf), [erfc](#erfc), [exp](#exp), [expm1](#expm1), [floor](#floor), [invqnorm](#invqnorm), [log](#log), [log10](#log10), [log1p](#log1p), [logifit](#logifit), [max](#max), [min](#min), [qnorm](#qnorm), [round](#round), [roundm](#roundm), [sgn](#sgn), [sin](#sin), [sinh](#sinh), [sqrt](#sqrt), [tan](#tan), [tanh](#tanh), [urand](#urand), [urand32](#urand32), [urandelement](#urandelement), [urandint](#urandint), [urandrange](#urandrange).
* [**Stats functions**](#stats-functions): [antimode](#antimode), [count](#count), [distinct_count](#distinct_count), [kurtosis](#kurtosis), [maxlen](#maxlen), [mean](#mean), [meaneb](#meaneb), [median](#median), [minlen](#minlen), [mode](#mode), [null_count](#null_count), [percentile](#percentile), [percentiles](#percentiles), [skewness](#skewness), [sort_collection](#sort_collection), [stddev](#stddev), [sum](#sum), [sum2](#sum2), [sum3](#sum3), [sum4](#sum4), [variance](#variance).
* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [match](#match), [matchx](#matchx), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot).
* [**String functions**](#string-functions): [capitalize](#capitalize), [clean_whitespace](#clean_whitespace), [collapse_whitespace](#collapse_whitespace), [contains](#contains), [format](#format), [gssub](#gssub), [gsub](#gsub), [index](#index), [latin1_to_utf8](#latin1_to_utf8), [leftpad](#leftpad), [lstrip](#lstrip), [regextract](#regextract), [regextract_or_else](#regextract_or_else), [rightpad](#rightpad), [rstrip](#rstrip), [ssub](#ssub), [strip](#strip), [strlen](#strlen), [sub](#sub), [substr](#substr), [substr0](#substr0), [substr1](#substr1), [tolower](#tolower), [toupper](#toupper), [truncate](#truncate), [unformat](#unformat), [unformatx](#unformatx), [utf8_to_latin1](#utf8_to_latin1), [\.](#dot).
* [**System functions**](#system-functions): [exec](#exec), [hostname](#hostname), [os](#os), [system](#system), [version](#version).
* [**Time functions**](#time-functions): [dhms2fsec](#dhms2fsec), [dhms2sec](#dhms2sec), [fsec2dhms](#fsec2dhms), [fsec2hms](#fsec2hms), [gmt2localtime](#gmt2localtime), [gmt2nsec](#gmt2nsec), [gmt2sec](#gmt2sec), [hms2fsec](#hms2fsec), [hms2sec](#hms2sec), [localtime2gmt](#localtime2gmt), [localtime2nsec](#localtime2nsec), [localtime2sec](#localtime2sec), [nsec2gmt](#nsec2gmt), [nsec2gmtdate](#nsec2gmtdate), [nsec2localdate](#nsec2localdate), [nsec2localtime](#nsec2localtime), [sec2dhms](#sec2dhms), [sec2gmt](#sec2gmt), [sec2gmtdate](#sec2gmtdate), [sec2hms](#sec2hms), [sec2localdate](#sec2localdate), [sec2localtime](#sec2localtime), [strfntime](#strfntime), [strfntime_local](#strfntime_local), [strftime](#strftime), [strftime_local](#strftime_local), [strpntime](#strpntime), [strpntime_local](#strpntime_local), [strptime](#strptime), [strptime_local](#strptime_local), [sysntime](#sysntime), [systime](#systime), [systimeint](#systimeint), [upntime](#upntime), [uptime](#uptime).
* [**Typing functions**](#typing-functions): [asserting_absent](#asserting_absent), [asserting_array](#asserting_array), [asserting_bool](#asserting_bool), [asserting_boolean](#asserting_boolean), [asserting_empty](#asserting_empty), [asserting_empty_map](#asserting_empty_map), [asserting_error](#asserting_error), [asserting_float](#asserting_float), [asserting_int](#asserting_int), [asserting_map](#asserting_map), [asserting_nonempty_map](#asserting_nonempty_map), [asserting_not_array](#asserting_not_array), [asserting_not_empty](#asserting_not_empty), [asserting_not_map](#asserting_not_map), [asserting_not_null](#asserting_not_null), [asserting_null](#asserting_null), [asserting_numeric](#asserting_numeric), [asserting_present](#asserting_present), [asserting_string](#asserting_string), [is_absent](#is_absent), [is_array](#is_array), [is_bool](#is_bool), [is_boolean](#is_boolean), [is_empty](#is_empty), [is_empty_map](#is_empty_map), [is_error](#is_error), [is_float](#is_float), [is_int](#is_int), [is_map](#is_map), [is_nan](#is_nan), [is_nonempty_map](#is_nonempty_map), [is_not_array](#is_not_array), [is_not_empty](#is_not_empty), [is_not_map](#is_not_map), [is_not_null](#is_not_null), [is_null](#is_null), [is_numeric](#is_numeric), [is_present](#is_present), [is_string](#is_string), [typeof](#typeof).
Expand Down Expand Up @@ -1296,22 +1296,6 @@ lstrip (class=string #args=1) Strip leading whitespace from string.
</pre>


### match
<pre class="pre-non-highlight-non-pair">
match (class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
</pre>


### matchx
<pre class="pre-non-highlight-non-pair">
matchx (class=string #args=2) TODO: WRITE ME
Example:
TODO: WRITE ME
</pre>


### regextract
<pre class="pre-non-highlight-non-pair">
regextract (class=string #args=2) Extracts a substring (the first, if there are multiple matches), matching a regular expression, from the input. Does not use capture groups; see also the =~ operator which does.
Expand Down
20 changes: 19 additions & 1 deletion docs/src/reference-main-regular-expressions.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ name=bull,regex=^b[ou]ll$

## Regex captures

Regex captures of the form `\0` through `\9` are supported as
Regex captures of the form `\0` through `\9` are supported as follows:

* Captures have in-function context for `sub` and `gsub`. For example, the first `\1,\2` pair belong to the first `sub` and the second `\1,\2` pair belong to the second `sub`:

Expand All @@ -77,6 +77,24 @@ Regex captures of the form `\0` through `\9` are supported as
<b>mlr put '$a =~ "(..)_(....); $b = "left_\1"; $c = "right_\2"'</b>
</pre>

* Each user-defined function has its own frame for captures. For example:

<pre class="pre-highlight-non-pair">
<b>mlr -n put '</b>
<b>func f() {</b>
<b> if ("456 defg" =~ "([0-9]+) ([a-z]+)") {</b>
<b> print "INNER: \1 \2";</b>
<b> }</b>
<b>}</b>
<b>end {</b>
<b> if ("123 abc" =~ "([0-9]+) ([a-z]+)") {</b>
<b> print "OUTER PRE: \1 \2";</b>
<b> f();</b>
<b> print "OUTER POST: \1 \2";</b>
<b> }</b>
<b>}'</b>
</pre>

* The captures are not retained across multiple puts. For example, here the `\1,\2` won't be expanded from the regex capture:

<pre class="pre-highlight-non-pair">
Expand Down
20 changes: 19 additions & 1 deletion docs/src/reference-main-regular-expressions.md.in
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ GENMD-EOF

## Regex captures

Regex captures of the form `\0` through `\9` are supported as
Regex captures of the form `\0` through `\9` are supported as follows:

* Captures have in-function context for `sub` and `gsub`. For example, the first `\1,\2` pair belong to the first `sub` and the second `\1,\2` pair belong to the second `sub`:

Expand All @@ -52,6 +52,24 @@ GENMD-SHOW-COMMAND
mlr put '$a =~ "(..)_(....); $b = "left_\1"; $c = "right_\2"'
GENMD-EOF

* Each user-defined function has its own frame for captures. For example:

GENMD-SHOW-COMMAND
mlr -n put '
func f() {
if ("456 defg" =~ "([0-9]+) ([a-z]+)") {
print "INNER: \1 \2";
}
}
end {
if ("123 abc" =~ "([0-9]+) ([a-z]+)") {
print "OUTER PRE: \1 \2";
f();
print "OUTER POST: \1 \2";
}
}'
GENMD-EOF

* The captures are not retained across multiple puts. For example, here the `\1,\2` won't be expanded from the regex capture:

GENMD-SHOW-COMMAND
Expand Down
38 changes: 19 additions & 19 deletions docs/src/reference-verbs.md
Original file line number Diff line number Diff line change
Expand Up @@ -3406,14 +3406,14 @@ fields, optionally categorized by one or more fields.
<b> data/medium</b>
</pre>
<pre class="pre-non-highlight-in-pair">
x_y_cov 0.000042574820827444476
x_y_corr 0.0005042001844467462
y_y_cov 0.08461122467974003
x_y_cov 0.00004257482082749404
x_y_corr 0.0005042001844473328
y_y_cov 0.08461122467974005
y_y_corr 1
x2_xy_cov 0.04188382281779374
x2_xy_corr 0.630174342037994
x2_y2_cov -0.00030953725962542085
x2_y2_corr -0.0034249088761121966
x2_xy_cov 0.041883822817793716
x2_xy_corr 0.6301743420379936
x2_y2_cov -0.0003095372596253918
x2_y2_corr -0.003424908876111875
</pre>

<pre class="pre-highlight-in-pair">
Expand All @@ -3422,12 +3422,12 @@ x2_y2_corr -0.0034249088761121966
<b> data/medium</b>
</pre>
<pre class="pre-non-highlight-in-pair">
a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2
pan 0.01702551273681908 0.5004028922897639 2081 0.00028691820445814767 1 0 2081 1 0.8781320866715662 0.11908230147563566 2081 0.41749827377311266
eks 0.0407804923685586 0.48140207967651016 1965 0.0016461239223448587 1 0 1965 1 0.8978728611690183 0.10734054433612333 1965 0.45563223864254526
wye -0.03915349075204814 0.5255096523974456 1966 0.0015051268704373607 1 0 1966 1 0.8538317334220835 0.1267454301662969 1966 0.38991721818599295
zee 0.0027812364960399147 0.5043070448033061 2047 0.000007751652858786137 1 0 2047 1 0.8524439912011013 0.12401684308018937 2047 0.39356598090006495
hat -0.018620577041095078 0.5179005397264935 1941 0.0003520036646055585 1 0 1941 1 0.8412305086345014 0.13557328318623216 1941 0.3687944261732265
a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2
pan 0.017025512736819345 0.500402892289764 2081 0.00028691820445815624 1 -0.00000000000000002890430283104539 2081 1 0.8781320866715664 0.11908230147563569 2081 0.4174982737731127
eks 0.04078049236855813 0.4814020796765104 1965 0.0016461239223448218 1 0.00000000000000017862676354313703 1965 1 0.897872861169018 0.1073405443361234 1965 0.4556322386425451
wye -0.03915349075204785 0.5255096523974457 1966 0.0015051268704373377 1 0.00000000000000004464425401127647 1966 1 0.8538317334220837 0.1267454301662969 1966 0.3899172181859931
zee 0.0027812364960401333 0.5043070448033061 2047 0.000007751652858787357 1 0.00000000000000004819404567023685 2047 1 0.8524439912011011 0.12401684308018947 2047 0.39356598090006495
hat -0.018620577041095272 0.5179005397264937 1941 0.00035200366460556604 1 -0.00000000000000003400445761787692 1941 1 0.8412305086345017 0.13557328318623207 1941 0.3687944261732266
</pre>

Here's an example simple line-fit. The `x` and `y`
Expand Down Expand Up @@ -3513,11 +3513,11 @@ upsec_count_pca_quality 0.9999590846136102
donesec 92.33051350964094

color purple
upsec_count_pca_m -39.03009744795354
upsec_count_pca_b 979.9883413064914
upsec_count_pca_m -39.030097447953594
upsec_count_pca_b 979.9883413064917
upsec_count_pca_n 21
upsec_count_pca_quality 0.9999908956206317
donesec 25.10852919630297
donesec 25.108529196302943
</pre>

## step
Expand Down Expand Up @@ -3794,9 +3794,9 @@ distinct_count 5 5 10000 10000 10000
mode pan wye 1 0.3467901443380824 0.7268028627434533
sum 0 0 50005000 4986.019681679581 5062.057444929905
mean - - 5000.5 0.49860196816795804 0.5062057444929905
stddev - - 2886.8956799071675 0.2902925151144007 0.290880086426933
var - - 8334166.666666667 0.08426974433144456 0.08461122467974003
skewness - - 0 -0.0006899591185521965 -0.017849760120133784
stddev - - 2886.8956799071675 0.29029251511440074 0.2908800864269331
var - - 8334166.666666667 0.08426974433144457 0.08461122467974005
skewness - - 0 -0.0006899591185517494 -0.01784976012013298
minlen 3 3 1 15 13
maxlen 3 3 5 22 22
min eks eks 1 0.00004509679127584487 0.00008818962627266114
Expand Down
4 changes: 2 additions & 2 deletions docs/src/two-pass-algorithms.md
Original file line number Diff line number Diff line change
Expand Up @@ -598,8 +598,8 @@ hat pan 0.4643355557376876
x_count 10000
x_sum 4986.019681679581
x_mean 0.49860196816795804
x_var 0.08426974433144456
x_stddev 0.2902925151144007
x_var 0.08426974433144457
x_stddev 0.29029251511440074
</pre>

<pre class="pre-highlight-in-pair">
Expand Down
2 changes: 1 addition & 1 deletion pkg/dsl/cst/builtin_functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,7 @@ func (node *RegexCaptureBinaryFunctionCallsiteNode) Evaluate(
node.evaluable1.Evaluate(state),
node.evaluable2.Evaluate(state),
)
state.RegexCaptures = captures
state.SetRegexCaptures(captures)
return output
}

Expand Down
4 changes: 2 additions & 2 deletions pkg/dsl/cst/leaves.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ func (node *StringLiteralNode) Evaluate(
// }
//
// the captures can be set (by =~ or !=~) quite far from where they are used.
// This is why we consult the state.RegexCaptures here, to see if they've been
// This is why we consult the state's regex captures here, to see if they've been
// set on some previous invocation of =~ or !=~.
func (node *RegexCaptureReplacementNode) Evaluate(
state *runtime.State,
Expand All @@ -302,7 +302,7 @@ func (node *RegexCaptureReplacementNode) Evaluate(
lib.InterpolateCaptures(
node.replacementString,
node.replacementCaptureMatrix,
state.RegexCaptures,
state.GetRegexCaptures(),
),
)
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/dsl/cst/udf.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,8 @@ func (site *UDFCallsite) EvaluateWithArguments(
state.Stack.PushStackFrameSet()
defer state.Stack.PopStackFrameSet()
}
state.PushRegexCapturesFrame()
defer state.PopRegexCapturesFrame()

cacheable := !udf.isFunctionLiteral

Expand Down
2 changes: 2 additions & 0 deletions pkg/dsl/cst/uds.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ func (site *UDSCallsite) Execute(state *runtime.State) (*BlockExitPayload, error
// Bind the arguments to the parameters
state.Stack.PushStackFrameSet()
defer state.Stack.PopStackFrameSet()
state.PushRegexCapturesFrame()
defer state.PopRegexCapturesFrame()

for i := range arguments {
err := state.Stack.DefineTypedAtScope(
Expand Down
3 changes: 3 additions & 0 deletions pkg/lib/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,9 @@ func WriteTempFileOrDie(contents string) string {
}

func CopyStringArray(input []string) []string {
if input == nil {
return nil
}
output := make([]string, len(input))
copy(output, input)
return output
Expand Down
Loading

0 comments on commit 4053d76

Please sign in to comment.