From 8cf15a2e28c11cf5680d2ab8d5ff60f045bb4112 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Tue, 4 Feb 2025 20:58:10 -0500 Subject: [PATCH 1/2] Add more detailed explanations to control-flow RegexOpcode values --- .../Text/RegularExpressions/RegexOpcode.cs | 153 +++++++++++++++--- 1 file changed, 134 insertions(+), 19 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs index d6b5998180a5fb..e5098c8dcc4cec 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs @@ -117,54 +117,169 @@ internal enum RegexOpcode UpdateBumpalong = 46, // Primitive control structures - // TODO: Figure out what these comments mean / what these control structures actually do :) - /// back jump straight first. + /// Lazy branch in an alternation or conditional construct. + /// + /// On first execution, the opcode records the current input position (via the tracking stack) and continues straight + /// without taking the jump. When the matching that follows fails, backtracking will occur and the saved position is restored, + /// at which point the interpreter will jump to the alternative branch (using the patched jump offset in operand 0). + /// This opcode is used to implement alternation in a non-greedy (lazy) manner. + /// Lazybranch = 23, - /// back jump branch first for loop. + + /// Branch in a quantified loop that uses a saved mark to decide whether to repeat or exit. + /// + /// When executed, this opcode pops a previously saved input mark (from a Setmark or Nullmark) + /// and compares it to the current input position. 
If the loop's inner expression has consumed input (non-empty match), it + /// pushes updated state (saving the old mark and the current position) and jumps back (via the jump offset in operand 0) + /// to repeat the loop. If no progress has been made (empty match), it records state for backtracking and proceeds. + /// This opcode is used for greedy (non–lazy) quantified loops when no explicit counter is needed. + /// Branchmark = 24, - /// back jump straight first for loop. + + /// Lazy branch in a quantified loop that uses a saved mark. + /// + /// Similar in spirit to Branchmark, this opcode is used for lazy loops. + /// It initially does not jump back to repeat the loop, preferring to let the overall match continue. + /// However, it saves the loop state so that if subsequent matching fails, backtracking will reenter the loop body. + /// Special care is taken to handle empty matches so as to avoid infinite loops. + /// Lazybranchmark = 25, - /// back val set counter, null mark. + + /// Initialize the loop counter for a quantifier when the minimum repetition is zero. + /// + /// For quantified constructs with a minimum of zero (M == 0), this opcode pushes a counter + /// value (–1) along with a marker (implicitly indicating no match so far) onto the grouping stack. The operand (always 0 + /// in this case) is used in later comparisons within a Branchcount or Lazybranchcount opcode. + /// Nullcount = 26, - /// back val set counter, make mark + + /// Initialize the loop counter for a quantifier with a positive minimum. + /// + /// When the quantifier requires at least one match (M > 0), this opcode pushes the current input position as a marker and a + /// counter value computed as (1 – M) onto the grouping stack. This counter will be adjusted in subsequent loop iterations + /// (via Branchcount or Lazybranchcount) to decide whether the loop should continue. + /// Setcount = 27, - /// back jump,limit branch++ if zero<=c<limit. + + /// Greedy counted branch for quantified loops. 
+ /// + /// This opcode is used for quantified loops that require a counter. When executed, it pops the previously stored marker and counter + /// from the grouping stack, computes the difference between the current input position and the marker, and compares the counter + /// against a limit (given in operand 1). If the counter indicates that more iterations are allowed (and the inner expression consumed + /// input), it increments the counter, updates the marker with the new position, and jumps (via the jump offset in operand 0) to + /// repeat the loop. Otherwise, the interpreter continues straight. On backtracking, the previous state is restored so that a decreased + /// count may be tried. + /// Branchcount = 28, - /// back jump,limit same, but straight first. + + /// Lazy counted branch for quantified loops. + /// + /// This opcode is the lazy counterpart to Branchcount. It is used in quantified loops that use a counter and prefer + /// to exit the loop as early as possible. On initial execution it will choose the straight path (i.e. not repeating the loop) if + /// the counter is nonnegative, but if the inner expression consumed input and the counter is below the maximum (given in operand 1), + /// it will reenter the loop on backtracking. + /// Lazybranchcount = 29, - /// back save position. + + /// Push a null marker into the grouping stack for quantifiers with a minimum of zero when no explicit counter is needed. + /// + /// This opcode is similar to Nullcount but is used in cases where the quantified construct does not require counting; + /// it pushes a marker value (–1) onto the grouping stack to record the starting position. On backtracking, the marker is simply removed. + /// Nullmark = 30, - /// back save position. + + /// Push the current input position onto the grouping stack. 
+ /// + /// Used by grouping constructs (for capturing or to detect empty matches in loops), this opcode saves the current input position + /// so that later the interpreter can compare it to the current position to decide whether progress was made. It is the non–counting + /// counterpart to Setcount. + /// Setmark = 31, - /// back group define group. + + /// Completes a capturing group. + /// + /// When executed, this opcode pops a previously saved marker (the start position of the group) from the grouping stack and uses the + /// current input position as the end position. Operand 0 specifies the capture slot number. If operand 1 is not –1 then a prior capture + /// must have been made and a transfer of capture is performed. On backtracking, the capture is undone. + /// Capturemark = 32, - /// back recall position. + + /// Recall a previously saved marker. + /// + /// This opcode restores the input position from a marker saved on the grouping stack (typically via a Setmark or + /// Nullmark). It is used in lookaround constructs to revert the input position to the point where the lookaround began. + /// On backtracking, the marker is re–pushed onto the grouping stack. + /// Getmark = 33, - /// back save backtrack state. + + /// Mark the beginning of a non–backtracking / atomic region. + /// + /// This opcode is used at the start of constructs that must not be re–entered on backtracking (such as lookahead/lookbehind or atomic groups). + /// It saves the current backtracking state (including the current tracking and crawl positions) onto the grouping stack. + /// When the region is later exited (by Forejump) the saved state is used to prevent further backtracking into the region. + /// Setjump = 34, - /// zap back to saved state. + + /// Restore state for a non–backtracking / atomic region on backtracking. 
+ /// + /// Used in negative lookaround constructs, this opcode pops the saved backtracking and capture state (stored by a prior Setjump) + /// and erases any changes made within the non–backtracking region. It thereby restores the state to what it was before entering the region. + /// Backjump = 35, - /// zap backtracking state. + + /// Finalize a non–backtracking / atomic region. + /// + /// This opcode is used at the end of lookaround or atomic group constructs to commit to the current matching path. + /// It pops the saved state from the grouping stack (stored by Setjump), updates the tracking pointer (thereby + /// discarding any backtracking state from within the region), and then continues execution. On backtracking from such a region, + /// a variant of this opcode will undo any captures made. + /// Forejump = 36, - /// Backtrack if ref undefined. + + /// Test whether a particular backreference has already matched. + /// + /// Operand 0 is the capture group number to test. When executed, if the specified group has not captured any text, + /// the match fails and control transfers to backtracking. Otherwise, execution continues. This opcode is used in conditional + /// constructs where a branch is taken only if a given capture exists. + /// TestBackreference = 37, - /// jump just go. + + /// Unconditional jump. + /// + /// Operand 0 holds the target offset. When executed, the interpreter jumps unconditionally to that location. + /// This opcode is used to implement control flow for alternation and loop constructs. + /// Goto = 38, - /// done! + + /// Halt the interpreter. + /// + /// This opcode marks the end of the opcode stream. When reached, the matching process terminates and the result + /// (whether a match was found) is returned. + /// Stop = 40, // Modifiers for alternate modes /// Mask to get unmodified ordinary operator. OperatorMask = 63, + /// Indicates that we're reverse scanning. RightToLeft = 64, + /// Indicates that we're backtracking. 
Backtracking = 128, + /// Indicates that we're backtracking on a second branch. + /// + /// In patterns with alternations or complex quantifiers, multiple backtracking paths may be available. + /// This flag marks opcodes that are being processed on an alternate (or secondary) branch during backtracking, + /// as opposed to the primary branch. The interpreter uses this flag to apply specialized state restoration + /// or branch–selection logic when reverting from one branch to another. + /// BacktrackingSecond = 256, - /// Indicates that we're case-insensitive + + /// Indicates that we're case-insensitive. CaseInsensitive = 512, } } From df14d4ee1348393b69cd26834fa31c3622b0b56d Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Wed, 5 Feb 2025 17:28:52 -0500 Subject: [PATCH 2/2] Address feedback --- .../Text/RegularExpressions/RegexOpcode.cs | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs index e5098c8dcc4cec..5f39d5b53a3bb2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs @@ -133,7 +133,7 @@ internal enum RegexOpcode /// and compares it to the current input position. If the loop's inner expression has consumed input (non-empty match), it /// pushes updated state (saving the old mark and the current position) and jumps back (via the jump offset in operand 0) /// to repeat the loop. If no progress has been made (empty match), it records state for backtracking and proceeds. - /// This opcode is used for greedy (non–lazy) quantified loops when no explicit counter is needed. + /// This opcode is used for greedy (non-lazy) quantified loops when no explicit counter is needed. 
/// Branchmark = 24, @@ -141,7 +141,7 @@ internal enum RegexOpcode /// /// Similar in spirit to Branchmark, this opcode is used for lazy loops. /// It initially does not jump back to repeat the loop, preferring to let the overall match continue. - /// However, it saves the loop state so that if subsequent matching fails, backtracking will reenter the loop body. + /// However, it saves the loop state so that if subsequent matching fails, backtracking will re-enter the loop body. /// Special care is taken to handle empty matches so as to avoid infinite loops. /// Lazybranchmark = 25, @@ -149,7 +149,7 @@ internal enum RegexOpcode /// Initialize the loop counter for a quantifier when the minimum repetition is zero. /// /// For quantified constructs with a minimum of zero (M == 0), this opcode pushes a counter - /// value (–1) along with a marker (implicitly indicating no match so far) onto the grouping stack. The operand (always 0 + /// value (-1) along with a marker (implicitly indicating no match so far) onto the grouping stack. The operand (always 0 /// in this case) is used in later comparisons within a Branchcount or Lazybranchcount opcode. /// Nullcount = 26, @@ -157,7 +157,7 @@ internal enum RegexOpcode /// Initialize the loop counter for a quantifier with a positive minimum. /// /// When the quantifier requires at least one match (M > 0), this opcode pushes the current input position as a marker and a - /// counter value computed as (1 – M) onto the grouping stack. This counter will be adjusted in subsequent loop iterations + /// counter value computed as (1 - M) onto the grouping stack. This counter will be adjusted in subsequent loop iterations /// (via Branchcount or Lazybranchcount) to decide whether the loop should continue. /// Setcount = 27, @@ -178,21 +178,21 @@ internal enum RegexOpcode /// This opcode is the lazy counterpart to Branchcount. It is used in quantified loops that use a counter and prefer /// to exit the loop as early as possible. On initial execution it will choose the straight path (i.e. 
not repeating the loop) if /// the counter is nonnegative, but if the inner expression consumed input and the counter is below the maximum (given in operand 1), - /// it will reenter the loop on backtracking. + /// it will re-enter the loop on backtracking. /// Lazybranchcount = 29, /// Push a null marker into the grouping stack for quantifiers with a minimum of zero when no explicit counter is needed. /// /// This opcode is similar to Nullcount but is used in cases where the quantified construct does not require counting; - /// it pushes a marker value (–1) onto the grouping stack to record the starting position. On backtracking, the marker is simply removed. + /// it pushes a marker value (-1) onto the grouping stack to record the starting position. On backtracking, the marker is simply removed. /// Nullmark = 30, /// Push the current input position onto the grouping stack. /// /// Used by grouping constructs (for capturing or to detect empty matches in loops), this opcode saves the current input position - /// so that later the interpreter can compare it to the current position to decide whether progress was made. It is the non–counting + /// so that later the interpreter can compare it to the current position to decide whether progress was made. It is the non-counting /// counterpart to Setcount. /// Setmark = 31, /// Completes a capturing group. /// /// When executed, this opcode pops a previously saved marker (the start position of the group) from the grouping stack and uses the - /// current input position as the end position. Operand 0 specifies the capture slot number. If operand 1 is not –1 then a prior capture + /// current input position as the end position. Operand 0 specifies the capture slot number. If operand 1 is not -1 then a prior capture /// must have been made and a transfer of capture is performed. On backtracking, the capture is undone. 
/// Capturemark = 32, @@ -209,26 +209,26 @@ internal enum RegexOpcode /// /// This opcode restores the input position from a marker saved on the grouping stack (typically via a Setmark or /// Nullmark). It is used in lookaround constructs to revert the input position to the point where the lookaround began. - /// On backtracking, the marker is re–pushed onto the grouping stack. + /// On backtracking, the marker is re-pushed onto the grouping stack. /// Getmark = 33, - /// Mark the beginning of a non–backtracking / atomic region. + /// Mark the beginning of a non-backtracking / atomic region. /// - /// This opcode is used at the start of constructs that must not be re–entered on backtracking (such as lookahead/lookbehind or atomic groups). + /// This opcode is used at the start of constructs that must not be re-entered on backtracking (such as lookahead/lookbehind or atomic groups). /// It saves the current backtracking state (including the current tracking and crawl positions) onto the grouping stack. /// When the region is later exited (by Forejump) the saved state is used to prevent further backtracking into the region. /// Setjump = 34, - /// Restore state for a non–backtracking / atomic region on backtracking. + /// Restore state for a non-backtracking / atomic region on backtracking. /// /// Used in negative lookaround constructs, this opcode pops the saved backtracking and capture state (stored by a prior Setjump) - /// and erases any changes made within the non–backtracking region. It thereby restores the state to what it was before entering the region. + /// and erases any changes made within the non-backtracking region. It thereby restores the state to what it was before entering the region. /// Backjump = 35, - /// Finalize a non–backtracking / atomic region. + /// Finalize a non-backtracking / atomic region. /// /// This opcode is used at the end of lookaround or atomic group constructs to commit to the current matching path. 
/// It pops the saved state from the grouping stack (stored by Setjump), updates the tracking pointer (thereby @@ -275,7 +275,7 @@ internal enum RegexOpcode /// In patterns with alternations or complex quantifiers, multiple backtracking paths may be available. /// This flag marks opcodes that are being processed on an alternate (or secondary) branch during backtracking, /// as opposed to the primary branch. The interpreter uses this flag to apply specialized state restoration - /// or branch–selection logic when reverting from one branch to another. + /// or branch-selection logic when reverting from one branch to another. /// BacktrackingSecond = 256,