diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs index d6b5998180a5fb..5f39d5b53a3bb2 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexOpcode.cs @@ -117,54 +117,169 @@ internal enum RegexOpcode UpdateBumpalong = 46, // Primitive control structures - // TODO: Figure out what these comments mean / what these control structures actually do :) - /// back jump straight first. + /// Lazy branch in an alternation or conditional construct. + /// + /// On first execution, the opcode records the current input position (via the tracking stack) and continues straight + /// without taking the jump. When the matching that follows fails, backtracking will occur and the saved position is restored, + /// at which point the interpreter will jump to the alternative branch (using the patched jump offset in operand 0). + /// This opcode is used to implement alternation in a non-greedy (lazy) manner. + /// Lazybranch = 23, - /// back jump branch first for loop. + + /// Branch in a quantified loop that uses a saved mark to decide whether to repeat or exit. + /// + /// When executed, this opcode pops a previously saved input mark (from a <see cref="Setmark"/> or <see cref="Nullmark"/>) + /// and compares it to the current input position. If the loop's inner expression has consumed input (non-empty match), it + /// pushes updated state (saving the old mark and the current position) and jumps back (via the jump offset in operand 0) + /// to repeat the loop. If no progress has been made (empty match), it records state for backtracking and proceeds. + /// This opcode is used for greedy (non-lazy) quantified loops when no explicit counter is needed. + /// Branchmark = 24, - /// back jump straight first for loop.
+ + /// Lazy branch in a quantified loop that uses a saved mark. + /// + /// Similar in spirit to <see cref="Branchmark"/>, this opcode is used for lazy loops. + /// It initially does not jump back to repeat the loop, preferring to let the overall match continue. + /// However, it saves the loop state so that if subsequent matching fails, backtracking will re-enter the loop body. + /// Special care is taken to handle empty matches so as to avoid infinite loops. + /// Lazybranchmark = 25, - /// back val set counter, null mark. + + /// Initialize the loop counter for a quantifier when the minimum repetition is zero. + /// + /// For quantified constructs with a minimum of zero (M == 0), this opcode pushes a counter + /// value (-1) along with a marker (implicitly indicating no match so far) onto the grouping stack. The operand (always 0 + /// in this case) is used in later comparisons within a <see cref="Branchcount"/> or <see cref="Lazybranchcount"/> opcode. + /// Nullcount = 26, - /// back val set counter, make mark + + /// Initialize the loop counter for a quantifier with a positive minimum. + /// + /// When the quantifier requires at least one match (M > 0), this opcode pushes the current input position as a marker and a + /// counter value computed as (1 - M) onto the grouping stack. This counter will be adjusted in subsequent loop iterations + /// (via <see cref="Branchcount"/> or <see cref="Lazybranchcount"/>) to decide whether the loop should continue. + /// Setcount = 27, - /// back jump,limit branch++ if zero<=c<limit. + + /// Greedy counted branch for quantified loops. + /// + /// This opcode is used for quantified loops that require a counter. When executed, it pops the previously stored marker and counter + /// from the grouping stack, computes the difference between the current input position and the marker, and compares the counter + /// against a limit (given in operand 1).
If the counter indicates that more iterations are allowed (and the inner expression consumed + /// input), it increments the counter, updates the marker with the new position, and jumps (via the jump offset in operand 0) to + /// repeat the loop. Otherwise, the interpreter continues straight. On backtracking, the previous state is restored so that a decreased + /// count may be tried. + /// Branchcount = 28, - /// back jump,limit same, but straight first. + + /// Lazy counted branch for quantified loops. + /// + /// This opcode is the lazy counterpart to <see cref="Branchcount"/>. It is used in quantified loops that use a counter and prefer + /// to exit the loop as early as possible. On initial execution it will choose the straight path (i.e. not repeating the loop) if + /// the counter is nonnegative, but if the inner expression consumed input and the counter is below the maximum (given in operand 1), + /// it will re-enter the loop on backtracking. + /// Lazybranchcount = 29, - /// back save position. + + /// Push a null marker into the grouping stack for quantifiers with a minimum of zero when no explicit counter is needed. + /// + /// This opcode is similar to <see cref="Nullcount"/> but is used in cases where the quantified construct does not require counting; + /// it pushes a marker value (-1) onto the grouping stack to record the starting position. On backtracking, the marker is simply removed. + /// Nullmark = 30, - /// back save position. + + /// Push the current input position onto the grouping stack. + /// + /// Used by grouping constructs (for capturing or to detect empty matches in loops), this opcode saves the current input position + /// so that later the interpreter can compare it to the current position to decide whether progress was made. It is the non-counting + /// counterpart to <see cref="Setcount"/>. + /// Setmark = 31, - /// back group define group. + + /// Completes a capturing group.
+ /// + /// When executed, this opcode pops a previously saved marker (the start position of the group) from the grouping stack and uses the + /// current input position as the end position. Operand 0 specifies the capture slot number. If operand 1 is not -1 then a prior capture + /// must have been made and a transfer of capture is performed. On backtracking, the capture is undone. + /// Capturemark = 32, - /// back recall position. + + /// Recall a previously saved marker. + /// + /// This opcode restores the input position from a marker saved on the grouping stack (typically via a <see cref="Setmark"/> or + /// <see cref="Nullmark"/>). It is used in lookaround constructs to revert the input position to the point where the lookaround began. + /// On backtracking, the marker is re-pushed onto the grouping stack. + /// Getmark = 33, - /// back save backtrack state. + + /// Mark the beginning of a non-backtracking / atomic region. + /// + /// This opcode is used at the start of constructs that must not be re-entered on backtracking (such as lookahead/lookbehind or atomic groups). + /// It saves the current backtracking state (including the current tracking and crawl positions) onto the grouping stack. + /// When the region is later exited (by <see cref="Forejump"/>) the saved state is used to prevent further backtracking into the region. + /// Setjump = 34, - /// zap back to saved state. + + /// Restore state for a non-backtracking / atomic region on backtracking. + /// + /// Used in negative lookaround constructs, this opcode pops the saved backtracking and capture state (stored by a prior <see cref="Setjump"/>) + /// and erases any changes made within the non-backtracking region. It thereby restores the state to what it was before entering the region. + /// Backjump = 35, - /// zap backtracking state. + + /// Finalize a non-backtracking / atomic region. + /// + /// This opcode is used at the end of lookaround or atomic group constructs to commit to the current matching path.
+ /// It pops the saved state from the grouping stack (stored by <see cref="Setjump"/>), updates the tracking pointer (thereby + /// discarding any backtracking state from within the region), and then continues execution. On backtracking from such a region, + /// a variant of this opcode will undo any captures made. + /// Forejump = 36, - /// Backtrack if ref undefined. + + /// Test whether a particular backreference has already matched. + /// + /// Operand 0 is the capture group number to test. When executed, if the specified group has not captured any text, + /// the match fails and control transfers to backtracking. Otherwise, execution continues. This opcode is used in conditional + /// constructs where a branch is taken only if a given capture exists. + /// TestBackreference = 37, - /// jump just go. + + /// Unconditional jump. + /// + /// Operand 0 holds the target offset. When executed, the interpreter jumps unconditionally to that location. + /// This opcode is used to implement control flow for alternation and loop constructs. + /// Goto = 38, - /// done! + + /// Halt the interpreter. + /// + /// This opcode marks the end of the opcode stream. When reached, the matching process terminates and the result + /// (whether a match was found) is returned. + /// Stop = 40, // Modifiers for alternate modes /// Mask to get unmodified ordinary operator. OperatorMask = 63, + /// Indicates that we're reverse scanning. RightToLeft = 64, + /// Indicates that we're backtracking. Backtracking = 128, + /// Indicates that we're backtracking on a second branch. + /// + /// In patterns with alternations or complex quantifiers, multiple backtracking paths may be available. + /// This flag marks opcodes that are being processed on an alternate (or secondary) branch during backtracking, + /// as opposed to the primary branch. The interpreter uses this flag to apply specialized state restoration + /// or branch-selection logic when reverting from one branch to another.
+ /// BacktrackingSecond = 256, - /// Indicates that we're case-insensitive + + /// Indicates that we're case-insensitive. CaseInsensitive = 512, } }