Skip to content

Commit

Permalink
vm: use contiguous register storage (nim-works#1179)
Browse files Browse the repository at this point in the history
## Summary

This is an internal-only change. The VM implementation now uses a
register sequence per *thread* rather than per *stack frame*, reducing
the overhead of entering and exiting a procedure for code running in
the VM.

## Details

### VM interface

* allocating the initial stack frame and registers is now part of
  `initVmThread`
* `compilerbridge` is adjusted accordingly; `execute` now takes a
  `VmThread`, instead of a `TStackFrame`, as input so that the
  callsites can still initialize the parameter registers

### VM architecture

* `VmThread` stores the `seq[TFullReg]` with all registers
* instead of their own `seq[TFullReg]`, each stack frame
  (`TStackFrame`) remembers the index where its register slice begins
* the register sequence is a stack: invoking a procedure grows it,
  while leaving a procedure shrinks it again
* the `.cursor`-based mechanism for keeping a low-overhead reference to
  the current frame's register slice is replaced with `Registers`,
  which is a length + `ptr UncheckedArray` pair (an unsafe first-class
  `openArray`, effectively)
* instead of raising an `IndexDefect`, out-of-bounds register access
  is reported via the `VmEvent` facility. This means that
  out-of-bounds register access resulting from ill-formed bytecode no
  longer crashes the VM

Using a single sequence per thread means that there's no more `seq`
allocation/destruction overhead per procedure call, which also means
less pressure on the host's allocator.

### Misc

* a leftover `debugEcho` from some earlier change in `packed_env` is
  removed

---------

Co-authored-by: Saem Ghani <[email protected]>
  • Loading branch information
zerbina and saem authored Feb 11, 2024
1 parent c604a5d commit bd28145
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 69 deletions.
39 changes: 16 additions & 23 deletions compiler/vm/compilerbridge.nim
Original file line number Diff line number Diff line change
Expand Up @@ -271,14 +271,13 @@ proc createLegacyStackTrace(
location: some source(c, thread),
reportInst: toReportLineInfo(instLoc))

proc execute(jit: var JitState, c: var TCtx, start: int, frame: sink TStackFrame;
proc execute(jit: var JitState, c: var TCtx, thread: sink VmThread,
cb: proc(c: TCtx, r: TFullReg): PNode
): ExecutionResult {.inline.} =
## This is the entry point for invoking the VM to execute code at
## compile-time. The `cb` callback is used to deserialize the result stored
## as VM data into ``PNode`` AST, and is invoked with the register that
## holds the result
var thread = initVmThread(c, start, frame)

# run the VM until either no code is left to execute or an event implying
# execution can't go on occurs
Expand All @@ -291,7 +290,7 @@ proc execute(jit: var JitState, c: var TCtx, start: int, frame: sink TStackFrame
"non-static stmt evaluation must produce a value, mode: " & $c.mode
let reg =
if r.reg.isSome:
thread[0].slots[r.reg.get]
thread.regs[r.reg.get]
else:
TFullReg(kind: rkNone)
result.initSuccess cb(c, reg)
Expand Down Expand Up @@ -340,9 +339,8 @@ proc execute(jit: var JitState, c: var TCtx, start: int, frame: sink TStackFrame
dispose(c, thread)

proc execute(jit: var JitState, c: var TCtx, info: CodeInfo): ExecutionResult =
var tos = TStackFrame(prc: nil, comesFrom: 0)
tos.slots.newSeq(info.regCount)
execute(jit, c, info.start, tos,
let thread = initVmThread(c, info.start, info.regCount, nil)
execute(jit, c, thread,
proc(c: TCtx, r: TFullReg): PNode = c.graph.emptyNode)

template returnOnErr(res: VmGenResult, config: ConfigRef, node: PNode): CodeInfo =
Expand Down Expand Up @@ -476,16 +474,14 @@ proc eval(jit: var JitState, c: var TCtx; prc: PSym, n: PNode): PNode =

logBytecode(c, prc, start)

var tos = TStackFrame(prc: prc, comesFrom: 0)
tos.slots.newSeq(regCount)
#for i in 0..<regCount: tos.slots[i] = newNode(nkEmpty)
let cb =
if requiresValue:
mkCallback(c, r): c.regToNode(r, n.typ, n.info)
else:
mkCallback(c, r): newNodeI(nkEmpty, n.info)

result = execute(jit, c, start, tos, cb).unpackResult(c.config, n)
let thread = initVmThread(c, start, regCount, prc)
result = execute(jit, c, thread, cb).unpackResult(c.config, n)

proc evalConstExprAux(module: PSym, idgen: IdGenerator, g: ModuleGraph,
prc: PSym, n: PNode,
Expand Down Expand Up @@ -560,15 +556,13 @@ proc evalMacroCall*(jit: var JitState, c: var TCtx, call, args: PNode,
if not wasAvailable:
logBytecode(c, sym, start)

var tos = TStackFrame(prc: sym, comesFrom: 0)
tos.slots.newSeq(regCount)

var thread = initVmThread(c, start, regCount, sym)
# return value:
tos.slots[0] = TFullReg(kind: rkNimNode, nimNode: newNodeI(nkEmpty, call.info))
thread.regs[0] = TFullReg(kind: rkNimNode, nimNode: newNodeI(nkEmpty, call.info))

# put the normal arguments into registers
for i in 1..<sym.typ.len:
setupMacroParam(tos.slots[i], jit, c, args[i - 1], sym.typ[i])
setupMacroParam(thread.regs[i], jit, c, args[i - 1], sym.typ[i])

# put the generic arguments into registers
let gp = sym.ast[genericParamsPos]
Expand All @@ -577,10 +571,10 @@ proc evalMacroCall*(jit: var JitState, c: var TCtx, call, args: PNode,
# signature
if tfImplicitTypeParam notin gp[i].sym.typ.flags:
let idx = sym.typ.len + i
setupMacroParam(tos.slots[idx], jit, c, args[idx - 1], gp[i].sym.typ)
setupMacroParam(thread.regs[idx], jit, c, args[idx - 1], gp[i].sym.typ)

let cb = mkCallback(c, r): r.nimNode
result = execute(jit, c, start, tos, cb).unpackResult(c.config, call)
result = execute(jit, c, thread, cb).unpackResult(c.config, call)

if result.kind != nkError and cyclicTree(result):
result = c.config.newError(call, PAstDiag(kind: adCyclicTree))
Expand Down Expand Up @@ -633,17 +627,16 @@ proc execProc*(jit: var JitState, c: var TCtx; sym: PSym;
return nil
r.unsafeGet

var tos = TStackFrame(prc: sym, comesFrom: 0)
tos.slots.newSeq(maxSlots)
var thread = initVmThread(c, start, maxSlots, sym)

# setup parameters:
if not isEmptyType(sym.typ[0]) or sym.kind == skMacro:
let typ = c.getOrCreate(sym.typ[0])
if not tos.slots[0].loadEmptyReg(typ, sym.info, c.memory):
tos.slots[0].initLocReg(typ, c.memory)
if not thread.regs[0].loadEmptyReg(typ, sym.info, c.memory):
thread.regs[0].initLocReg(typ, c.memory)
# XXX We could perform some type checking here.
for i in 1..<sym.typ.len:
putIntoReg(tos.slots[i], jit, c, args[i-1], sym.typ[i])
putIntoReg(thread.regs[i], jit, c, args[i-1], sym.typ[i])

let cb =
if not isEmptyType(sym.typ[0]):
Expand All @@ -654,7 +647,7 @@ proc execProc*(jit: var JitState, c: var TCtx; sym: PSym;
else:
mkCallback(c, r): newNodeI(nkEmpty, sym.info)

let r = execute(jit, c, start, tos, cb)
let r = execute(jit, c, thread, cb)
result = r.unpackResult(c.config, c.graph.emptyNode)
reportIfError(c.config, result)
if result.isError:
Expand Down
3 changes: 0 additions & 3 deletions compiler/vm/packed_env.nim
Original file line number Diff line number Diff line change
Expand Up @@ -418,12 +418,9 @@ func storeDataNode(enc: var DataEncoder, e: var PackedEnv,
else:
unreachable(t[n].kind)

import compiler/mir/utils

func storeData*(enc: var DataEncoder, e: var PackedEnv, tree: MirTree): int =
## Packs the MIR constant expression `tree` and puts it into `e`. Returns
## the index of the top data node.
debugEcho treeRepr(tree)
result = enc.i
e.nodes.growBy(1)
storeDataNode(enc, e, tree, NodePosition 0)
Expand Down
105 changes: 70 additions & 35 deletions compiler/vm/vm.nim
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ type
## includes things like the program counter, stack frames, active
## exception, etc.
pc: int ## the program counter. Points to the instruction to execute next
regs*: seq[TFullReg]
## all registers belonging to the thread. Each frame owns a slice of it
sframes: seq[TStackFrame] ## stack frames
loopIterations: int
## the number of remaining jumps backwards
Expand Down Expand Up @@ -140,6 +142,12 @@ type
of yrkEcho:
strs*: seq[string] ## strings to be echo'd, at least one item

Registers = object
## A view into the thread's register list. Implements manual index
## checks, so it's a bit less unsafe than a raw pointer-to-unchecked-array.
len: int
data: ptr UncheckedArray[TFullReg]

const
traceCode = defined(nimVMDebugExecute)
fromEhBit = cast[BiggestInt](0x8000_0000_0000_0000'u64)
Expand Down Expand Up @@ -192,6 +200,33 @@ proc createStackTrace*(

assert result.stacktrace.len() <= recursionLimit # post condition check

func initFrom(regs: seq[TFullReg], start: int): Registers =
  ## Creates a register list view covering `start..regs.high`.
  ##
  ## The view holds a raw pointer to the element at `start`, so it has to be
  ## re-created whenever `regs` is resized.
  result = Registers(len: regs.len - start)
  # only take the address when `start` names an existing element. For an
  # empty slice (`start == regs.len`), indexing `regs[start]` would be out of
  # bounds; `data` stays nil in that case, and since `len` is not positive,
  # no index passes `regIndexCheck`
  if start < regs.len:
    result.data = cast[ptr UncheckedArray[TFullReg]](addr regs[start])

func regIndexCheck(r: Registers, i: int) {.inline.} =
  ## Reports an internal VM error (`vmEvtErrInternal`) when `i` is not a
  ## valid index into the register view `r`.
  # XXX: validating the register indices of all instructions in a procedure
  #      once, at VM startup, would be a lot more efficient than verifying
  #      every single register access
  if unlikely(i notin 0..<r.len):
    raiseVmError(VmEvent(kind: vmEvtErrInternal, msg: "illegal register access"))

template `[]`(r: Registers, i: SomeInteger): TFullReg =
  ## Bounds-checked access to the `i`-th register of the view `r`. An
  ## out-of-range index is reported through `regIndexCheck`.
  # capture the index once and use the captured value for both the check and
  # the access; a template argument is re-evaluated at every use
  let x = i
  regIndexCheck(r, x)
  r.data[x]

template `[]=`(r: Registers, i: SomeInteger, val: TFullReg) =
  ## Bounds-checked assignment of `val` to the `i`-th register of the view
  ## `r`. An out-of-range index is reported through `regIndexCheck`.
  # capture the index once and use the captured value for both the check and
  # the store; a template argument is re-evaluated at every use
  let x = i
  regIndexCheck(r, x)
  r.data[x] = val

func getReg(t: var VmThread, i: int): var TFullReg {.inline.} =
  ## Shortcut for accessing the `i`-th register belonging to the topmost
  ## stack frame.
  # a frame's registers start at `start` in the thread-wide register sequence
  let base = t.sframes[^1].start
  t.regs[base + i]

func setNodeValue(dest: LocHandle, node: PNode) =
assert dest.typ.kind == akPNode
Expand Down Expand Up @@ -282,15 +317,16 @@ func cleanUpReg(r: var TFullReg, mm: var VmMemoryManager) =
resetLocation(mm, r.handle.byteView(), r.handle.typ)
mm.allocator.dealloc(r.handle)

proc cleanUpLocations(mm: var VmMemoryManager, frame: var TStackFrame) =
## Cleans up and deallocates all locations belonging to `frame`. Registers
## are left in an invalid state, as this function is meant to be called
## prior to leaving a frame
for s in frame.slots.items:
proc cleanUpLocations(mm: var VmMemoryManager, regs: var seq[TFullReg],
start: int) =
## Cleans up and frees all registers beyond and including `start`.
for s in regs.toOpenArray(start, regs.high):
if s.kind == rkLocation:
mm.resetLocation(s.handle.byteView(), s.handle.typ)
mm.allocator.dealloc(s.handle)

regs.setLen(start)

func cleanUpPending(mm: var VmMemoryManager) =
## Cleans up all managed ref-counted locations marked for clean-up.
# process the list back-to-front, reducing the amount of seq resizing when
Expand Down Expand Up @@ -572,7 +608,7 @@ proc runEh(t: var VmThread, c: var TCtx): Result[PrgCtr, VmException] =
# ``finally``
let instr = c.code[instr.b]
vmAssert instr.opcode == opcFinallyEnd
let (fromEh, b) = decodeControl(t.sframes[^1].slots[instr.regA].intVal)
let (fromEh, b) = decodeControl(t.getReg(instr.regA).intVal)
if fromEh:
vmAssert b.int == t.ehStack.high - 1
swap(tos, t.ehStack[^2])
Expand All @@ -597,9 +633,9 @@ proc resumeEh(c: var TCtx, t: var VmThread,
if r.isOk:
# an exception handler or finalizer is entered. Unwind to the target
# frame:
for j in (frame+1)..<t.sframes.len:
cleanUpLocations(c.memory, t.sframes[j])
t.sframes.setLen(frame + 1)
if frame < t.sframes.len - 1:
cleanUpLocations(c.memory, t.regs, t.sframes[frame+1].start)
t.sframes.setLen(frame + 1)
# return control to the VM:
return r
elif frame == 0:
Expand Down Expand Up @@ -639,7 +675,7 @@ proc handle(res: sink Result[PrgCtr, VmException], c: var TCtx,
if c.code[result].opcode == opcFinally:
# setup the finally section's control register
let reg = c.code[result].regA
t.sframes[^1].slots[reg].initIntReg(fromEhBit or t.ehStack.high, c.memory)
t.getReg(reg).initIntReg(fromEhBit or t.ehStack.high, c.memory)
inc result

else:
Expand Down Expand Up @@ -846,18 +882,11 @@ proc rawExecute(c: var TCtx, t: var VmThread, pc: var int): YieldReason =
## instruction. If the loop exits without errors, `pc` points to the last
## executed instruction.

when defined(gcArc) or defined(gcOrc):
# Use {.cursor.} as a way to get a shallow copy of the seq. This is safe,
# since `slots` is never changed in length (no add/delete)
var regs {.cursor.}: seq[TFullReg]
template updateRegsAlias =
regs = t.sframes[^1].slots
updateRegsAlias
else:
var regs: seq[TFullReg] # alias to tos.slots for performance
template updateRegsAlias =
shallowCopy(regs, t.sframes[^1].slots)
updateRegsAlias
var regs: Registers
## view into current active frame's register slice
template updateRegsAlias =
regs = initFrom(t.regs, t.sframes[^1].start)
updateRegsAlias()

# alias templates to shorten common expressions:
template tos: untyped =
Expand All @@ -869,7 +898,7 @@ proc rawExecute(c: var TCtx, t: var VmThread, pc: var int): YieldReason =
updateRegsAlias()

template popFrame() =
cleanUpLocations(c.memory, t.sframes[tos])
cleanUpLocations(c.memory, t.regs, t.sframes[tos].start)
t.sframes.setLen(t.sframes.len - 1)

updateRegsAlias()
Expand Down Expand Up @@ -935,7 +964,7 @@ proc rawExecute(c: var TCtx, t: var VmThread, pc: var int): YieldReason =
# value or the destination handle) to the destination register on the
# caller's frame
let i = c.code[pc].regA
t.sframes[tos - 1].slots[i] = move regs[0]
t.regs[t.sframes[tos - 1].start + i] = move regs[0]

popFrame()
of opcYldYoid: assert false
Expand Down Expand Up @@ -1977,7 +2006,7 @@ proc rawExecute(c: var TCtx, t: var VmThread, pc: var int): YieldReason =
checkHandle(regs[i])

c.callbacks[entry.cbOffset](
VmArgs(ra: ra, rb: rb, rc: rc, slots: cast[ptr UncheckedArray[TFullReg]](addr regs[0]),
VmArgs(ra: ra, rb: rb, rc: rc, slots: regs.data,
currentExceptionPtr: addr t.currentException,
currentLineInfo: c.debug[pc],
typeCache: addr c.typeInfoCache,
Expand All @@ -2003,19 +2032,22 @@ proc rawExecute(c: var TCtx, t: var VmThread, pc: var int): YieldReason =
# logic as for loops:
if newPc < pc: handleJmpBack()
#echo "new pc ", newPc, " calling: ", prc.name.s
var newFrame = TStackFrame(prc: prc, comesFrom: pc)
newFrame.slots.newSeq(regCount)
let start = t.regs.len
var newFrame = TStackFrame(prc: prc, comesFrom: pc, start: start)
# make space for the registers and refresh the view:
t.regs.setLen(start + regCount)
updateRegsAlias()
if instr.opcode == opcIndCallAsgn:
# the destination might be a temporary complex location (`ra` is an
# ``rkLocation`` register then). While we could use
# ``fastAsgnComplex`` like we do with the arguments, it would mean
# that each result access is subjected to access checks. That's
# inefficient, so we *move* (destructive) the register's content for
# the duration of the call and move it back when the call returns
newFrame.slots[0] = move regs[ra]
t.regs[start + 0] = move regs[ra]

for i in 1..<rc:
newFrame.slots[i].fastAsgnComplex(regs[rb+i])
t.regs[start + i].fastAsgnComplex(regs[rb+i])

pushFrame(newFrame)
# -1 for the following 'inc pc'
Expand Down Expand Up @@ -2912,17 +2944,20 @@ proc rawExecute(c: var TCtx, t: var VmThread, pc: var int): YieldReason =

proc `=copy`*(x: var VmThread, y: VmThread) {.error.}

proc initVmThread*(c: var TCtx, pc: int, frame: sink TStackFrame): VmThread =
## Sets up a ``VmThread`` instance that will start execution at `pc`.
## `frame` provides the initial stack frame.
proc initVmThread*(c: var TCtx, pc: PrgCtr, numRegisters: int,
sym: PSym): VmThread =
## Sets up a `VmThread <#VmThread>`_ instance that will start execution at
## `pc` and that has `numRegisters` as the initial amount of registers.
## `sym` is the symbol to associate the initial stack-frame with. It may be
## nil.
VmThread(pc: pc,
regs: newSeq[TFullReg](numRegisters),
loopIterations: c.config.maxLoopIterationsVM,
sframes: @[frame])
sframes: @[TStackFrame(prc: sym)])

proc dispose*(c: var TCtx, t: sink VmThread) =
## Cleans up and frees all VM data owned by `t`.
for f in t.sframes.mitems:
c.memory.cleanUpLocations(f)
c.memory.cleanUpLocations(t.regs, 0)

if t.currentException.isNotNil:
c.heap.heapDecRef(c.allocator, t.currentException)
Expand Down
5 changes: 3 additions & 2 deletions compiler/vm/vmdef.nim
Original file line number Diff line number Diff line change
Expand Up @@ -734,8 +734,9 @@ type

TStackFrame* = object
prc*: PSym # current prc; proc that is evaluated
slots*: seq[TFullReg] # parameters passed to the proc + locals;
# parameters come first
start*: int
## position in the thread's register sequence where the registers for
## the frame start
eh*: HOslice[int]
## points to the active list of instruction-to-EH mappings
baseOffset*: PrgCtr
Expand Down
9 changes: 3 additions & 6 deletions compiler/vm/vmrunner.nim
Original file line number Diff line number Diff line change
Expand Up @@ -375,14 +375,11 @@ proc main*(args: seq[string]): int =
let
entryPoint = c.functions[lr.unsafeGet.int]

# setup the starting frame:
var frame = TStackFrame(prc: entryPoint.sym)
frame.slots.newSeq(entryPoint.regCount)

# the execution part. Set up a thread and run it until it either exits
# normally or abnormally
var
thread = initVmThread(c, entryPoint.start, frame)
thread = initVmThread(c, entryPoint.start, entryPoint.regCount.int,
entryPoint.sym)
continueExecution = true ## whether we continue to execute after yield
while continueExecution:
continueExecution = false # default to stop execution on any yield
Expand All @@ -391,7 +388,7 @@ proc main*(args: seq[string]): int =
of yrkDone:
# on successful execution, the executable's main function returns the
# value of ``programResult``, which we use as the runner's exit code
let reg = thread[0].slots[r.reg.get]
let reg = thread.regs[r.reg.get]
result = regToNode(c, reg, nil, TLineInfo()).intVal.int
of yrkError:
# an uncaught error occurred
Expand Down

0 comments on commit bd28145

Please sign in to comment.