Skip to content

Commit

Permalink
cmd/compile/internal: stack slot merging region formation enhancements
Browse files Browse the repository at this point in the history
This patch revises the algorithm/strategy used for overlapping the
stack slots of disjointly accessed local variables. The main change
here is to allow merging the stack slot of B into the slot for A if
B's size is less than A's (prior to this they had to be identical), and
to also allow merging non-pointer variables into pointer-variable
slots.

The new algorithm sorts the candidate list first by pointerness
(pointer variables first), then by alignment, then by size, and
finally by name. We no longer check that two variables have the same
GC shape before merging: since it should never be the case that we
have two vars X and Y both live across a given callsite where X and Y
share a stack slot, their gc shape doesn't matter.

Doing things this new way increases the total number of bytes saved
(across all functions) from 91256 to 124336 for the sweet benchmarks.

Updates #62737.
Updates #65532.
Updates #65495.

Change-Id: I1daaac1b1240aa47a6975e98ccd24e03304ab602
Reviewed-on: https://go-review.googlesource.com/c/go/+/577615
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
  • Loading branch information
thanm committed Apr 18, 2024
1 parent a973b42 commit e01b1eb
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 141 deletions.
204 changes: 110 additions & 94 deletions src/cmd/compile/internal/liveness/mergelocals.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@ import (
"cmd/compile/internal/base"
"cmd/compile/internal/bitvec"
"cmd/compile/internal/ir"
"cmd/compile/internal/reflectdata"
"cmd/compile/internal/ssa"
"cmd/internal/obj"
"cmd/internal/src"
"fmt"
"os"
Expand All @@ -23,12 +21,14 @@ import (
// (stack-allocated) variables within a function can be safely
// merged/overlapped, e.g. share a stack slot with some other auto).
// An instance of MergeLocalsState is produced by MergeLocals() below
// and then consumed in ssagen.AllocFrame. The map 'partition' contains
// entries of the form <N,SL> where N is an *ir.Name and SL is a slice
// holding the indices (within 'vars') of other variables that share the
// same slot. For example, if a function contains five variables where
// v1/v2/v3 are safe to overlap and v4/v5 are safe to overlap, the
// MergeLocalsState content might look like
// and then consumed in ssagen.AllocFrame. The map 'partition'
// contains entries of the form <N,SL> where N is an *ir.Name and SL
// is a slice holding the indices (within 'vars') of other variables
// that share the same slot, specifically the slot of the first
// element in the partition, which we'll call the "leader". For
// example, if a function contains five variables where v1/v2/v3 are
// safe to overlap and v4/v5 are safe to overlap, the MergeLocalsState
// content might look like
//
// vars: [v1, v2, v3, v4, v5]
// partition: v1 -> [1, 0, 2], v2 -> [1, 0, 2], v3 -> [1, 0, 2]
Expand All @@ -49,6 +49,22 @@ type candRegion struct {
st, en int
}

// cstate holds state information we'll need during the analysis
// phase of stack slot merging but can be discarded when the analysis
// is done.
type cstate struct {
// fn is the function under analysis; it is what trace output prints
// to identify the function. NOTE(review): presumably the IR-level
// counterpart of f -- confirm.
fn *ir.Func
// f is the ssa function being analyzed by MergeLocals.
f *ssa.Func
// lv is liveness analysis state. NOTE(review): how it is consumed is
// not visible in this part of the file -- confirm.
lv *liveness
// cands is the list of candidate variables for slot merging.
// NOTE(review): presumably the pruned, sorted list that the indices
// in regions refer to -- confirm.
cands []*ir.Name
// nameToSlot is keyed by candidate variable. NOTE(review): presumably
// maps each candidate to its index within cands -- confirm.
nameToSlot map[*ir.Name]int32
// regions holds the candRegion ranges produced by genRegions, each
// describing a run of potentially overlappable/mergeable variables.
regions []candRegion
// indirectUE is the indirect use table used during interval
// construction, keyed by SSA ID.
indirectUE map[ssa.ID][]*ir.Name
// ivs holds Intervals values. NOTE(review): presumably one lifetime
// interval set per candidate, parallel to cands -- confirm.
ivs []Intervals
// hashDeselected records, per candidate, whether the merge locals
// hash debug de-selected it (true means de-selected). It is only set
// when hash bisection is enabled and at least one candidate was
// de-selected; otherwise it stays nil. mergeVisitRegion skips any
// candidate marked true here.
hashDeselected map[*ir.Name]bool
trace int // debug trace level
}

// MergeLocals analyzes the specified ssa function f to determine which
// of its auto variables can safely share the same stack slot, returning
// a state object that describes how the overlap should be done.
Expand Down Expand Up @@ -223,6 +239,19 @@ func (mls *MergeLocalsState) check() error {
if !foundk {
return fmt.Errorf("k=%s v=+%v slice value missing k", k.Sym().Name, sl)
}
vl := mls.vars[sl[0]]
for _, v := range sl[1:] {
vv := mls.vars[v]
if vv.Type().Size() > vl.Type().Size() {
return fmt.Errorf("k=%s v=+%v follower %s size %d larger than leader %s size %d", k.Sym().Name, sl, vv.Sym().Name, vv.Type().Size(), vl.Sym().Name, vl.Type().Size())
}
if vv.Type().HasPointers() && !vl.Type().HasPointers() {
return fmt.Errorf("k=%s v=+%v follower %s hasptr=true but leader %s hasptr=false", k.Sym().Name, sl, vv.Sym().Name, vl.Sym().Name)
}
if vv.Type().Alignment() > vl.Type().Alignment() {
return fmt.Errorf("k=%s v=+%v follower %s align %d greater than leader %s align %d", k.Sym().Name, sl, vv.Sym().Name, vv.Type().Alignment(), vl.Sym().Name, vl.Type().Alignment())
}
}
}
for i := range used {
if !used[i] {
Expand Down Expand Up @@ -296,14 +325,13 @@ func (cs *cstate) collectMergeCandidates() {

// Now generate an initial pruned candidate list and regions list.
// This may be empty if we don't have enough compatible candidates.
initial, _ := genRegions(cands)
initial, _ := cs.genRegions(cands)
if len(initial) < 2 {
return
}

// When bisecting it can be handy to see debug trace output for
// only those functions that hashdebug selects; set this up here.
cs.setupHashTrace(initial)
// Set up for hash bisection if enabled.
cs.setupHashBisection(initial)

// Create and populate an indirect use table that we'll use
// during interval construction. As part of this process we may
Expand All @@ -330,7 +358,9 @@ func (cs *cstate) collectMergeCandidates() {
}
}

func genRegions(cands []*ir.Name) ([]*ir.Name, []candRegion) {
// genRegions generates a set of regions within cands corresponding
// to potentially overlappable/mergeable variables.
func (cs *cstate) genRegions(cands []*ir.Name) ([]*ir.Name, []candRegion) {
var pruned []*ir.Name
var regions []candRegion
st := 0
Expand All @@ -346,8 +376,8 @@ func genRegions(cands []*ir.Name) ([]*ir.Name, []candRegion) {
}
pst := len(pruned)
pen := pst + (en - st)
if base.Debug.MergeLocalsTrace > 1 {
fmt.Fprintf(os.Stderr, "=-= add part %d -> %d\n", pst, pen)
if cs.trace > 1 {
fmt.Fprintf(os.Stderr, "=-= addregion st=%d en=%d: add part %d -> %d\n", st, en, pst, pen)
}

// non-empty region, add to pruned
Expand Down Expand Up @@ -385,27 +415,29 @@ func (cs *cstate) dumpFuncIfSelected() {
cs.dumpFunc()
}

func (cs *cstate) setupHashTrace(cands []*ir.Name) {
if base.Debug.MergeLocalsHTrace == 0 || base.Debug.MergeLocalsHash == "" {
// setupHashBisection checks to see if any of the candidate
// variables have been de-selected by our hash debug. Here
// we also implement the -d=mergelocalshtrace flag, which turns
// on debug tracing only if we have at least two candidates
// selected by the hash debug for this function.
func (cs *cstate) setupHashBisection(cands []*ir.Name) {
if base.Debug.MergeLocalsHash == "" {
return
}

// With this trace variant, check to see whether any of the
// candidates are selected-- if yes then enable tracing. Hack:
// create a new hashdebug with verbosity turned off and use that
// to test, so as not to confuse bisect.
modified := strings.ReplaceAll(base.Debug.MergeLocalsHash, "v", "q")
quiethd := base.NewHashDebug("qmergelocals", modified, nil)
found := false
deselected := make(map[*ir.Name]bool)
selCount := 0
for _, cand := range cands {
if !quiethd.MatchPosWithInfo(cand.Pos(), "quiet", nil) {
found = true
fmt.Fprintf(os.Stderr, "=-= MergeLocalsHTrace fn=%v n=%s match\n",
cs.fn, cand.Sym().Name)
break
if !base.MergeLocalsHash.MatchPosWithInfo(cand.Pos(), "mergelocals", nil) {
deselected[cand] = true
} else {
deselected[cand] = false
selCount++
}
}
if found {
if selCount < len(cands) {
cs.hashDeselected = deselected
}
if base.Debug.MergeLocalsHTrace != 0 && selCount >= 2 {
cs.trace = base.Debug.MergeLocalsHTrace
}
}
Expand Down Expand Up @@ -566,7 +598,7 @@ func (cs *cstate) populateIndirectUseTable(cands []*ir.Name) ([]*ir.Name, []cand
return nameLess(pruned[i], pruned[j])
})
var regions []candRegion
pruned, regions = genRegions(pruned)
pruned, regions = cs.genRegions(pruned)
if len(pruned) < 2 {
return nil, nil
}
Expand All @@ -586,29 +618,30 @@ type nameCount struct {
count int32
}

// nameLess compares ci with cj to see if ci should be less than cj
// in a relative ordering of candidate variables. This is used to
// sort vars by size, pointerness, and GC shape.
// nameLess compares ci with cj to see if ci should be less than cj in
// a relative ordering of candidate variables. This is used to sort
// vars by pointerness (variables with pointers first), then in order
// of decreasing alignment, then by decreasing size. We are assuming a
// merging algorithm that merges later entries in the list into
// earlier entries. An example ordered candidate list produced by
// nameLess:
//
// idx name type align size
// 0: abc [10]*int 8 80
// 1: xyz [9]*int 8 72
// 2: qrs [2]*int 8 16
// 3: tuv [9]int 8 72
// 4: wxy [9]int32 4 36
// 5: jkl [8]int32 4 32
func nameLess(ci, cj *ir.Name) bool {
ihp, jhp := 0, 0
var ilsym, jlsym *obj.LSym
if ci.Type().HasPointers() {
ihp = 1
ilsym, _, _ = reflectdata.GCSym(ci.Type())
if ci.Type().HasPointers() != cj.Type().HasPointers() {
return ci.Type().HasPointers()
}
if cj.Type().HasPointers() {
jhp = 1
jlsym, _, _ = reflectdata.GCSym(cj.Type())
}
if ihp != jhp {
return ihp < jhp
if ci.Type().Alignment() != cj.Type().Alignment() {
return cj.Type().Alignment() < ci.Type().Alignment()
}
if ci.Type().Size() != cj.Type().Size() {
return ci.Type().Size() < cj.Type().Size()
}
if ihp != 0 && jhp != 0 && ilsym != jlsym {
// FIXME: find less clunky way to do this
return fmt.Sprintf("%v", ilsym) < fmt.Sprintf("%v", jlsym)
return cj.Type().Size() < ci.Type().Size()
}
if ci.Sym().Name != cj.Sym().Name {
return ci.Sym().Name < cj.Sym().Name
Expand All @@ -617,63 +650,48 @@ func nameLess(ci, cj *ir.Name) bool {
}

// nextRegion starts at location idx and walks forward in the cands
// slice looking for variables that are "compatible" (overlappable)
// with the variable at position idx; it returns the end of the new
// region (range of compatible variables starting at idx).
// slice looking for variables that are "compatible" (potentially
// overlappable, in the sense that they could potentially share the
// stack slot of cands[idx]); it returns the end of the new region
// (range of compatible variables starting at idx).
func nextRegion(cands []*ir.Name, idx int) int {
n := len(cands)
if idx >= n {
return -1
}
c0 := cands[idx]
hp0 := c0.Type().HasPointers()
szprev := c0.Type().Size()
alnprev := c0.Type().Alignment()
for j := idx + 1; j < n; j++ {
cj := cands[j]
hpj := cj.Type().HasPointers()
ok := true
if hp0 {
if !hpj || c0.Type().Size() != cj.Type().Size() {
return j - 1
}
// GC shape must match if both types have pointers.
gcsym0, _, _ := reflectdata.GCSym(c0.Type())
gcsymj, _, _ := reflectdata.GCSym(cj.Type())
if gcsym0 != gcsymj {
return j - 1
}
} else {
// If no pointers, match size only.
if !ok || hp0 != hpj || c0.Type().Size() != cj.Type().Size() {
return j - 1
}
szj := cj.Type().Size()
if szj > szprev {
return j - 1
}
alnj := cj.Type().Alignment()
if alnj > alnprev {
return j - 1
}
szprev = szj
alnprev = alnj
}
return n - 1
}

// cstate holds state information we'll need during the analysis
// phase of stack slot merging but can be discarded when the analysis
// is done.
type cstate struct {
fn *ir.Func
f *ssa.Func
lv *liveness
cands []*ir.Name
nameToSlot map[*ir.Name]int32
regions []candRegion
indirectUE map[ssa.ID][]*ir.Name
ivs []Intervals
trace int // debug trace level
}

// mergeVisitRegion tries to perform overlapping of variables with a
// given subrange of cands described by st and en (indices into our
// candidate var list), where the variables within this range have
// already been determined to be compatible with respect to type,
// size, etc. Overlapping is done in a greedy fashion: we select the
// first element in the st->en range, then walk the rest of the
// elements adding in vars whose lifetimes don't overlap with the
// first element, then repeat the process until we run out of work to do.
// first element, then repeat the process until we run out of work.
// Ordering of the candidates within the region [st,en] is important;
// within the list the assumption is that if we overlap two variables
// X and Y where X precedes Y in the list, we need to make X the
// "leader" (keep X's slot and set Y's frame offset to X's) as opposed
// to the other way around, since it's possible that Y is smaller in
// size than X.
func (cs *cstate) mergeVisitRegion(mls *MergeLocalsState, st, en int) {
if cs.trace > 1 {
fmt.Fprintf(os.Stderr, "=-= mergeVisitRegion(st=%d, en=%d)\n", st, en)
Expand Down Expand Up @@ -712,10 +730,8 @@ func (cs *cstate) mergeVisitRegion(mls *MergeLocalsState, st, en int) {
for succ := nxt(leader + 1); succ != -1; succ = nxt(succ + 1) {

// Skip if de-selected by merge locals hash.
if base.Debug.MergeLocalsHash != "" {
if !base.MergeLocalsHash.MatchPosWithInfo(cands[succ].Pos(), "mergelocals", nil) {
continue
}
if cs.hashDeselected != nil && cs.hashDeselected[cands[succ]] {
continue
}
// Skip if already used.
if used.Get(int32(succ - st)) {
Expand Down Expand Up @@ -1004,9 +1020,9 @@ func fmtFullPos(p src.XPos) string {
}

func dumpCand(c *ir.Name, i int) {
fmt.Fprintf(os.Stderr, " %d: %s %q sz=%d hp=%v t=%v\n",
fmt.Fprintf(os.Stderr, " %d: %s %q sz=%d hp=%v align=%d t=%v\n",
i, fmtFullPos(c.Pos()), c.Sym().Name, c.Type().Size(),
c.Type().HasPointers(), c.Type())
c.Type().HasPointers(), c.Type().Alignment(), c.Type())
}

// for unit testing only.
Expand Down
Loading

0 comments on commit e01b1eb

Please sign in to comment.