cmd/compile: optimize liveness in stackalloc
The stackalloc code needs to run a liveness pass to build the
interference graph between stack slots. Because the values that we need
liveness over are so sparse, we can optimize the analysis by using a
path exploration algorithm rather than an iterative dataflow one.
In local testing, this cuts 74.05 ms of CPU time off a build of cmd/compile.
diff --git a/src/cmd/compile/internal/ssa/stackalloc.go b/src/cmd/compile/internal/ssa/stackalloc.go
index 11ffe5b..eabdba3 100644
--- a/src/cmd/compile/internal/ssa/stackalloc.go
+++ b/src/cmd/compile/internal/ssa/stackalloc.go
@@ -56,10 +56,36 @@
}
type stackValState struct {
- typ *types.Type
- spill *Value
- needSlot bool
- isArg bool
+ typ *types.Type
+ spill *Value
+ needSlot bool
+ isArg bool
+ defBlock ID
+ useBlocks []stackUseBlock
+}
+
+// addUseBlock adds a block to the set of blocks that use this value.
+// The no-duplicates property of the set is only loosely enforced: we
+// append a block only if it differs from the last block appended.
+// Because values are added block by block (barring phi-nodes), this
+// approximates a set. Duplicates in useBlocks are harmless; the
+// liveness analysis will simply run right through them again.
+func (sv *stackValState) addUseBlock(b *Block, liveout bool) {
+ entry := stackUseBlock{
+ b: b,
+ liveout: liveout,
+ }
+ if sv.useBlocks == nil || sv.useBlocks[len(sv.useBlocks)-1] != entry {
+ sv.useBlocks = append(sv.useBlocks, stackUseBlock{
+ b: b,
+ liveout: liveout,
+ })
+ }
+}
+
+type stackUseBlock struct {
+ b *Block
+ liveout bool
}
// stackalloc allocates storage in the stack frame for
@@ -99,6 +125,7 @@
s.values[v.ID].typ = v.Type
s.values[v.ID].needSlot = !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && f.getHome(v.ID) == nil && !v.rematerializeable() && !v.OnWasmStack
s.values[v.ID].isArg = hasAnyArgOp(v)
+ s.values[v.ID].defBlock = b.ID
if f.pass.debug > stackDebug && s.values[v.ID].needSlot {
fmt.Printf("%s needs a stack slot\n", v)
}
@@ -291,80 +318,83 @@
// computeLive computes a map from block ID to a list of
// stack-slot-needing value IDs live at the end of that block.
-// TODO: this could be quadratic if lots of variables are live across lots of
-// basic blocks. Figure out a way to make this function (or, more precisely, the user
-// of this function) require only linear size & time.
+//
+// Because values using stack slots are few and far in between
+// (compared to the set of all values), we use a path exploration
+// algorithm to calculate liveness here.
func (s *stackAllocState) computeLive(spillLive [][]ID) {
- s.live = make([][]ID, s.f.NumBlocks())
- var phis []*Value
- live := s.f.newSparseSet(s.f.NumValues())
- defer s.f.retSparseSet(live)
- t := s.f.newSparseSet(s.f.NumValues())
- defer s.f.retSparseSet(t)
-
- // Instead of iterating over f.Blocks, iterate over their postordering.
- // Liveness information flows backward, so starting at the end
- // increases the probability that we will stabilize quickly.
- po := s.f.postorder()
- for {
- changed := false
- for _, b := range po {
- // Start with known live values at the end of the block
- live.clear()
- live.addAll(s.live[b.ID])
-
- // Propagate backwards to the start of the block
- phis = phis[:0]
- for i := len(b.Values) - 1; i >= 0; i-- {
- v := b.Values[i]
- live.remove(v.ID)
- if v.Op == OpPhi {
- // Save phi for later.
- // Note: its args might need a stack slot even though
- // the phi itself doesn't. So don't use needSlot.
- if !v.Type.IsMemory() && !v.Type.IsVoid() {
- phis = append(phis, v)
- }
- continue
- }
- for _, a := range v.Args {
- if s.values[a.ID].needSlot {
- live.add(a.ID)
- }
- }
- }
-
- // for each predecessor of b, expand its list of live-at-end values
- // invariant: s contains the values live at the start of b (excluding phi inputs)
- for i, e := range b.Preds {
- p := e.b
- t.clear()
- t.addAll(s.live[p.ID])
- t.addAll(live.contents())
- t.addAll(spillLive[p.ID])
- for _, v := range phis {
- a := v.Args[i]
- if s.values[a.ID].needSlot {
- t.add(a.ID)
- }
- if spill := s.values[a.ID].spill; spill != nil {
- //TODO: remove? Subsumed by SpillUse?
- t.add(spill.ID)
- }
- }
- if t.size() == len(s.live[p.ID]) {
- continue
- }
- // grow p's live set
- s.live[p.ID] = append(s.live[p.ID][:0], t.contents()...)
- changed = true
- }
+ f := s.f
+ for _, b := range f.Blocks {
+ for _, spillvid := range spillLive[b.ID] {
+ val := &s.values[spillvid]
+ val.addUseBlock(b, true)
}
-
- if !changed {
- break
+ for _, v := range b.Values {
+ for i, a := range v.Args {
+ val := &s.values[a.ID]
+ useBlock := b
+ forceLiveout := false
+ if v.Op == OpPhi {
+ useBlock = b.Preds[i].b
+ forceLiveout = true
+ if spill := val.spill; spill != nil {
+ s.values[spill.ID].addUseBlock(useBlock, true)
+ }
+ }
+ if !val.needSlot {
+ continue
+ }
+ val.addUseBlock(useBlock, forceLiveout)
+ }
}
}
+
+ s.live = make([][]ID, f.NumBlocks())
+ push := func(bid, vid ID) {
+ l := s.live[bid]
+ if l == nil || l[len(l)-1] != vid {
+ l = append(l, vid)
+ }
+ s.live[bid] = l
+ }
+ // TODO: If we can help along the interference graph by calculating livein sets,
+ // we can do so trivially by turning this sparse set into an array of arrays
+ // and checking the top for the current value instead of inclusion in the sparse set.
+ seen := f.newSparseSet(f.NumBlocks())
+ defer f.retSparseSet(seen)
+ // Instead of pruning out duplicate blocks when we build the useBlocks
+ // or when we add them to the queue, rely on the seen set to stop considering
+ // them. This is slightly faster than building the work queues as sets.
+ //
+ // However, this means that the queue can grow larger than the number of blocks.
+ // hence, we cannot use allocBlockSlice.
+ bqueue := make([]*Block, 0, f.NumBlocks())
+ for vid, v := range s.values {
+ if !v.needSlot {
+ continue
+ }
+ seen.clear()
+ bqueue = bqueue[:0]
+ for _, b := range v.useBlocks {
+ if b.liveout {
+ push(b.b.ID, ID(vid))
+ }
+ bqueue = append(bqueue, b.b)
+ }
+ for len(bqueue) > 0 {
+ work := bqueue[len(bqueue)-1]
+ bqueue = bqueue[:len(bqueue)-1]
+ if seen.contains(work.ID) || work.ID == v.defBlock {
+ continue
+ }
+ seen.add(work.ID)
+ for _, e := range work.Preds {
+ push(e.b.ID, ID(vid))
+ bqueue = append(bqueue, e.b)
+ }
+ }
+ }
+
if s.f.pass.debug > stackDebug {
for _, b := range s.f.Blocks {
fmt.Printf("stacklive %s %v\n", b, s.live[b.ID])
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
// The set property is loosely enforced. We append a block if the"The set property" you mean here is the no duplicates property, you should make that clear.
// Because values using stack slots are few and far inbetweenThis comment should probably go inside the body, as it relates to the implementation (not needed by callers).
//TODO: remove? Subsumed by SpillUse?Keep this TODO?
s.live[bid] = lThis can go inside the `if`.
push(b.b.ID, ID(vid))This case means we might have duplicates in s.live, right?
That's new behavior, might be worth checking that that is still ok. (It probably is, we just call `addAll` on it?)
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Commit-Queue | +1 |
// The set property is loosely enforced. We append a block if the"The set property" you mean here is the no duplicates property, you should make that clear.
Wrote a better comment 😊
// Because values using stack slots are few and far inbetweenThis comment should probably go inside the body, as it relates to the implementation (not needed by callers).
Done
//TODO: remove? Subsumed by SpillUse?Daniel MorsingKeep this TODO?
Done
This can go inside the `if`.
Done
This case means we might have duplicates in s.live, right?
That's new behavior, might be worth checking that that is still ok. (It probably is, we just call `addAll` on it?)
That's the neat thing about doing this value by value. Because we are iterating over the values in the outer loop, we either find the top of the stack for a given block to already have the value (the `l[len(l)-1] != vid` check in `push`) or we see that it's missing and push it.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Auto-Submit | +1 |
| Code-Review | +2 |
push(b.b.ID, ID(vid))Daniel MorsingThis case means we might have duplicates in s.live, right?
That's new behavior, might be worth checking that that is still ok. (It probably is, we just call `addAll` on it?)
That's the neat thing about doing this value by value. Because we are iterating over the values in the outer loop, we either find the top of the stack for a given block to already have the value (the `l[len(l)-1] != vid` check in `push`) or we see that it's missing and push it.
I see, there may be duplicate blocks in the queue, but we never get duplicate values in any block's list.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Code-Review | +1 |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
cmd/compile: optimize liveness in stackalloc
The stackalloc code needs to run a liveness pass to build the
interference graph between stack slots. Because the values that we need
liveness over are so sparse, we can optimize the analysis by using a path
exploration algorithm rather than an iterative dataflow one.
In local testing, this cuts 74.05 ms of CPU time off a build of cmd/compile.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |