cmd/compile: improve stp merging for non-sequent cases
The original algorithm merges stores with the first
mergeable store in the chain, but it misses some
cases. Additionally, building a list of stores that
write data to adjacent memory cells allows merging them
in order of increasing address.
I have already tried another algorithm in CL 698097,
but it was reverted. This algorithm works differently
and fixes the bug introduced by the variant in that CL.
Fixes #71987, #75365
Here are the results of the sweet benchmarks:
│ base.stat │ opt.stat │
│ sec/op │ sec/op vs base │
ESBuildThreeJS-4 1.088 ± 2% 1.086 ± 1% ~ (p=1.000 n=10)
ESBuildRomeTS-4 263.0m ± 2% 260.8m ± 1% ~ (p=0.105 n=10)
EtcdPut-4 73.08m ± 1% 73.16m ± 1% ~ (p=0.971 n=10)
EtcdSTM-4 414.9m ± 1% 415.4m ± 1% ~ (p=0.393 n=10)
GoBuildKubelet-4 203.3 ± 0% 203.5 ± 0% ~ (p=0.393 n=10)
GoBuildKubeletLink-4 19.06 ± 1% 19.05 ± 0% ~ (p=0.280 n=10)
GoBuildIstioctl-4 156.6 ± 0% 156.6 ± 0% ~ (p=0.796 n=10)
GoBuildIstioctlLink-4 14.16 ± 1% 14.18 ± 1% ~ (p=0.853 n=10)
GoBuildFrontend-4 56.45 ± 1% 56.57 ± 0% ~ (p=0.579 n=10)
GoBuildFrontendLink-4 3.635 ± 1% 3.646 ± 0% ~ (p=0.436 n=10)
GoBuildTsgo-4 103.0 ± 1% 103.4 ± 1% ~ (p=0.529 n=10)
GoBuildTsgoLink-4 1.865 ± 1% 1.860 ± 1% ~ (p=0.684 n=10)
GopherLuaKNucleotide-4 33.55 ± 0% 33.58 ± 0% ~ (p=0.075 n=10)
MarkdownRenderXHTML-4 281.0m ± 0% 280.3m ± 0% -0.23% (p=0.019 n=10)
Tile38QueryLoad-4 970.0µ ± 1% 969.3µ ± 0% ~ (p=0.436 n=10)
geomean 3.128 3.128 -0.01%
diff --git a/src/cmd/compile/internal/ssa/pair.go b/src/cmd/compile/internal/ssa/pair.go
index 7595fdb..6c1e074 100644
--- a/src/cmd/compile/internal/ssa/pair.go
+++ b/src/cmd/compile/internal/ssa/pair.go
@@ -320,10 +320,28 @@
return false
}
+type storeList struct {
+ st *Value
+ n int
+ // Index of ST, which store data to
+ // the adjacent memory cell with higher address
+ start bool
+ // Flag if it is start store in chain of STs
+ early bool
+ // Flag if this store is placed earlier
+ // in basic block than next ST (stores[elem.n].st)
+}
+
+const invalidVal = -1
+
func pairStores(f *Func) {
last := f.Cache.allocBoolSlice(f.NumValues())
defer f.Cache.freeBoolSlice(last)
+ var stores []storeList
+ // Index of st in stores
+ var ids = map[ID]int{}
+
// prevStore returns the previous store in the
// same block, or nil if there are none.
prevStore := func(v *Value) *Value {
@@ -338,6 +356,8 @@
}
for _, b := range f.Blocks {
+ stores = stores[:0]
+ clear(ids)
// Find last store in block, so we can
// walk the stores last to first.
// Last to first helps ensure that the rewrites we
@@ -373,14 +393,20 @@
continue // Not advisable to pair.
}
ptr := v.Args[0]
- val := v.Args[1]
- mem := v.Args[2]
off := v.AuxInt
aux := v.Aux
+ var ok bool
+ var vIdx int
+ if vIdx, ok = ids[v.ID]; !ok {
+ vIdx = len(stores)
+ ids[v.ID] = vIdx
+ stores = append(stores, storeList{v, invalidVal, true, true})
+ }
+
// Look for earlier store we can combine with.
- lowerOk := true
- higherOk := true
+ lowerOk := stores[vIdx].start
+ higherOk := stores[vIdx].n == invalidVal
count := 10 // max lookback distance
for w := prevStore(v); w != nil; w = prevStore(w) {
if w.Uses != 1 {
@@ -391,32 +417,7 @@
// all those other uses.)
continue memCheck
}
- if w.Op == v.Op &&
- w.Args[0] == ptr &&
- w.Aux == aux &&
- (lowerOk && w.AuxInt == off-info.width || higherOk && w.AuxInt == off+info.width) {
- // This op is mergeable with v.
- // Commit point.
-
- // ptr val1 val2 mem
- args := []*Value{ptr, val, w.Args[1], mem}
- if w.AuxInt == off-info.width {
- args[1], args[2] = args[2], args[1]
- off -= info.width
- }
- v.reset(info.pair)
- v.AddArgs(args...)
- v.Aux = aux
- v.AuxInt = off
- v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)
-
- // Make w just a memory copy.
- wmem := w.MemoryArg()
- w.reset(OpCopy)
- w.SetArgs1(wmem)
- continue memCheck
- }
if count--; count == 0 {
// Only look back so far.
// This keeps us in O(n) territory, and it
@@ -425,6 +426,38 @@
// needing to spill them).
continue memCheck
}
+
+ var wIdx int
+ if wIdx, ok = ids[w.ID]; !ok {
+ wIdx = len(stores)
+ ids[w.ID] = wIdx
+ stores = append(stores, storeList{w, invalidVal, true, true})
+ }
+
+ if w.Op == v.Op &&
+ w.Args[0] == ptr &&
+ w.Aux == aux &&
+ (lowerOk && w.AuxInt == off-info.width &&
+ stores[wIdx].n == invalidVal ||
+ higherOk && w.AuxInt == off+info.width &&
+ stores[wIdx].start) {
+ // This op is mergeable with v.
+
+ if lowerOk && w.AuxInt == off-info.width &&
+ stores[wIdx].n == invalidVal {
+ stores[vIdx].start = false
+ stores[wIdx].n = vIdx
+ stores[wIdx].early = true
+ lowerOk = false
+ } else {
+ stores[wIdx].start = false
+ stores[vIdx].n = wIdx
+ stores[vIdx].early = false
+ higherOk = false
+ }
+
+ continue
+ }
// We're now looking at a store w which is currently
// between the store v that we're intending to merge into,
// and the store we'll eventually find to merge with it.
@@ -467,5 +500,62 @@
}
}
}
+
+ for i := range stores {
+ elem := &stores[i]
+ // Start pairing from the end of chain of stores
+ if elem.n != invalidVal && elem.start {
+ pairCompatibleStores(elem, stores[:])
+ }
+ }
+ }
+}
+
+func pairCompatibleStores(elem *storeList, stores []storeList) {
+ for {
+ // Mark elem prev as invalid and get w
+ if elem.n == invalidVal {
+ break
+ }
+ v := elem.st
+ w := stores[elem.n].st
+
+ mem := v.MemoryArg()
+ off := v.AuxInt
+ aux := v.Aux
+ wmem := w.MemoryArg()
+
+ // ptr val1 val2 mem
+ args := []*Value{v.Args[0], v.Args[1], w.Args[1], mem}
+
+ // Pair stores to the latest of them
+ if elem.early {
+ v, w = w, v
+ args[3] = wmem
+ mem, wmem = wmem, mem
+ }
+
+ v.reset(pairableStores[v.Op].pair)
+ v.AddArgs(args...)
+ v.Aux = aux
+ v.AuxInt = off
+ v.Pos = w.Pos
+
+ // Make w just a memory copy.
+ w.reset(OpCopy)
+ w.SetArgs1(wmem)
+
+ // Mark this stores as invalid
+ // and get pointer to the next store
+ nextSt := stores[elem.n].n
+ stores[elem.n].n = invalidVal
+ elem.n = invalidVal
+
+ if nextSt == invalidVal {
+ // There is end of chain of stores
+ break
+ }
+ // Get pointer to the next store
+ elem = &stores[nextSt]
}
}
diff --git a/test/codegen/memcombine.go b/test/codegen/memcombine.go
index 6feef26..1bb3133 100644
--- a/test/codegen/memcombine.go
+++ b/test/codegen/memcombine.go
@@ -1062,17 +1062,44 @@
}
func dwstoreBig(p *struct{ a, b, c, d, e, f int64 }, a, b, c, d, e, f int64) {
- // This is not perfect. We merge b+a, then d+e, then c and f have no pair.
+ // arm64:`STP\s\(R[0-9]+, R[0-9]+\), 16\(R[0-9]+\)`
p.c = c
+ // arm64:`STP\s\(R[0-9]+, R[0-9]+\), 32\(R[0-9]+\)`
p.f = f
// arm64:`STP\s\(R[0-9]+, R[0-9]+\), \(R[0-9]+\)`
p.a = a
- // arm64:`STP\s\(R[0-9]+, R[0-9]+\), 24\(R[0-9]+\)`
p.e = e
p.d = d
p.b = b
}
+func dwstoreBigNil(p *struct{ i, j struct{ a, b, c int } }) {
+ // arm64:`STP\s\(ZR, ZR\), 32\(R[0-9]+\)`
+ // arm64:`STP\s\(ZR, ZR\), 16\(R[0-9]+\)`
+ p.j = struct{ a, b, c int }{}
+ // arm64:`STP\s\(ZR, ZR\), \(R[0-9]+\)`
+ p.i = struct{ a, b, c int }{}
+}
+
+func dwstoreString(s *struct {
+ p *byte
+ a string
+ b string
+ c int64
+ d int64
+}) {
+ s.p = nil
+ s.a = "foo"
+ s.b = "foo"
+ s.c = 0
+ s.d = 0
+ // arm64:"STP "
+ s.a = ""
+ // arm64:"STP "
+ s.b = "bar"
+ s.c = 33
+}
+
func dwstoreRet() [2]int {
// arm64:"STP "
return [2]int{5, 6}
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
I'm afraid I can't really follow what is going on in this CL. It needs an expanded comment somewhere describing how it works. What is the algorithm for finding pairs of stores to merge?
What does it mean to "start" a run? What does "early" mean, and why does it matter?
// Index of ST, which store data toGenerally we put the comment describing a field before the field, not after.
func pairCompatibleStores(elem *storeList, stores []storeList) {This function could use a doc.
What input is it expecting? What side effect does it do?
// Mark elem prev as invalid and get wI don't understand this comment.
func dwstoreString(s *struct {This test is fine, but it's just a test that the optimization was applied. This isn't a correctness test (and isn't the place for such things).
We really need a test in test/fixedbugs/issue75365.go that makes sure that bug does not recur with this CL.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
I'm afraid I can't really follow what is going on in this CL. It needs an expanded comment somewhere describing how it works. What is the algorithm for finding pairs of stores to merge?
What does it mean to "start" a run? What does "early" mean, and why does it matter?
I added more comments in code to clarify my algorithm, but if it is not enough, please, tell me.
Please, check new comments. I made them expanded.
Generally we put the comment describing a field before the field, not after.
Done
Generally we put the comment describing a field before the field, not after.
Done
func pairCompatibleStores(elem *storeList, stores []storeList) {This function could use a doc.
What input is it expecting? What side effect does it do?
Done
// Mark elem prev as invalid and get wI don't understand this comment.
I rewrote comments for this part in a better way.
// Mark elem prev as invalid and get wI don't understand this comment.
I rewrote comments for this part in a better way.
This test is fine, but it's just a test that the optimization was applied. This isn't a correctness test (and isn't the place for such things).
We really need a test in test/fixedbugs/issue75365.go that makes sure that bug does not recur with this CL.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
continue memCheckThis part here makes me really nervous. I think we need to throw everything away when we see such an operation? Or at least flush the possible pairings we have encountered so far and reset to a clean state. Otherwise we'll end up moving stores past loads, which is bad.
I have a different suggestion that might be more correct, and just as importantly, be much clearer. How about
```
// memChain contains a list of stores with the same ptr/aux pair and
// nonoverlapping write ranges [AuxInt:AuxInt+writeSize]. All of the
// elements of memChain can be reordered with each other.
memChain := []*Value{}
// writeRanges contains the union of write ranges of entries in memChain.
writeRanges := shadowRanges{} // from deadstore.go.
flushMemChain := func() {
// 1. sort memChain by AuxInt
// 2. walk through and pair up
memChain = memChain[:0]
}
for _, b := range f.Blocks {
clear(memChain)
for v := lastMem; v != nil; v = prevStore(v) {
if v.Uses != 1 {
flushMemChain()
continue
}
if len(memChain)>0 && (v.Args[0] != memChain[0].Args[0] || v.Aux != memChain[0].Aux) {
flushMemChain()
continue
}
if writeRanges.overlaps(v.AuxInt, v.AuxInt+writeSize) {
flushMemChain()
continue
}
memChain = append(memChain, v)
writeRanges.add(v.AuxInt, v.AuxInt+writeSize)
}
flushMemChain()
}
```
Now I know exactly what the invariant we are maintaining is. Everything in memChain is reorderable with each other. Any time we encounter a problem, we flush and restart.
// runThis file needs a copyright header.
(Just copy an existing one from this dir and set the year to 2026.)
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Denis MelnikovI'm afraid I can't really follow what is going on in this CL. It needs an expanded comment somewhere describing how it works. What is the algorithm for finding pairs of stores to merge?
What does it mean to "start" a run? What does "early" mean, and why does it matter?
I added more comments in code to clarify my algorithm, but if it is not enough, please, tell me.
Done
Change algorithm according to comments
This is a good way to solve it. Thank you. I have adopted this algorithm.
// Mark elem prev as invalid and get wDenis MelnikovI don't understand this comment.
I rewrote comments for this part in a better way.
Done
This file needs a copyright header.
(Just copy an existing one from this dir and set the year to 2026.)
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
type shadowRange struct {Maybe add a comment here that the range is inclusive on the bottom end and exclusive on the top end (like Go slices).
if lo <= int64(r.hi) && hi >= int64(r.lo) {I think these tests should not include equality.
By "overlapped", I presume you mean "has at least one byte in common". In which case, when these are equal, the ranges abut but do not overlap.
if lo < 0 || hi > 0xffff || len(sr.ranges) >= maxShadowRanges {
return
}This worries me. `shadowRanges` as used by deadstore can always respond "not shadowed" (not overlapping). It approximates the exact set of ranges with a *smaller* set.
I think in your use case we need the opposite. The safe return value is "always overlapping". If `shadowRanges` needs to approximate, it needs to use a *larger* approximation to be safe.
writeRanges = shadowRanges{}This might be more efficient as `writeRanges.ranges = writeRanges.ranges[:0]` to reuse the backing store.
Or maybe add that as a `reset` method to `shadowRanges`.
// Sort to put pairable stores together.in decreasing AuxInt
for i := len(memChain) - 1; i > 0; i-- {Seems a bit weird to sort decreasing and then walk backwards. Can we just sort increasing and walk forwards?
// or 0 if it is not a storenot a store this pass understands.
clear(memChain)Probably you mean `memChain = memChain[:0]`? `clear` doesn't reset the length to 0, it just zeros the elements.
// Cannot merge stores with multiple uses except the first store in the chain.last
if writeRanges.overlaps(v.AuxInt, v.AuxInt+writeSize-1) {Ah, you're using both-inclusive bounds here. I guess maybe that works with the current implementation of `shadowRanges`? But that's not how it is intended to be used. In particular, I don't think range merging will happen.
Probably best to use it the same way as deadstore does, with inclusive min and exclusive max.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
if writeRanges.overlaps(v.AuxInt, v.AuxInt+writeSize-1) {Ah, you're using both-inclusive bounds here. I guess maybe that works with the current implementation of `shadowRanges`? But that's not how it is intended to be used. In particular, I don't think range merging will happen.
Probably best to use it the same way as deadstore does, with inclusive min and exclusive max.
Or, just don't use shadowStore and walk the set of stores in memChain every time. It shouldn't be that expensive, and we can bound the length of the chain to a reasonable constant (100?) to make it not O(n^2).
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Leave shadowRanges alone. Walk through memChain to check for overlapping store locations instead of using it. Change flushMemChain to walk forwards instead of backwards.
Maybe add a comment here that the range is inclusive on the bottom end and exclusive on the top end (like Go slices).
Done
I think these tests should not include equality.
By "overlapped", I presume you mean "has at least one byte in common". In which case, when these are equal, the ranges abut but do not overlap.
Done
if lo < 0 || hi > 0xffff || len(sr.ranges) >= maxShadowRanges {
return
}This worries me. `shadowRanges` as used by deadstore can always respond "not shadowed" (not overlapping). It approximates the exact set of ranges with a *smaller* set.
I think in your use case we need the opposite. The safe return value is "always overlapping". If `shadowRanges` needs to approximate, it needs to use a *larger* approximation to be safe.
Done
// pairStores merges ST instructions.Denis Melnikovstore
Done
This might be more efficient as `writeRanges.ranges = writeRanges.ranges[:0]` to reuse the backing store.
Or maybe add that as a `reset` method to `shadowRanges`.
Done
// Sort to put pairable stores together.Denis Melnikovin decreasing AuxInt
Done
Seems a bit weird to sort decreasing and then walk backwards. Can we just sort increasing and walk forwards?
I did it so that the length of memChain is computed only once.
Seems a bit weird to sort decreasing and then walk backwards. Can we just sort increasing and walk forwards?
I did it so that the length of memChain is computed only once.
writeRanges = shadowRanges{}Denis Melnikovsame here
Done
not a store this pass understands.
Done
Probably you mean `memChain = memChain[:0]`? `clear` doesn't reset the length to 0, it just zeros the elements.
Done
// Cannot merge stores with multiple uses except the first store in the chain.Denis Melnikovlast
Done
if writeRanges.overlaps(v.AuxInt, v.AuxInt+writeSize-1) {Keith RandallAh, you're using both-inclusive bounds here. I guess maybe that works with the current implementation of `shadowRanges`? But that's not how it is intended to be used. In particular, I don't think range merging will happen.
Probably best to use it the same way as deadstore does, with inclusive min and exclusive max.
Or, just don't use shadowStore and walk the set of stores in memChain every time. It shouldn't be that expensive, and we can bound the length of the chain to a reasonable constant (100?) to make it not O(n^2).
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Code-Review | +2 |
| Commit-Queue | +1 |
if v.Uses != 1 && len(memChain) > 0 || writeSize == 0 {I think in the `v.Uses != 1` case, we don't need to `continue`, we could start a new memchain. So maybe these two cases should be separated?
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Change behavior when v.Uses != 1 && len(memChain) > 0. Merge 3 conditions to make it branchless.
if v.Uses != 1 && len(memChain) > 0 || writeSize == 0 {I think in the `v.Uses != 1` case, we don't need to `continue`, we could start a new memchain. So maybe these two cases should be separated?
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Adding Mark Freeman to get comments or a +1 from one more maintainer. Mark gave a +1 on my previous patch to this pass.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |