cmd/compile: forward small Load through Move to avoid redundant copies
Add a generic SSA rewrite that forwards `Load` from a `Move` destination
back to the `Move` source when it is provably safe, so field reads like
`s.h.Value().typ` don’t force a full struct copy.
- Add `Load <- Move` rewrite in `generic.rules` with safety guards:
in-range offset, `CanSSA` load type, non-volatile source, same block,
alias safety (`disjoint` / `isInlinableMemmove` / non-address-taken local addr).
- Limit rewrite to `n < 1<<30` so huge moves still trigger existing
large-stack-frame diagnostics (keeps `fixedbugs/issue22200*` behavior).
- Add `isLocalAddrNoAlias` helper in `rewrite.go`.
- Regenerate `rewritegeneric.go`.
- Add `test/codegen/moveload.go` to assert no `MOVUPS` and a direct `MOVBLZX`
  in both direct and inlined forms.

Fixes #77720
diff --git a/src/cmd/compile/internal/ssa/_gen/generic.rules b/src/cmd/compile/internal/ssa/_gen/generic.rules
index 495a8d9..55faec2 100644
--- a/src/cmd/compile/internal/ssa/_gen/generic.rules
+++ b/src/cmd/compile/internal/ssa/_gen/generic.rules
@@ -796,6 +796,16 @@
&& disjoint(p5, t5.Size(), p4, t4.Size())
=> x
+// Load from a region just copied by Move can read directly from the source.
+// Keep this conservative: only if we can prove overlap safety for Move semantics.
+(Load <t1> op:(OffPtr [o1] p1) move:(Move [n] p2 src mem))
+ && o1 >= 0 && o1+t1.Size() <= n && n < 1<<30 && isSamePtr(p1, p2)
+ && CanSSA(t1)
+ && !isVolatile(src)
+ && src.Block == mem.Block
+ && (disjoint(src, n, p2, n) || isInlinableMemmove(p2, src, n, config) || isLocalAddrNoAlias(p2))
+ => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] src) mem)
+
// Pass constants through math.Float{32,64}bits and math.Float{32,64}frombits
(Load <t1> p1 (Store {t2} p2 (Const64 [x]) _)) && isSamePtr(p1,p2) && t2.Size() == 8 && is64BitFloat(t1) && !math.IsNaN(math.Float64frombits(uint64(x))) => (Const64F [math.Float64frombits(uint64(x))])
(Load <t1> p1 (Store {t2} p2 (Const32 [x]) _)) && isSamePtr(p1,p2) && t2.Size() == 4 && is32BitFloat(t1) && !math.IsNaN(float64(math.Float32frombits(uint32(x)))) => (Const32F [math.Float32frombits(uint32(x))])
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index b093081..1699b25 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -882,6 +882,19 @@
return v.Op == OpSP || v.Op == OpLocalAddr
}
+// isLocalAddrNoAlias reports whether v points into a non-address-taken local stack slot.
+// Such slots cannot be referenced by ordinary source-level pointers.
+func isLocalAddrNoAlias(v *Value) bool {
+ for v.Op == OpOffPtr {
+ v = v.Args[0]
+ }
+ if v.Op != OpLocalAddr {
+ return false
+ }
+ n, ok := v.Aux.(*ir.Name)
+ return ok && n.OnStack() && !n.Addrtaken()
+}
+
// disjoint reports whether the memory region specified by [p1:p1+n1)
// does not overlap with [p2:p2+n2).
// A return value of false does not imply the regions overlap.
diff --git a/src/cmd/compile/internal/ssa/rewritegeneric.go b/src/cmd/compile/internal/ssa/rewritegeneric.go
index 4e8f703..69ff84b 100644
--- a/src/cmd/compile/internal/ssa/rewritegeneric.go
+++ b/src/cmd/compile/internal/ssa/rewritegeneric.go
@@ -11987,6 +11987,37 @@
v.copyOf(x)
return true
}
+ // match: (Load <t1> op:(OffPtr [o1] p1) move:(Move [n] p2 src mem))
+ // cond: o1 >= 0 && o1+t1.Size() <= n && n < 1<<30 && isSamePtr(p1, p2) && CanSSA(t1) && !isVolatile(src) && src.Block == mem.Block && (disjoint(src, n, p2, n) || isInlinableMemmove(p2, src, n, config) || isLocalAddrNoAlias(p2))
+ // result: @mem.Block (Load <t1> (OffPtr <op.Type> [o1] src) mem)
+ for {
+ t1 := v.Type
+ op := v_0
+ if op.Op != OpOffPtr {
+ break
+ }
+ o1 := auxIntToInt64(op.AuxInt)
+ p1 := op.Args[0]
+ move := v_1
+ if move.Op != OpMove {
+ break
+ }
+ n := auxIntToInt64(move.AuxInt)
+ mem := move.Args[2]
+ p2 := move.Args[0]
+ src := move.Args[1]
+ if !(o1 >= 0 && o1+t1.Size() <= n && n < 1<<30 && isSamePtr(p1, p2) && CanSSA(t1) && !isVolatile(src) && src.Block == mem.Block && (disjoint(src, n, p2, n) || isInlinableMemmove(p2, src, n, config) || isLocalAddrNoAlias(p2))) {
+ break
+ }
+ b = mem.Block
+ v0 := b.NewValue0(v.Pos, OpLoad, t1)
+ v.copyOf(v0)
+ v1 := b.NewValue0(v.Pos, OpOffPtr, op.Type)
+ v1.AuxInt = int64ToAuxInt(o1)
+ v1.AddArg(src)
+ v0.AddArg2(v1, mem)
+ return true
+ }
// match: (Load <t1> p1 (Store {t2} p2 (Const64 [x]) _))
// cond: isSamePtr(p1,p2) && t2.Size() == 8 && is64BitFloat(t1) && !math.IsNaN(math.Float64frombits(uint64(x)))
// result: (Const64F [math.Float64frombits(uint64(x))])
diff --git a/test/codegen/moveload.go b/test/codegen/moveload.go
new file mode 100644
index 0000000..d24d410
--- /dev/null
+++ b/test/codegen/moveload.go
@@ -0,0 +1,38 @@
+// asmcheck
+
+// Copyright 2026 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package codegen
+
+// From issue #77720: cmd/compile: field access on struct-returning method copies entire struct
+
+// moveLoadBig is a multi-word struct; a wholesale copy of it would
+// presumably be lowered to wide moves (e.g. MOVUPS) on amd64 — the
+// asmcheck functions below assert that no such copy happens.
+type moveLoadBig struct {
+ typ int8
+ index int64
+ str string
+ pkgID string
+}
+
+// moveLoadHandle is a minimal generic handle wrapping a pointer to T.
+type moveLoadHandle[T any] struct {
+ value *T
+}
+
+// Value dereferences the handle, returning T by value (a full copy).
+func (h moveLoadHandle[T]) Value() T { return *h.value }
+
+// moveLoadS embeds a handle to the big struct, matching the shape of
+// the code in the original issue report.
+type moveLoadS struct {
+ h moveLoadHandle[moveLoadBig]
+}
+
+// moveLoadFieldViaValue reads a single field of the struct returned by
+// Value. The asmcheck patterns assert that no full struct copy is
+// emitted (no MOVUPS) and that the byte field is loaded directly with
+// a zero-extending byte load (MOVBLZX).
+func moveLoadFieldViaValue(s moveLoadS) int8 {
+ // amd64:-`MOVUPS`
+ // amd64:`MOVBLZX`
+ return s.h.Value().typ
+}
+
+// moveLoadFieldViaValueInline is the same check with Value inlined into
+// a slice-indexing caller. The i&7 mask bounds the index to [0, 7];
+// presumably this keeps the address computation simple enough for the
+// rewrite to fire — TODO confirm.
+func moveLoadFieldViaValueInline(ss []moveLoadS, i int) int8 {
+ // amd64:-`MOVUPS`
+ // amd64:`MOVBLZX`
+ return ss[i&7].h.Value().typ
+}
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
Congratulations on opening your first change. Thank you for your contribution!
Next steps:
A maintainer will review your change and provide feedback. See
https://go.dev/doc/contribute#review for more info and tips to get your
patch through code review.
Most changes in the Go project go through a few rounds of revision. This can be
surprising to people new to the project. The careful, iterative review process
is our way of helping mentor contributors and ensuring that their contributions
have a lasting impact.
During May-July and Nov-Jan the Go project is in a code freeze, during which
little code gets reviewed or merged. If a reviewer responds with a comment like
R=go1.11 or adds a tag like "wait-release", it means that this CL will be
reviewed as part of the next development cycle. See https://go.dev/s/release
for more details.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |
> cmd/compile: forward small Load through Move to avoid redundant copies (issue #77720)

Make this a separate line in the commit message body: `Fixes #77720` (or `Updates #77720`, if it isn't a complete fix).
> - Limit rewrite to `n < 1<<30` so huge moves still trigger existing

It might be better to fix those tests here. If they aren't really requiring a full copy, we should fix that.
(Maybe sum up the elements instead of returning the first one?)
> BenchmarkTypViaValue-20 ~0.46 ns/op

I'm skeptical in general of sub-ns benchmark times. They often end up measuring the benchmark driver itself, or random alignment issues.
A reasonably large copy (1KB?) should provide a much more obvious improvement.
Speaking of which, the benchmarks should be part of this CL.
> && o1 >= 0 && o1+t1.Size() <= n && n < 1<<30 && isSamePtr(p1, p2)

See my comment on the CL description.
> && CanSSA(t1)

I don't think this is necessary? The fact that we have a Load op for this type means it is already known to be SSA-able.
> && src.Block == mem.Block

This seems unnecessary. Where the source pointer is computed doesn't seem relevant to me. (We know it must be computed before the Move, and that is good enough.)
> && (disjoint(src, n, p2, n) || isInlinableMemmove(p2, src, n, config) || isLocalAddrNoAlias(p2))

All of these also seem unnecessary to me. Move is specified to behave like memmove: the result is a copy of the source, even if the source and destination overlap.
The fact that the Load takes the Move's memory value directly means no other operation (even an aliased one) can change the memory between the memory states `mem` and `move`.
> => @mem.Block (Load <t1> (OffPtr <op.Type> [o1] src) mem)

This I think might be wrong. We need to put the Load where both `src` and `mem` are available. That is `move.Block`, not `mem.Block`.
| Inspect html for hidden footers to help with email filtering. To unsubscribe visit settings. |