cmd/compile: use immediates for uint8/uint16 bitwise ops on PPC64
On PPC64, bitwise operations (AND, OR, XOR) on uint8/uint16 values with
constants such as 1<<15 materialized the constant with MOVD, often as a
negative value (e.g., MOVD $-32768). This costs extra instructions compared
to the uint32/uint64 cases, which already use the immediate forms.
This CL adds latelower rewrite rules so that uint8/uint16 AND, OR, and XOR
with constants that fit in 16 bits (unsigned or sign-extended) are lowered
to the immediate instructions (andi., ori, xori). The constant is
zero-extended to the result width (8 or 16 bits), and the rules are
restricted to unsigned types to avoid sign-extension issues.
Also adds regression tests under test/codegen to ensure that these
operations use ANDCC/OR/XOR immediates and do not materialize constants
via MOVD.
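For illustration, a minimal example of the pattern this change targets. The function and the instruction sequences below are paraphrased from the before/after listings in the review discussion, not verbatim compiler output:

```go
// Hypothetical example; not part of the CL's test file.
func orHighBit(v uint16) uint16 {
	// Before this CL (constant materialized, sign-extended):
	//     MOVD $-32768, R4
	//     OR   R4, R3, R3
	// After this CL (immediate form with the zero-extended constant):
	//     OR   $32768, R3, R3
	return v | (1 << 15)
}
```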
diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules b/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
index 15e6f72..9438485 100644
--- a/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
+++ b/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
@@ -18,8 +18,21 @@
(SETBC [1] cmp) && buildcfg.GOPPC64 <= 9 => (ISELZ [1] (MOVDconst [1]) cmp)
(SETBCR [1] cmp) && buildcfg.GOPPC64 <= 9 => (ISELZ [5] (MOVDconst [1]) cmp)
-// The upper bits of the smaller than register values is undefined. Take advantage of that.
-(AND <t> x:(MOVDconst [m]) n) && t.Size() <= 2 => (ANDconst [int64(int16(m))] n)
+// Sub-word bitwise operations on PPC64 (uint8/uint16).
+// Use immediate forms (andi., ori, xori) for 16-bit constants instead of
+// materializing constants (e.g., -32768 for 1<<15). Constants must fit in
+// 16 bits (unsigned or sign-extended). Results are zero-extended to type
+// width (8 or 16 bits). Upper bits are undefined. Commute rules handle
+// left-constant cases (e.g., (1<<15) & v).
+
+(AND <t> x:(MOVDconst [m]) n) && t.Size() == 1 && ((uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m) => (Select0 (ANDCCconst [int64(uint64(m) & 0x00FF)] n))
+(AND <t> x:(MOVDconst [m]) n) && t.Size() == 2 && ((uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m) => (Select0 (ANDCCconst [int64(uint64(m) & 0xFFFF)] n))
+(OR <t> x:(MOVDconst [m]) n) && t.Size() == 1 && ((uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m) => (ORconst [int64(uint64(m) & 0x00FF)] n)
+(OR <t> x:(MOVDconst [m]) n) && t.Size() == 2 && ((uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m) => (ORconst [int64(uint64(m) & 0xFFFF)] n)
+(XOR <t> x:(MOVDconst [m]) n) && t.Size() == 1 && ((uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m) => (XORconst [int64(uint64(m) & 0x00FF)] n)
+(XOR <t> x:(MOVDconst [m]) n) && t.Size() == 2 && ((uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m) => (XORconst [int64(uint64(m) & 0xFFFF)] n)
+
+
// Convert simple bit masks to an equivalent rldic[lr] if possible.
(AND x:(MOVDconst [m]) n) && isPPC64ValidShiftMask(m) => (RLDICL [encodePPC64RotateMask(0,m,64)] n)
diff --git a/src/cmd/compile/internal/ssa/rewritePPC64latelower.go b/src/cmd/compile/internal/ssa/rewritePPC64latelower.go
index 18c0528..5bde605 100644
--- a/src/cmd/compile/internal/ssa/rewritePPC64latelower.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64latelower.go
@@ -3,6 +3,7 @@
package ssa
import "internal/buildcfg"
+import "cmd/compile/internal/types"
func rewriteValuePPC64latelower(v *Value) bool {
switch v.Op {
@@ -16,6 +17,8 @@
return rewriteValuePPC64latelower_OpPPC64CMPconst(v)
case OpPPC64ISEL:
return rewriteValuePPC64latelower_OpPPC64ISEL(v)
+ case OpPPC64OR:
+ return rewriteValuePPC64latelower_OpPPC64OR(v)
case OpPPC64RLDICL:
return rewriteValuePPC64latelower_OpPPC64RLDICL(v)
case OpPPC64RLDICLCC:
@@ -24,6 +27,8 @@
return rewriteValuePPC64latelower_OpPPC64SETBC(v)
case OpPPC64SETBCR:
return rewriteValuePPC64latelower_OpPPC64SETBCR(v)
+ case OpPPC64XOR:
+ return rewriteValuePPC64latelower_OpPPC64XOR(v)
}
return false
}
@@ -55,9 +60,11 @@
func rewriteValuePPC64latelower_OpPPC64AND(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
+ b := v.Block
+ typ := &b.Func.Config.Types
// match: (AND <t> x:(MOVDconst [m]) n)
- // cond: t.Size() <= 2
- // result: (ANDconst [int64(int16(m))] n)
+ // cond: t.Size() == 1 && ( (uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m )
+ // result: (Select0 (ANDCCconst [int64(uint64(m) & 0x00FF)] n))
for {
t := v.Type
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
@@ -67,12 +74,38 @@
}
m := auxIntToInt64(x.AuxInt)
n := v_1
- if !(t.Size() <= 2) {
+ if !(t.Size() == 1 && ((uint64(m)&^0xFFFF) == 0 || int64(int16(m)) == m)) {
continue
}
- v.reset(OpPPC64ANDconst)
- v.AuxInt = int64ToAuxInt(int64(int16(m)))
- v.AddArg(n)
+ v.reset(OpSelect0)
+ v0 := b.NewValue0(v.Pos, OpPPC64ANDCCconst, types.NewTuple(typ.Int, types.TypeFlags))
+ v0.AuxInt = int64ToAuxInt(int64(uint64(m) & 0x00FF))
+ v0.AddArg(n)
+ v.AddArg(v0)
+ return true
+ }
+ break
+ }
+ // match: (AND <t> x:(MOVDconst [m]) n)
+ // cond: t.Size() == 2 && ( (uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m )
+ // result: (Select0 (ANDCCconst [int64(uint64(m) & 0xFFFF)] n))
+ for {
+ t := v.Type
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if x.Op != OpPPC64MOVDconst {
+ continue
+ }
+ m := auxIntToInt64(x.AuxInt)
+ n := v_1
+ if !(t.Size() == 2 && ((uint64(m)&^0xFFFF) == 0 || int64(int16(m)) == m)) {
+ continue
+ }
+ v.reset(OpSelect0)
+ v0 := b.NewValue0(v.Pos, OpPPC64ANDCCconst, types.NewTuple(typ.Int, types.TypeFlags))
+ v0.AuxInt = int64ToAuxInt(int64(uint64(m) & 0xFFFF))
+ v0.AddArg(n)
+ v.AddArg(v0)
return true
}
break
@@ -656,6 +689,55 @@
}
return false
}
+func rewriteValuePPC64latelower_OpPPC64OR(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (OR <t> x:(MOVDconst [m]) n)
+ // cond: t.Size() == 1 && ( (uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m )
+ // result: (ORconst [int64(uint64(m) & 0x00FF)] n)
+ for {
+ t := v.Type
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if x.Op != OpPPC64MOVDconst {
+ continue
+ }
+ m := auxIntToInt64(x.AuxInt)
+ n := v_1
+ if !(t.Size() == 1 && ((uint64(m)&^0xFFFF) == 0 || int64(int16(m)) == m)) {
+ continue
+ }
+ v.reset(OpPPC64ORconst)
+ v.AuxInt = int64ToAuxInt(int64(uint64(m) & 0x00FF))
+ v.AddArg(n)
+ return true
+ }
+ break
+ }
+ // match: (OR <t> x:(MOVDconst [m]) n)
+ // cond: t.Size() == 2 && ( (uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m )
+ // result: (ORconst [int64(uint64(m) & 0xFFFF)] n)
+ for {
+ t := v.Type
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if x.Op != OpPPC64MOVDconst {
+ continue
+ }
+ m := auxIntToInt64(x.AuxInt)
+ n := v_1
+ if !(t.Size() == 2 && ((uint64(m)&^0xFFFF) == 0 || int64(int16(m)) == m)) {
+ continue
+ }
+ v.reset(OpPPC64ORconst)
+ v.AuxInt = int64ToAuxInt(int64(uint64(m) & 0xFFFF))
+ v.AddArg(n)
+ return true
+ }
+ break
+ }
+ return false
+}
func rewriteValuePPC64latelower_OpPPC64RLDICL(v *Value) bool {
v_0 := v.Args[0]
// match: (RLDICL [em] x:(SRDconst [s] a))
@@ -817,6 +899,55 @@
}
return false
}
+func rewriteValuePPC64latelower_OpPPC64XOR(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (XOR <t> x:(MOVDconst [m]) n)
+ // cond: t.Size() == 1 && ( (uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m )
+ // result: (XORconst [int64(uint64(m) & 0x00FF)] n)
+ for {
+ t := v.Type
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if x.Op != OpPPC64MOVDconst {
+ continue
+ }
+ m := auxIntToInt64(x.AuxInt)
+ n := v_1
+ if !(t.Size() == 1 && ((uint64(m)&^0xFFFF) == 0 || int64(int16(m)) == m)) {
+ continue
+ }
+ v.reset(OpPPC64XORconst)
+ v.AuxInt = int64ToAuxInt(int64(uint64(m) & 0x00FF))
+ v.AddArg(n)
+ return true
+ }
+ break
+ }
+ // match: (XOR <t> x:(MOVDconst [m]) n)
+ // cond: t.Size() == 2 && ( (uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m )
+ // result: (XORconst [int64(uint64(m) & 0xFFFF)] n)
+ for {
+ t := v.Type
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ x := v_0
+ if x.Op != OpPPC64MOVDconst {
+ continue
+ }
+ m := auxIntToInt64(x.AuxInt)
+ n := v_1
+ if !(t.Size() == 2 && ((uint64(m)&^0xFFFF) == 0 || int64(int16(m)) == m)) {
+ continue
+ }
+ v.reset(OpPPC64XORconst)
+ v.AuxInt = int64ToAuxInt(int64(uint64(m) & 0xFFFF))
+ v.AddArg(n)
+ return true
+ }
+ break
+ }
+ return false
+}
func rewriteBlockPPC64latelower(b *Block) bool {
return false
}
diff --git a/test/codegen/subword_andorxor.go b/test/codegen/subword_andorxor.go
new file mode 100644
index 0000000..c3c8e8e
--- /dev/null
+++ b/test/codegen/subword_andorxor.go
@@ -0,0 +1,59 @@
+// asmcheck
+
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Codegen test for PPC64 sub-word AND/OR/XOR immediates.
+
+package codegen
+
+//go:noinline
+func U16And(v uint16) uint16 {
+ // ppc64x:-"^MOVD\t\\$-?32768"
+ // ppc64x:"^ANDCC\t\\$32768, R[0-9]+, R[0-9]+"
+ return v & (1 << 15)
+}
+
+//go:noinline
+func U16Or(v uint16) uint16 {
+ // ppc64x:-"^MOVD\t\\$-?32768"
+ // ppc64x:"^(OR|ORI)\t\\$32768, R[0-9]+, R[0-9]+"
+ return v | (1 << 15)
+}
+
+//go:noinline
+func U16Xor(v uint16) uint16 {
+ // ppc64x:-"^MOVD\t\\$-?32768"
+ // ppc64x:"^(XOR|XORI)\t\\$32768, R[0-9]+, R[0-9]+"
+ return v ^ (1 << 15)
+}
+
+//go:noinline
+func U8And(v uint8) uint8 {
+ // ppc64x:-"^MOVD\t\\$-?128"
+ // ppc64x:"^ANDCC\t\\$128, R[0-9]+, R[0-9]+"
+ return v & (1 << 7)
+}
+
+//go:noinline
+func U8Or(v uint8) uint8 {
+ // ppc64x:-"^MOVD\t\\$-?128"
+ // ppc64x:"^(OR|ORI)\t\\$128, R[0-9]+, R[0-9]+"
+ return v | (1 << 7)
+}
+
+//go:noinline
+func U8Xor(v uint8) uint8 {
+ // ppc64x:-"^MOVD\t\\$-?128"
+ // ppc64x:"^(XOR|XORI)\t\\$128, R[0-9]+, R[0-9]+"
+ return v ^ (1 << 7)
+}
+
+// --- 32-bit AND sanity (ANDCC with limm16 is still expected) ---
+
+//go:noinline
+func U32And(v uint32) uint32 {
+ // ppc64x:-"^MOVD\t\\$-?32768"
+ // ppc64x:"^ANDCC\t\\$32768, R[0-9]+, R[0-9]+"
+ return v & (1 << 15)
+}
\ No newline at end of file
> (AND <t> x:(MOVDconst [m]) n) && t.Size() == 1 && ( (uint64(m) &^ 0xFFFF) == 0 || int64(int16(m)) == m ) => (Select0 (ANDCCconst [int64(uint64(m) & 0x00FF)] n))

I think algebraic rules should be in the first lowering pass. latelower is intended for rules which require (or are substantially simplified by) running the first lower pass to completion.

As noted, smaller-than-register types are sign extended. Likewise, the upper bits of x are generally ignored. However, the lowering rules also merge explicit sign/zero extensions into more complex operations, so a bit of care is needed when testing.

Why not something like `(MOVDconst t [x] && t.size() == 2 && uint64(x) > 0xFFFF) => (MOVDconst t [x&0xFFFF])` and let the existing rules run?
// ppc64x:"^(OR|ORI)\t\\$32768, R[0-9]+, R[0-9]+"Are the immediate form instructions not consistently decoding in OR or ORI? Likewise for XOR.
> Why not something like `(MOVDconst t [x] && t.size() == 2 && uint64(x) > 0xFFFF) => (MOVDconst t [x&0xFFFF])` and let the existing rules run?
Thanks for the suggestion! I agree this is an algebraic cleanup better suited to the first lowering pass.
However, applying a blanket rule like the above could alter semantics for signed 16-bit constants (e.g., int16(-1) → 65535), since the first lowering also merges sign and zero extensions. I plan to move this logic to an earlier pass but restrict normalization to unsigned small types (uint8/uint16), ensuring correctness while still allowing the existing ANDCCconst, ORconst, and XORconst rules to pick up the folded form naturally.
The existing codegen test (subword_andorxor.go) will continue to verify that no MOVD $-32768 or $-128 sequences are emitted.
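A minimal standalone sketch of the signedness concern (plain Go, not compiler code; the constants are only illustrative):

```go
package main

import "fmt"

func main() {
	// For an unsigned small type, masking to the type width is a no-op:
	// the constant 1<<15 is already carried as 0x8000.
	u := uint16(1 << 15)
	fmt.Printf("%#x\n", uint64(u)&0xFFFF) // 0x8000

	// For a signed small type, the constant is carried sign-extended
	// (int16(-1) is the 64-bit value -1). Masking it to 16 bits would
	// reinterpret it as 0xFFFF (65535), which is why the normalization
	// is restricted to unsigned types.
	s := int16(-1)
	fmt.Printf("%#x vs %#x\n", uint64(int64(s)), uint64(int64(s))&0xFFFF)
}
```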
> Are the immediate form instructions not consistently decoding in OR or ORI? Likewise for XOR.
Yes — the issue affected all three bitwise ops (AND, OR, XOR) on uint8/uint16.
In each case, constants like 1<<15 or 1<<7 were materialized via MOVD (e.g., $-32768 or $-128) instead of using the immediate forms (andi., ori, xori).
The fix normalizes unsigned 8/16-bit constants early so all three ops now correctly use immediate forms.
Jayanth Krishnamurthy
TryBots beginning. Status page: https://farmer.golang.org/try?commit=1922ced0
Acknowledged
@jayanth.kr...@ibm.com have you run any codegen tests to see how this changes/improves the instruction counts?
> (MOVDconst <t> [x]) && t.Size() == 1 && t.IsUnsigned() && (uint64(x) &^ 0xFF) != 0 => (MOVDconst <t> [x & 0xFF])

Is there any difference to codegen if you were to further specialize the following rule instead?
```(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)])```
> @jayanth.kr...@ibm.com have you run any codegen tests to see how this changes/improves the instruction counts?
Yes — I verified codegen before and after the change using both the new targeted test and standalone disassembly.

Before (current tip, without normalization), uint8/uint16 OR/XOR emitted two instructions, e.g.

    MOVD $-32768, R4
    OR/XOR R4, R3, R3

(likewise $-128 for the 8-bit cases), and uint8/uint16 AND sometimes also materialized a MOVD constant.

After (with this CL), all three ops fold to single-instruction immediate forms:

    ANDCC $32768, R3, R3
    OR    $32768, R3, R3
    XOR   $32768, R3, R3
    ANDCC $128, R3, R3
    OR    $128, R3, R3
    XOR   $128, R3, R3

No MOVD $-32768 or $-128 appears anywhere, and the instruction count is reduced by one per affected operation.
> Is there any difference to codegen if you were to further specialize the following rule instead?
> ```(Const(64|32|16|8) [val]) => (MOVDconst [int64(val)])```
Thanks, that’s a good point — I looked into that path.
The suggested rule would move all constant forms into 64-bit space early, without regard to type width or signedness. Truncating val unconditionally could silently reinterpret constants (uint16(0x8000) stays 0x8000, but int16(-1) would become 0xFFFF), which may break signed semantics and interact with the zero/sign-extension merges later in lowering; hence the normalization is restricted to unsigned 8/16-bit types.
Marked as resolved.
This is a change I had
> (MOVDconst <t> [x]) && t.Size() == 1 && t.IsUnsigned() && (uint64(x) &^ 0xFF) != 0 => (MOVDconst <t> [x & 0xFF])
Note, this will regress codegen for ops that use an SI (signed immediate) constant, like addi and subfic: values >= 0x8000 will be split into two instructions.

I think this should be more surgical: only simplify those instructions which use a UI (unsigned immediate) constant, e.g. xori/ori/andi.
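To illustrate the concern (a sketch based on my reading of the ISA: addi/subfic take a sign-extended 16-bit SI field, while andi./ori/xori take a zero-extended UI field; the instruction sequences are approximate, not verified compiler output):

```go
// With the constant kept in its sign-extended form (-32768), an add can use
// the signed-immediate encoding directly, roughly:
//     ADD $-32768, R3, R3
// If the constant is instead canonicalized to the zero-extended +32768, it no
// longer fits a signed 16-bit immediate and would have to be materialized
// first, roughly:
//     MOVD $32768, R4
//     ADD  R4, R3, R3
func addHighBit(v uint16) uint16 {
	return v + (1 << 15)
}
```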
> // asmcheck

I'm not sure these tests belong in a new file. They could be placed in `arithmetic.go` or `bits.go`.