Yesterday I needed the following unsupported SSE instructions to speed up a hotspot (they are the ones hand-encoded as macros in the example below).
I was able to get them to work using the BYTE directive, but it is cumbersome, and the resulting code isn't as clear as it could be. Experimentation is also hindered, because each define is 'cast in stone' for one fixed combination of registers and immediate. Maybe I am doing something wrong? Here's an example of my code:
// Hand-encoded SSE instructions, emitted one byte at a time with the BYTE
// directive. Each macro is hard-wired to the operands spelled out in its
// name, written source first, destination last (e.g. PADDD_X1_X0 is
// X0 += X1). For the register forms the last opcode byte is the ModRM byte,
// laid out as mod(2 bits):reg(3 bits):r/m(3 bits).
// NOTE: some of the macros below end in ';' and some do not, so invoke each
// macro on a line of its own.
//
// LDOU aka LDDQU - 11110010:00001111:11110000: modA xmmreg r/m
// (F2 0F F0 /r; ModRM 0x00 = load (AX) into X0, 0x0B = load (BX) into X1)
#define LDOU_AX_X0 BYTE $0xF2; BYTE $0x0F; BYTE $0xF0; BYTE $0x00;
#define LDOU_BX_X1 BYTE $0xF2; BYTE $0x0F; BYTE $0xF0; BYTE $0x0B;
// PSLLO aka PSLLDQ - 01100110:00001111:01110011:11111 xmmreg: imm8
// (66 0F 73 /7 ib; whole-register left shift by imm8 BYTES, here 7)
#define PSLLO_X0_7 BYTE $0x66; BYTE $0x0F; BYTE $0x73; BYTE $0xF8; BYTE $0x07;
#define PSLLO_X1_7 BYTE $0x66; BYTE $0x0F; BYTE $0x73; BYTE $0xF9; BYTE $0x07;
// PSRLO aka PSRLDQ - 01100110:00001111:01110011:11011 xmmreg: imm8
// (66 0F 73 /3 ib; whole-register right shift of X3 by imm8 BYTES, 7 or 4)
#define PSRLO_X3_7 BYTE $0x66; BYTE $0x0F; BYTE $0x73; BYTE $0xDB; BYTE $0x07;
#define PSRLO_X3_4 BYTE $0x66; BYTE $0x0F; BYTE $0x73; BYTE $0xDB; BYTE $0x04;
// PMOVZXBD - 01100110:00001111:00111000:00110001:11 xmmreg1 xmmreg2
// (66 0F 38 31 /r; zero-extends the low 4 bytes of the source (X3) into the
// 4 dwords of the destination; ModRM reg field = destination, r/m = source)
#define PMOVZXBD_X3_X0 BYTE $0x66; BYTE $0x0F; BYTE $0x38; BYTE $0x31; BYTE $0xC3;
#define PMOVZXBD_X3_X1 BYTE $0x66; BYTE $0x0F; BYTE $0x38; BYTE $0x31; BYTE $0xCB;
#define PMOVZXBD_X3_X2 BYTE $0x66; BYTE $0x0F; BYTE $0x38; BYTE $0x31; BYTE $0xD3;
// PMULLD - 01100110:00001111:00111000:01000000:11 xmmreg1 xmmreg2
// (66 0F 38 40 /r; per-dword multiply keeping the low 32 bits; source and
// destination are the same register here, so each lane is squared)
#define PMULLD_X0_X0 BYTE $0x66; BYTE $0x0F; BYTE $0x38; BYTE $0x40; BYTE $0xC0
#define PMULLD_X1_X1 BYTE $0x66; BYTE $0x0F; BYTE $0x38; BYTE $0x40; BYTE $0xC9
#define PMULLD_X2_X2 BYTE $0x66; BYTE $0x0F; BYTE $0x38; BYTE $0x40; BYTE $0xD2
// PHADDD - 01100110:00001111:00111000:00000010:11 xmmreg1 xmmreg2
// (66 0F 38 02 /r; horizontal add of adjacent dword pairs; applied twice
// with source == destination it leaves the sum of all 4 lanes in every lane)
#define PHADDD_X0_X0 BYTE $0x66; BYTE $0x0F; BYTE $0x38; BYTE $0x02; BYTE $0xC0
#define PHADDD_X1_X1 BYTE $0x66; BYTE $0x0F; BYTE $0x38; BYTE $0x02; BYTE $0xC9
// PADDD - 66 0F FE /r
// (per-dword add; ModRM 0xC1 = X0 += X1, 0xC2 = X0 += X2)
#define PADDD_X1_X0 BYTE $0x66; BYTE $0x0F; BYTE $0xFE; BYTE $0xC1
#define PADDD_X2_X0 BYTE $0x66; BYTE $0x0F; BYTE $0xFE; BYTE $0xC2
// CVTDQ2PS - 00001111:01011011:11 xmmreg1 xmmreg2
// (0F 5B /r; converts the 4 int32 lanes of X0 to float32 in place)
#define CVTDQ2PS_X0_X0 BYTE $0x0F; BYTE $0x5B; BYTE $0xC0;
//// Env3x3Gray_distance ///////////////////////////////////////////////////////
// func Env3x3Gray_distance(env0, env1 *Env3x3Gray) float32
//
// Returns the Euclidean distance between the first 9 bytes of *env0 and
// *env1, scaled by 1/sqrt(9 * 255*255):
//
//     sqrt(sum_{i=0..8} (env0[i] - env1[i])^2) * 0.001307189542484
//
// NOTE(review): assumes an Env3x3Gray holds 9 uint8 samples (3x3 gray
// values), as implied by the name and the normalization factor; confirm
// against the Go declaration.
//
// In:      env0+0(FP), env1+8(FP)  pointers to the two environments
// Out:     ret+16(FP)              normalized distance (float32)
// Scratch: AX, BX, CX, DX, X0-X3
//
// Only 9 bytes are read from each pointer (8 via MOVQ plus 1 via MOVBLZX),
// so the loads never reach past the 9 samples and no byte shifts are needed
// to discard garbage lanes.
TEXT ·Env3x3Gray_distance(SB),NOSPLIT,$0-24
	MOVQ env0+0(FP), AX // AX = env0
	MOVQ env1+8(FP), BX // BX = env1
	// load exactly 9 bytes per env into bytes [0..8]; bytes [9..15] are zero
	MOVQ (AX), X0 // env0 bytes 0..7, upper 8 bytes of X0 cleared
	MOVBLZX 8(AX), CX // env0 byte 8, zero-extended
	PINSRW $4, CX, X0 // word 4 = bytes 8..9 of X0 (byte 9 stays 0)
	MOVQ (BX), X1 // env1 bytes 0..7, upper 8 bytes of X1 cleared
	MOVBLZX 8(BX), DX // env1 byte 8, zero-extended
	PINSRW $4, DX, X1 // word 4 = bytes 8..9 of X1 (byte 9 stays 0)
	// get pairwise maxima and minima (unsigned bytes)
	MOVOA X1, X2
	PMINUB X0, X2 // minima in X2
	MOVOA X1, X3
	PMAXUB X0, X3 // maxima in X3
	// compute deltas: |a - b| = max(a, b) - min(a, b), cannot underflow
	PSUBB X2, X3 // deltas in X3 bytes [0..8], bytes [9..15] are zero
	// widen deltas (uint8 -> int32), 4 elements per operation
	PMOVZXBD_X3_X0 // X0 = deltas 0..3
	PSRLO_X3_4 // shift right 4 bytes
	PMOVZXBD_X3_X1 // X1 = deltas 4..7
	PSRLO_X3_4 // shift right 4 bytes
	PMOVZXBD_X3_X2 // X2 = delta 8 in lane 0, lanes 1..3 are zero
	// square deltas (each at most 255*255, fits easily in int32)
	PMULLD_X0_X0
	PMULLD_X1_X1
	PMULLD_X2_X2
	// add the three vectors lane by lane first, so that a single horizontal
	// reduction (two PHADDD) yields the total in every lane of X0
	PADDD_X1_X0
	PADDD_X2_X0
	PHADDD_X0_X0
	PHADDD_X0_X0
	// final floating point processing (only lane 0 is used from here on)
	CVTDQ2PS_X0_X0 // int32 -> float32
	SQRTSS X0, X0 // square root
	// return normalized distance
	MOVSS $0.001307189542484, X1 // factor: 1.0/sqrt(9 * 255*255)
	MULSS X1, X0
	MOVSS X0, ret+16(FP)
	RET