I am trying to get the index of the maximum element of a float<4> when compiling for the avx2-i32x8 target.
Here is my code:
// Returns the index of the largest element of the 4-wide vector InValue.
//
// InValue: reference to a uniform float<4>; it is only read.
// Returns: the lowest lane index whose value equals the maximum. If no lane
//          compares equal to the maximum (e.g. NaN inputs) the mask is 0 and
//          the result is whatever count_trailing_zeros() yields for 0, so
//          callers should treat any value >= 4 as "no maximum found".
export uniform int YetAnother(uniform float<4>& InValue)
{
    // Pre-fill every lane with element 0 before overwriting the first four.
    // On targets wider than 4 lanes the extra lanes would otherwise be
    // uninitialized and could feed garbage into reduce_max() and the compare.
    // Element 0 is a safe filler: it can never exceed the true maximum, and
    // whenever a filler lane equals the maximum, lane 0 does too, so the
    // lowest set mask bit is still an index in 0..3.
    varying float src = InValue[0];
    // Reinterpret the start of the varying register as a float<4> so the four
    // input values land in program instances 0..3.
    *((uniform float<4> *uniform)&src) = *((uniform float<4> *uniform)&InValue);
    varying float max = reduce_max(src);
    // Start from an empty mask so the value is defined even when no lane
    // matches the maximum.
    uniform int IMMask = 0;
    if(src == max)
    {
        // Inside the branch the execution mask has one bit per matching lane.
        IMMask = lanemask();
    }
    // The lowest set bit corresponds to the first lane holding the maximum.
    return count_trailing_zeros(IMMask);
}
I generate the assembly with this command:
ispc -O2 -g --target=avx2-i32x8 --emit-asm
and the resulting assembly looks like this:
# Compiler-generated listing for YetAnother (AT&T syntax). The added comments
# describe what each instruction group does; the instructions are unchanged.
.def YetAnother;
.scl 2;
.type 32;
.endef
.globl YetAnother # -- Begin function YetAnother
.p2align 4, 0x90
YetAnother: # @YetAnother
.Lfunc_begin14:
.cv_func_id 39
# %bb.0: # %allocas
#DEBUG_VALUE: YetAnother:InValue <- undef
.cv_inline_site_id 40 within 39 inlined_at 1 0 0
.cv_loc 40 2 1354 18 # stdlib.ispc:1354:18
# Load 16 bytes (four floats) from the address held in %rcx into %xmm0.
vmovaps (%rcx), %xmm0
.Ltmp256:
#DEBUG_VALUE: test <- undef
#DEBUG_VALUE: __mask <- undef
#DEBUG_VALUE: reduce_max:v <- $ymm0
# Horizontal maximum over the four floats in %xmm0:
#   max(x, x) is just a copy into %xmm1, then the two 64-bit halves are
#   swapped and maxed, then the odd elements are duplicated and maxed, which
#   leaves the overall maximum in the low element of %xmm1.
vmaxps %xmm0, %xmm0, %xmm1
vpermilpd $1, %xmm1, %xmm2 # xmm2 = xmm1[1,0]
vmaxps %xmm1, %xmm2, %xmm1
vmovshdup %xmm1, %xmm2 # xmm2 = xmm1[1,1,3,3]
vmaxss %xmm1, %xmm2, %xmm1
.Ltmp257:
#DEBUG_VALUE: result <- $xmm1
#DEBUG_VALUE: __mask <- undef
# Broadcast the scalar maximum to all eight float lanes of %ymm1.
vbroadcastss %xmm1, %ymm1
.Ltmp258:
#DEBUG_VALUE: __mask <- undef
#DEBUG_VALUE: max <- $ymm1
# Compare all eight lanes of %ymm0 with the broadcast maximum for equality.
vcmpeqps %ymm1, %ymm0, %ymm0
.Ltmp259:
# Collect the sign bit of each of the eight lanes into the low bits of %eax.
vmovmskps %ymm0, %eax
.Ltmp260:
#DEBUG_VALUE: __mask <- undef
#DEBUG_VALUE: count_trailing_zeros:v <- $eax
#DEBUG_VALUE: IMMask <- $eax
#DEBUG_VALUE: src <- undef
#DEBUG_VALUE: __mask <- undef
.cv_inline_site_id 41 within 39 inlined_at 1 196 9
.cv_loc 41 2 877 12 # stdlib.ispc:877:12
# Count trailing zero bits of the mask: the index of the lowest matching lane.
tzcntl %eax, %eax
.Ltmp261:
#DEBUG_VALUE: iflt_neg_max <- undef
#DEBUG_VALUE: __mask <- undef
.cv_loc 39 1 196 9 # simple.ispc:196:9
# Clear the upper halves of the ymm registers before returning.
vzeroupper
.Ltmp262:
retq
.Ltmp263:
.Lfunc_end14:
# -- End function
I am wondering whether this is the best way to do it.
The generated assembly uses the ymm1 register.
How can I make vcmpeqps and vmovmskps operate on xmm registers only?
I don't want to change the target to avx2-i32x4;
I want to keep TARGET_WIDTH = 8 so that the other functions in my program stay efficient.