On Thursday, November 17, 2016 at 6:18:53 PM UTC+2,
already...@yahoo.com wrote:
> On Thursday, November 17, 2016 at 1:59:36 AM UTC+2,
already...@yahoo.com wrote:
> >
> > One possibility is a combination of contention over execution port 1 (5 instructions per iteration) with the minDist_v latency chain:
> > __m256d le_v = _mm256_cmp_pd(dist_v, minDist_v, _CMP_LE_OQ);
> > minDist_v = _mm256_blendv_pd(minDist_v, dist_v, le_v);
> > Theoretically, a latency chain is only 4 clocks long, but with contention it could become worse.
>
> Turns out, I am on the right track, but intervention of compiler further complicates things.
>
> 'gcc -O3 -mavx -march=ivybridge' reorders instructions in the inner loop in such way that _mm256_blendv_pd(minDist_vh, minDist_v, le_v) appears after
> minK_v = _mm256_blendv_pd(minK_v, k_v, le_v);
> k_v = _mm256_add_pd(k_v, kInc_v);
>
> That makes the dependency-chain problem on Haswell much harder. Strangely, it only hurts Haswell. On IvyB it is actually a little faster than the order in which I coded it.
>
> 'gcc -O1 -mavx -march=ivybridge' does more or less what I told it to do.
> As I said above, on IvyB it makes things a little slower (~4M clocks slower), but on HSWL it runs more than a little faster (~20M clocks faster).
>
>
> >
> > Replacing the second mul and add with FMA will ease up the port 1 contention, because FMA can be executed on port 0, but my IvyB has no FMA, so I can't test it.
> >
>
> With -O3 it makes no difference.
> With -O1 it improves performance just a little over the original with '-O1 -march=ivybridge'. Approximately 1M clocks - quite disappointing, less than 0.1 clocks per iteration.
>
>
> > Another opportunity in the spirit of "work harder not smarter" is shortening a latency chain by mean of replacing "easy" single-clock
> > minDist_v = _mm256_blendv_pd(minDist_v, dist_v, le_v);
> > with "harder" 3-clock
> > minDist_v = _mm256_min_pd(minDist_v, dist_v);
> >
> > On my IvyB it gives a nice speed up. The resulting branchless version is nearly equal in speed to your best "branchy".
> >
>
> On Haswell it also gives a nice speedup over original '-O3'. Approximately 13M clocks. Which still leaves it 7M clocks behind '-O1'.
>
> But when we do both _mm256_min_pd() and _mm256_fmadd_pd() then things suddenly improve: ~25M clocks faster than the original -O3 and only 5M clocks slower than Anton's branchy variant.
>
The next round of tweaks started with a question: if we are already using FMA(3), then why can't we use AVX2 as well? AFAIK, so far all processors that support FMA3 also support AVX2.
So, Hello, AVX2! That's a first time I am coding for you!
The only change over the previous version is that k_v update now uses integer arithmetic. One more instruction off the back of port 1.
And it helps to shave 11M clocks from the execution time! Enough for a branchless variant to become faster than the branchy.
So, so far, our winner on Haswell looks like that:
// tourx and toury aligned on 32-byte boundary
// Nearest-neighbour TSP tour construction (selection on squared Euclidean
// distance), with the inner candidate scan vectorized for AVX2 + FMA3.
//
// cities[]        : input points; cities[ncities-1] seeds the tour.
// tourx[]/toury[] : output tour coordinates in SoA layout; the prose above
//                   states both must be 32-byte aligned (required by the
//                   aligned __m256d loads below).
// ncities         : number of points.
//
// For each tour position i, the remaining cities tourx/toury[i..ncities)
// are scanned for the one closest to the city placed at i-1, and that city
// is swapped into position i.
//
// Relies on sqr() and swap() helpers and a 'point' struct with .x/.y
// members defined elsewhere in the file (not visible in this chunk).
void tsp(point cities[], double tourx[], double toury[], int ncities)
{
int i,kk;
// Seed the tour with the last city, then copy the remaining cities into
// the working SoA arrays shifted up by one slot.
tourx[0] = cities[ncities-1].x;
toury[0] = cities[ncities-1].y;
for (i=1; i<ncities; i++) {
tourx[i]=cities[i-1].x;
toury[i]=cities[i-1].y;
}
for (i=1; i<ncities; i++) {
double ThisX = tourx[i-1];
double ThisY = toury[i-1];
double CloseDist = DBL_MAX; // best (smallest) squared distance so far
int min_k = i;              // index of the best candidate so far
int k = i;                  // first candidate not yet examined
// Vectorize only when enough candidates remain: the prologue eats at
// most 3, leaving >= 5, so the main loop runs at least one iteration.
if (ncities - k >= 8) {
// scalar prologue
// Advance k to the next multiple of 4 so that &tourx[k]/&toury[k]
// are 32-byte aligned for the vector loads below (given the arrays'
// stated 32-byte base alignment).
int nlast = (k + 3) & (-4);
for (; k < nlast; ++k) {
double ThisDist = sqr(tourx[k]-ThisX)+sqr(toury[k]-ThisY);
if (ThisDist <= CloseDist) {
CloseDist = ThisDist;
min_k = k;
}
}
// main loop
// Processes 4 candidates per iteration, tracking 4 independent
// running minima (and their indices) that are folded together after
// the loop.
int ni = (ncities - k) / 4;
__m256d thisx_v = _mm256_set1_pd(ThisX);
__m256d thisy_v = _mm256_set1_pd(ThisY);
__m256d minDist_v = _mm256_set1_pd(CloseDist);
__m256d minK_v = _mm256_castsi256_pd(_mm256_set1_epi64x(min_k)); // __m256i really, just defined as __m256d to reduce # of casts
__m256i k_v = _mm256_setr_epi64x((k+0), (k+1), (k+2), (k+3)); // per-lane candidate indices
__m256i kInc_v = _mm256_set1_epi64x(4);
// NOTE(review): casting double* to __m256d* and dereferencing
// violates strict aliasing in ISO C; works in practice with gcc
// vector intrinsics but _mm256_load_pd would be the clean form.
const __m256d* tourx_v = (const __m256d*)&tourx[k];
const __m256d* toury_v = (const __m256d*)&toury[k];
k += ni*4; // account for the elements the vector loop will consume
for (kk = 0; kk < ni; ++kk) {
__m256d dx_v = _mm256_sub_pd(tourx_v[kk], thisx_v);
__m256d dy_v = _mm256_sub_pd(toury_v[kk], thisy_v);
// Prefetch 10 vectors (320 bytes) ahead; may point past the end
// of the arrays, which is harmless since prefetch never faults.
// NOTE(review): hint argument 0 is _MM_HINT_NTA in gcc's enum —
// confirm a non-temporal hint (rather than _MM_HINT_T0) is intended.
_mm_prefetch(&tourx_v[kk+10], 0);
_mm_prefetch(&toury_v[kk+10], 0);
// dist = dx*dx + dy*dy via FMA: frees port 1 (FMA can issue on
// port 0), per the discussion above.
__m256d dist_v = _mm256_fmadd_pd(dy_v, dy_v, _mm256_mul_pd(dx_v, dx_v));
// <= (like the scalar code) so a later equal-distance candidate
// replaces an earlier one within each lane.
__m256d le_v = _mm256_cmp_pd(dist_v, minDist_v, _CMP_LE_OQ);
// min_pd instead of blendv for the distance: 3-clock latency but
// breaks the blendv dependency chain (see discussion above).
minDist_v = _mm256_min_pd(minDist_v, dist_v);
minK_v = _mm256_blendv_pd(minK_v, _mm256_castsi256_pd(k_v), le_v);
// Integer index update — the AVX2 change: moves this add off the
// FP ports (one instruction off the back of port 1).
k_v = _mm256_add_epi64(k_v, kInc_v);
}
// fold
// Horizontal reduction of the 4 per-lane minima: first across the
// two 128-bit halves, then across the two lanes within a half.
// Lane 0 ends up holding the overall minimum and its index.
__m256d minDist_vh = _mm256_permute2f128_pd(minDist_v, minDist_v, 1);
__m256d minK_vh = _mm256_permute2f128_pd(minK_v, minK_v, 1);
__m256d le_v = _mm256_cmp_pd(minDist_v, minDist_vh, _CMP_LE_OQ);
minDist_v = _mm256_blendv_pd(minDist_vh, minDist_v, le_v);
minK_v = _mm256_blendv_pd(minK_vh, minK_v, le_v);
minDist_vh = _mm256_permute_pd(minDist_v, 1);
minK_vh = _mm256_permute_pd(minK_v, 1);
le_v = _mm256_cmp_pd(minDist_v, minDist_vh, _CMP_LE_OQ);
minDist_v = _mm256_blendv_pd(minDist_vh, minDist_v, le_v);
minK_v = _mm256_blendv_pd(minK_vh, minK_v, le_v);
_mm_store_sd(&CloseDist, _mm256_castpd256_pd128(minDist_v));
// Extract the 64-bit integer index from lane 0 of the (nominally
// double) minK_v register; memcpy avoids an aliasing-unsafe cast.
uint64_t min_kd;
memcpy(&min_kd, &minK_v, sizeof(uint64_t));
min_k = (int)min_kd;
}
// scalar epilogue
// Handles the < 4 leftover candidates (or everything, when the
// vector path was skipped entirely).
for (; k < ncities; ++k) {
double ThisDist = sqr(tourx[k]-ThisX)+sqr(toury[k]-ThisY);
if (ThisDist <= CloseDist) {
CloseDist = ThisDist;
min_k = k;
}
}
// Move the chosen nearest city into tour position i.
swap(&tourx[i],&tourx[min_k]);
swap(&toury[i],&toury[min_k]);
}
}
For reference: the best branchless variant for IvyB (on IvyB it runs at about the same speed as the branchy one, but it sucks on Hswl).
// tourx and toury aligned on 32-byte boundary
void tsp(point cities[], double tourx[], double toury[], int ncities)
{
tourx[0] = cities[ncities-1].x;
toury[0] = cities[ncities-1].y;
for (int i=1; i<ncities; i++) {
tourx[i]=cities[i-1].x;
toury[i]=cities[i-1].y;
}
for (int i=1; i<ncities; i++) {
double ThisX = tourx[i-1];
double ThisY = toury[i-1];
double CloseDist = DBL_MAX;
int min_k = i;
int k = i;
if (ncities - k >= 8) {
// scalar prologue
int nlast = (k + 3) & (-4);
for (; k < nlast; ++k) {
double ThisDist = sqr(tourx[k]-ThisX)+sqr(toury[k]-ThisY);
if (ThisDist <= CloseDist) {
CloseDist = ThisDist;
min_k = k;
}
}
// main loop
int ni = (ncities - k) / 4;
__m256d thisx_v = _mm256_set1_pd(ThisX);
__m256d thisy_v = _mm256_set1_pd(ThisY);
__m256d minDist_v = _mm256_set1_pd(CloseDist);
__m256d minK_v = _mm256_set1_pd((double)min_k);
__m256d k_v = _mm256_setr_pd((double)(k+0), (double)(k+1), (double)(k+2), (double)(k+3));
__m256d kInc_v = _mm256_set1_pd((double)4);
const __m256d* tourx_v = (const __m256d*)&tourx[k];
const __m256d* toury_v = (const __m256d*)&toury[k];
k += ni*4;
for (int kk = 0; kk < ni; ++kk) {
__m256d dx_v = _mm256_sub_pd(tourx_v[kk], thisx_v);
__m256d dy_v = _mm256_sub_pd(toury_v[kk], thisy_v);
_mm_prefetch(&tourx_v[kk+10], 0);
_mm_prefetch(&toury_v[kk+10], 0);
__m256d dist_v = _mm256_add_pd(_mm256_mul_pd(dx_v, dx_v),_mm256_mul_pd(dy_v, dy_v));
__m256d le_v = _mm256_cmp_pd(dist_v, minDist_v, _CMP_LE_OQ);
minDist_v = _mm256_min_pd(minDist_v, dist_v);
minK_v = _mm256_blendv_pd(minK_v, k_v, le_v);
k_v = _mm256_add_pd(k_v, kInc_v);
}
// fold
__m256d minDist_vh = _mm256_permute2f128_pd(minDist_v, minDist_v, 1);
__m256d minK_vh = _mm256_permute2f128_pd(minK_v, minK_v, 1);
__m256d le_v = _mm256_cmp_pd(minDist_v, minDist_vh, _CMP_LE_OQ);
minDist_v = _mm256_blendv_pd(minDist_vh, minDist_v, le_v);
minK_v = _mm256_blendv_pd(minK_vh, minK_v, le_v);
minDist_vh = _mm256_permute_pd(minDist_v, 1);
minK_vh = _mm256_permute_pd(minK_v, 1);
le_v = _mm256_cmp_pd(minDist_v, minDist_vh, _CMP_LE_OQ);
minDist_v = _mm256_blendv_pd(minDist_vh, minDist_v, le_v);
minK_v = _mm256_blendv_pd(minK_vh, minK_v, le_v);
_mm_store_sd(&CloseDist, _mm256_castpd256_pd128(minDist_v));
double min_kd;
_mm_store_sd(&min_kd, _mm256_castpd256_pd128(minK_v));
min_k = (int)min_kd;
}
// scalar epilogue