$ g++ -mavx512f -mfma -fabi-version=0 -fopenmp -O3 main.cpp ~/SAFE/benchmark/build/src/libbenchmark.a -I ~/SAFE/benchmark/include/ -o main
$ for ((i=1;i<=10;i++)); do export OMP_NUM_THREADS=$i; likwid-perfctr -g FLOPS_DP -C 0-9 ./main 2>&1 | grep "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE" | grep -v "STAT"; done
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 640064 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 640064 | 640064 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 640064 | 640064 | 640064 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 640064 | 640064 | 640064 | 640064 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 640064 | 640064 | 640064 | 640064 | 640064 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 | 640064 |
$ g++-7 -mavx512f -mfma -fabi-version=0 -fopenmp -O3 main.cpp -lbenchmark -o main
$ for ((i=1;i<=10;i++)); do export OMP_NUM_THREADS=$i; likwid-perfctr -g FLOPS_DP -f -C 0-9 ./main 2>&1 | grep "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE" | grep -v "STAT"; done
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 51200000 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 51200000 | 51200000 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 51200000 | 51200000 | 51200000 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 |
$ for ((i=1;i<=10;i++)); do export OMP_NUM_THREADS=$i; likwid-perfctr -g FLOPS_DP -m -C 0-9 ./main 2>&1 | grep "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE" | grep -v "STAT"; done
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 | 0 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 | 0 | 0 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 | 0 | 0 | 0 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
- vectorclass is a wrapper for SIMD types by Agner Fog (https://www.agner.org/optimize/#vectorclass)
likwid-perfctr -- Version 4.3.4 (commit: 233ab943543480cd46058b34616c174198ba0459)
$ likwid-perfctr -g FLOPS_DP -m -f -C 0-5 likwid-bench -t stream_avx512 -w M0:400KB:2 | grep "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE" | grep -v 'STAT'
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 |
I'm using
likwid-perfctr -- Version 4.3.4 (commit: 233ab943543480cd46058b34616c174198ba0459), but I also tried version 4.3.3 with the same results.
Also, I've noticed that this problem is not confined to this one program; it happens for any multithreaded program, even the likwid benchmarks. In the case of the likwid benchmarks it is even a bit worse: if the list of processors I pin the benchmark to is larger than the number of threads specified in the work group, no AVX512 events are counted, e.g.
$ likwid-perfctr -g FLOPS_DP -m -f -C 0-5 likwid-bench -t stream_avx512 -w M0:400KB:2 | grep "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE" | grep -v 'STAT'
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 |
$ likwid-perfctr -g FLOPS_DP -m -f -C 0-5 likwid-bench -t stream_avx512 -w M0:400KB:2 | grep "FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE" | grep -v 'STAT'
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 545259500 | 545259500 |
Again, this behaviour does not appear for AVX2 events.
SETUP_PMC [cpu=0] Register 0x189 , Flags: 0x4140C7
SETUP_PMC [cpu=1] Register 0x189 , Flags: 0x4140C7
UNFREEZE_PMC_AND_FIXED [cpu=0] Register 0x38F , Flags: 0x70000000F
UNFREEZE_PMC_AND_FIXED [cpu=1] Register 0x38F , Flags: 0x70000000F
READ_PMC [cpu=0] Register 0xC4 , Flags: 0x0
READ_PMC [cpu=1] Register 0xC4 , Flags: 0x2625A000
$ likwid-perfctr -g ALL_AVX512 -m -C 0-5 main | grep '512B' | grep -v 'STAT'
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC0 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC1 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC2 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 | 51200000 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 0 | 0 | 0 | 0 | 0 | 0 |
Can you say if that is a software or a hardware fault?
SKX90:
Performance Monitoring General Purpose Counter 3 May Contain Unexpected Values
Problem: When Restricted Transactional Memory (RTM) is supported (CPUID.07H.EBX.RTM [bit 11] = 1) and when TSX_FORCE_ABORT=0, Performance Monitor Unit (PMU) general purpose counter 3 (IA32_PMC3, MSR C4H and IA32_A_PMC3, MSR 4C4H) may contain unexpected values. Further, IA32_PERFEVTSEL3 (MSR 189H) may also contain unexpected configuration values.
Implication: Due to this erratum, software that uses PMU general purpose counter 3 may read an unexpected count and configuration.
Workaround: Software can avoid this erratum by writing 1 to bit 0 of TSX_FORCE_ABORT (MSR 10FH) which will cause all Restricted Transactional Memory (RTM) transactions to abort with EAX code 0. TSX_FORCE_ABORT MSR is available when CPUID.07H.EDX [bit 13]=1.
Status: No fix.
TSX_FORCE_ABORT
register value is zero. I'm not sure whether LIKWID should abort all RTM transactions.