After reading https://llvm.org/docs/Vectorizers.html#vectorization-of-function-calls
I decided to write the following C++ program:
#include <cmath>
using v4f32 = float __attribute__((__vector_size__(16)));
// Returns a vector whose lane i holds std::sin(x[i]) for i = 0..3.
// Every lane is spelled out explicitly (no loop); the lanes are evaluated in
// order 0, 1, 2, 3.
v4f32 fct1(v4f32 x)
{
    const v4f32 result = {
        std::sin(x[0]),
        std::sin(x[1]),
        std::sin(x[2]),
        std::sin(x[3]),
    };
    return result;
}
// Returns a vector whose lane i holds std::sin(x[i]) for i = 0..3.
// Same computation as a per-lane sequence of assignments, but expressed as a
// counted loop over the four lanes.
v4f32 fct2(v4f32 x)
{
    constexpr int kLaneCount = 4;

    v4f32 result;
    for (int lane = 0; lane != kLaneCount; ++lane) {
        result[lane] = std::sin(x[lane]);
    }
    return result;
}
// Replaces each of the 16 floats starting at x with its sine, in place.
//
// x must point to at least 16 readable and writable floats; elements past
// index 15 are not touched.
void fct3(float *x)
{
    // Explicitly ask clang's loop vectorizer to vectorize this loop.
    // Compilers that do not know the pragma ignore it.
#pragma clang loop vectorize(enable)
    for (int i = 0; i < 16; ++i) {
        // std::sin picks the float overload here. Unlike the unqualified
        // `sinf`, it is guaranteed to be declared by <cmath>, and it matches
        // the calls used in fct1/fct2.
        x[i] = std::sin(x[i]);
    }
}
Which I compiled with: clang++ -O3 -march=native -mtune=native -c -o
vec.o vec.cc -lmvec -fno-math-errno
And here is what I get:
vec.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z4fct1Dv4_f>:
0: 48 83 ec 48 sub $0x48,%rsp
4: c5 f8 29 04 24 vmovaps %xmm0,(%rsp)
9: e8 00 00 00 00 callq e <_Z4fct1Dv4_f+0xe>
e: c5 f8 29 44 24 30 vmovaps %xmm0,0x30(%rsp)
14: c5 fa 16 04 24 vmovshdup (%rsp),%xmm0
19: e8 00 00 00 00 callq 1e <_Z4fct1Dv4_f+0x1e>
1e: c5 f8 29 44 24 20 vmovaps %xmm0,0x20(%rsp)
24: c4 e3 79 05 04 24 01 vpermilpd $0x1,(%rsp),%xmm0
2b: e8 00 00 00 00 callq 30 <_Z4fct1Dv4_f+0x30>
30: c5 f9 29 44 24 10 vmovapd %xmm0,0x10(%rsp)
36: c4 e3 79 04 04 24 e7 vpermilps $0xe7,(%rsp),%xmm0
3d: e8 00 00 00 00 callq 42 <_Z4fct1Dv4_f+0x42>
42: c5 f8 28 4c 24 30 vmovaps 0x30(%rsp),%xmm1
48: c4 e3 71 21 4c 24 20 vinsertps $0x10,0x20(%rsp),%xmm1,%xmm1
4f: 10
50: c4 e3 71 21 4c 24 10 vinsertps $0x20,0x10(%rsp),%xmm1,%xmm1
57: 20
58: c4 e3 71 21 c0 30 vinsertps $0x30,%xmm0,%xmm1,%xmm0
5e: 48 83 c4 48 add $0x48,%rsp
62: c3 retq
63: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
6a: 00 00 00
6d: 0f 1f 00 nopl (%rax)
0000000000000070 <_Z4fct2Dv4_f>:
70: 48 83 ec 48 sub $0x48,%rsp
74: c5 f8 29 04 24 vmovaps %xmm0,(%rsp)
79: e8 00 00 00 00 callq 7e <_Z4fct2Dv4_f+0xe>
7e: c5 f8 29 44 24 30 vmovaps %xmm0,0x30(%rsp)
84: c5 fa 16 04 24 vmovshdup (%rsp),%xmm0
89: e8 00 00 00 00 callq 8e <_Z4fct2Dv4_f+0x1e>
8e: c5 f8 29 44 24 20 vmovaps %xmm0,0x20(%rsp)
94: c4 e3 79 05 04 24 01 vpermilpd $0x1,(%rsp),%xmm0
9b: e8 00 00 00 00 callq a0 <_Z4fct2Dv4_f+0x30>
a0: c5 f9 29 44 24 10 vmovapd %xmm0,0x10(%rsp)
a6: c4 e3 79 04 04 24 e7 vpermilps $0xe7,(%rsp),%xmm0
ad: e8 00 00 00 00 callq b2 <_Z4fct2Dv4_f+0x42>
b2: c5 f8 28 4c 24 30 vmovaps 0x30(%rsp),%xmm1
b8: c4 e3 71 21 4c 24 20 vinsertps $0x10,0x20(%rsp),%xmm1,%xmm1
bf: 10
c0: c4 e3 71 21 4c 24 10 vinsertps $0x20,0x10(%rsp),%xmm1,%xmm1
c7: 20
c8: c4 e3 71 21 c0 30 vinsertps $0x30,%xmm0,%xmm1,%xmm0
ce: 48 83 c4 48 add $0x48,%rsp
d2: c3 retq
d3: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
da: 00 00 00
dd: 0f 1f 00 nopl (%rax)
00000000000000e0 <_Z4fct3Pf>:
e0: 53 push %rbx
e1: 48 83 ec 10 sub $0x10,%rsp
e5: 48 89 fb mov %rdi,%rbx
e8: c5 fa 10 07 vmovss (%rdi),%xmm0
ec: c5 fa 10 4f 04 vmovss 0x4(%rdi),%xmm1
f1: c5 fa 11 4c 24 0c vmovss %xmm1,0xc(%rsp)
f7: e8 00 00 00 00 callq fc <_Z4fct3Pf+0x1c>
fc: c5 fa 11 03 vmovss %xmm0,(%rbx)
100: c5 fa 10 44 24 0c vmovss 0xc(%rsp),%xmm0
106: e8 00 00 00 00 callq 10b <_Z4fct3Pf+0x2b>
10b: c5 fa 11 43 04 vmovss %xmm0,0x4(%rbx)
110: c5 fa 10 43 08 vmovss 0x8(%rbx),%xmm0
115: e8 00 00 00 00 callq 11a <_Z4fct3Pf+0x3a>
11a: c5 fa 11 43 08 vmovss %xmm0,0x8(%rbx)
11f: c5 fa 10 43 0c vmovss 0xc(%rbx),%xmm0
124: e8 00 00 00 00 callq 129 <_Z4fct3Pf+0x49>
129: c5 fa 11 43 0c vmovss %xmm0,0xc(%rbx)
12e: c5 fa 10 43 10 vmovss 0x10(%rbx),%xmm0
133: e8 00 00 00 00 callq 138 <_Z4fct3Pf+0x58>
138: c5 fa 11 43 10 vmovss %xmm0,0x10(%rbx)
13d: c5 fa 10 43 14 vmovss 0x14(%rbx),%xmm0
142: e8 00 00 00 00 callq 147 <_Z4fct3Pf+0x67>
147: c5 fa 11 43 14 vmovss %xmm0,0x14(%rbx)
14c: c5 fa 10 43 18 vmovss 0x18(%rbx),%xmm0
151: e8 00 00 00 00 callq 156 <_Z4fct3Pf+0x76>
156: c5 fa 11 43 18 vmovss %xmm0,0x18(%rbx)
15b: c5 fa 10 43 1c vmovss 0x1c(%rbx),%xmm0
160: e8 00 00 00 00 callq 165 <_Z4fct3Pf+0x85>
165: c5 fa 11 43 1c vmovss %xmm0,0x1c(%rbx)
16a: c5 fa 10 43 20 vmovss 0x20(%rbx),%xmm0
16f: e8 00 00 00 00 callq 174 <_Z4fct3Pf+0x94>
174: c5 fa 11 43 20 vmovss %xmm0,0x20(%rbx)
179: c5 fa 10 43 24 vmovss 0x24(%rbx),%xmm0
17e: e8 00 00 00 00 callq 183 <_Z4fct3Pf+0xa3>
183: c5 fa 11 43 24 vmovss %xmm0,0x24(%rbx)
188: c5 fa 10 43 28 vmovss 0x28(%rbx),%xmm0
18d: e8 00 00 00 00 callq 192 <_Z4fct3Pf+0xb2>
192: c5 fa 11 43 28 vmovss %xmm0,0x28(%rbx)
197: c5 fa 10 43 2c vmovss 0x2c(%rbx),%xmm0
19c: e8 00 00 00 00 callq 1a1 <_Z4fct3Pf+0xc1>
1a1: c5 fa 11 43 2c vmovss %xmm0,0x2c(%rbx)
1a6: c5 fa 10 43 30 vmovss 0x30(%rbx),%xmm0
1ab: e8 00 00 00 00 callq 1b0 <_Z4fct3Pf+0xd0>
1b0: c5 fa 11 43 30 vmovss %xmm0,0x30(%rbx)
1b5: c5 fa 10 43 34 vmovss 0x34(%rbx),%xmm0
1ba: e8 00 00 00 00 callq 1bf <_Z4fct3Pf+0xdf>
1bf: c5 fa 11 43 34 vmovss %xmm0,0x34(%rbx)
1c4: c5 fa 10 43 38 vmovss 0x38(%rbx),%xmm0
1c9: e8 00 00 00 00 callq 1ce <_Z4fct3Pf+0xee>
1ce: c5 fa 11 43 38 vmovss %xmm0,0x38(%rbx)
1d3: c5 fa 10 43 3c vmovss 0x3c(%rbx),%xmm0
1d8: e8 00 00 00 00 callq 1dd <_Z4fct3Pf+0xfd>
1dd: c5 fa 11 43 3c vmovss %xmm0,0x3c(%rbx)
1e2: 48 83 c4 10 add $0x10,%rsp
1e6: 5b pop %rbx
1e7: c3 retq
As you can see there is no call to a vectorized version of sin.
Did I do something wrong?
By the way I am on Linux with glibc 2.32 which has libmvec.
Regards,
--
Alexandre Bique
_______________________________________________
LLVM Developers mailing list
llvm...@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
clang++ -O3 -march=native -mtune=native \
-Rpass=loop-vectorize,slp-vectorize \
-Rpass-missed=loop-vectorize,slp-vectorize \
-Rpass-analysis=loop-vectorize,slp-vectorize \
-ffast-math -ffp-model=fast -ffp-exception-behavior=ignore -ffp-contract=fast \
-c -o vec.o vec.cc
But I've got no feedback.
Hi Venkataramanan,
> I am not sure if we can generate vector calls to GlibC libmvec.
Is the support for libmvec on the roadmap? If not how to add it?
> ./clang --autocomplete=-fveclib=
This is amazing, thank you! I think it should be documented in clang
auto vectorization documentation.
Thank you very much,
--
Alexandre BIQUE
On Tue, Sep 1, 2020 at 9:07 AM Venkataramanan Kumar
<venkataraman...@gmail.com> wrote:
> Hi Alexandre,
Hi Venkataramanan,
> I am not sure if we can generate vector calls to GlibC libmvec.
Is the support for libmvec on the roadmap? If not how to add it?
On Sep 1, 2020, at 10:26, Venkataramanan Kumar via llvm-dev <llvm...@lists.llvm.org> wrote:
Hi Alexandre,
On Tue, 1 Sep, 2020, 2:21 pm Alexandre Bique, <bique.a...@gmail.com> wrote:
On Tue, Sep 1, 2020 at 9:07 AM Venkataramanan Kumar
<venkataraman...@gmail.com> wrote:
> Hi Alexandre,
Hi Venkataramanan,
> I am not sure if we can generate vector calls to GlibC libmvec.
Is the support for libmvec on the roadmap? If not how to add it?
I also wanted to check with others here, if we can add support for vector routines in Glibc libmvec.
Libmvec may not be available in machines which uses lower versions of glibc. We may have to take care of generating calls only when we detect libmvec and also meet required ISA requirements.
Otherwise is it ok to generate vector calls under a flag?
Cheers,
Florian
Thank you very much! :-)
Did you make progress with libmvec?
Is there a chance that it lands into clang 11?
Regards,
Alexandre Bique
On Tue, Sep 1, 2020 at 11:50 AM Venkataramanan Kumar
<venkataraman...@gmail.com> wrote:
>
> Hi Florian,
>
> On Tue, 1 Sep, 2020, 3:11 pm Florian Hahn, <floria...@apple.com> wrote:
>>
>> Hi,
>>
>> On Sep 1, 2020, at 10:26, Venkataramanan Kumar via llvm-dev <llvm...@lists.llvm.org> wrote:
>>
>> Hi Alexandre,
>>
>> On Tue, 1 Sep, 2020, 2:21 pm Alexandre Bique, <bique.a...@gmail.com> wrote:
>>>
>>> On Tue, Sep 1, 2020 at 9:07 AM Venkataramanan Kumar
>>> <venkataraman...@gmail.com> wrote:
>>> > Hi Alexandre,
>>>
>>> Hi Venkataramanan,
>>>
>>> > I am not sure if we can generate vector calls to GlibC libmvec.
>>> Is the support for libmvec on the roadmap? If not how to add it?
>>
>>
>> I also wanted to check with others here, if we can add support for vector routines in Glibc libmvec.
>>
>> Libmvec may not be available in machines which uses lower versions of glibc. We may have to take care of generating calls only when we detect libmvec and also meet required ISA requirements.
>>
>> Otherwise is it ok to generate vector calls under a flag?
>>
>>
>> The existing vector library variants are enabled with a flag (`-vector-library`) on the LLVM side as mentioned earlier. I think it would make sense to add support for libmvec in a similar fashion. That should be relatively straight-forward.
>
>
> Oh ok, let me try and prepare a patch.
>
> Regards
> Venkat.
>
>>
>> Cheers,
>> Florian
Great, let me know if you need help or don't have time to look at it.
Regards,
Alexandre
On Tue, Sep 15, 2020 at 3:58 PM Venkataramanan Kumar
<venkataraman...@gmail.com> wrote:
> I have not yet started the patch work.
>
> Collected information about the ABI for the routines. To start with I am thinking of supporting pow/lib/exp/sin/cos/log routines till VF 8 which are non masked variants in the initial patch.
> I will try to send the patch early next week.
Great, let me know if you need help or don't have time to look at it.
Regards,
Alexandre
Hi Venkat.,
thanks for adding the support for libmvec!
Is there a pass or something similar to replace calls to
intrinsics that already operate on vector operands with calls to
libmvec?
inject-tli-mappings seems to add the attributes only to scalar
calls to intrinsics such as llvm.exp and the loop-vectorizer then
might vectorize these calls.
In my use case, I'm getting vectorized calls to LLVM intrinsics
from MLIR and would like to replace those with calls to libmvec,
but as the code is already vectorized, the LoopVectorizer does not
do these replacements.
So for example, in the LLVM IR generated from MLIR, there is a call such as:
%103 = fmul <4 x double> %102, <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>
%104 = tail call <4 x double> @llvm.exp.v4f64(<4 x double> %103)
I would like to have a pass replace the call to llvm.exp.v4f64 with a call @_ZGVdN4v_exp(<4 x double>), whose declaration is inserted by inject-tli-mappings.
Are you aware of such a pass or another way to achieve this behavior?
Thanks in advance,
Lukas
Lukas Sommer, M.Sc. TU Darmstadt Embedded Systems and Applications Group (ESA) Hochschulstr. 10, 64289 Darmstadt, Germany Phone: +49 6151 1622429 www.esa.informatik.tu-darmstadt.de
Hi Venkat.,
thanks for adding the support for libmvec!
Is there a pass or something similar to replace calls to intrinsics that already operate on vector operands with calls to libmvec?
inject-tli-mappings seems to add the attributes only to scalar calls to intrinsics such as llvm.exp and the loop-vectorizer then might vectorize these calls.
In my use case, I'm getting vectorized calls to LLVM intrinsics from MLIR and would like to replace those with calls to libmvec, but as the code is already vectorized, the LoopVectorizer does not do these replacements.
So for example, in the LLVM IR generated from MLIR, there is a call such as:
%103 = fmul <4 x double> %102, <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>
%104 = tail call <4 x double> @llvm.exp.v4f64(<4 x double> %103)
I would like to have a pass replace the call to llvm.exp.v4f64 with a call @_ZGVdN4v_exp(<4 x double>), whose declaration is inserted by inject-tli-mappings.
Are you aware of such a pass or another way to achieve this behavior?
Hi Venkat.,
thanks for your reply and sorry for the long delay.
I went ahead and implemented a pass that performs this transformation as an IR pass: https://reviews.llvm.org/D95373
It would be great if some of you could review that patch and
provide feedback.
Thanks in advance,
Best
Lukas
Lukas Sommer, M.Sc. TU Darmstadt Embedded Systems and Applications Group (ESA) Hochschulstr. 10, 64289 Darmstadt, Germany Phone: +49 6151 1622429 www.esa.informatik.tu-darmstadt.de