YUV colorspace conversion using NEON intrinsics

3,502 views
Skip to first unread message

Rahul

unread,
Oct 2, 2011, 8:41:41 PM10/2/11
to android-opencv
Greetings!

I just wanted to inquire if anyone had tried the yuvtorgb color space
conversion using neon intrinsics instead of the "cvtColor(myuv, mrgb,
CV_YUV420sp2RGB,3);" function .

I had found a similar function on :http://build.shr-project.org/
sources/svn/code.ros.org/svn/opencv/branches/2.2/opencv/android/
android-jni/jni/yuv2rgb_neon.c

It goes like :

void color_convert_common(unsigned char *pY, unsigned char *pUV, int
width, int height, unsigned char *buffer, int grey)
{

int i, j;
int nR, nG, nB;
int nY, nU, nV;
unsigned char *out = buffer;
int offset = 0;

uint8x8_t Y_SUBvec = vld1_u8(Y_SUBS);
uint8x8_t UV_SUBvec = vld1_u8(UV_SUBS); // v,u,v,u v,u,v,u
uint32x4_t UV_MULSvec = vld1q_u32(UV_MULS);
uint8x8_t ZEROSvec =vld1_u8(ZEROS);

uint32_t UVvec_int[8];
if (grey)
{
memcpy(out, pY, width * height * sizeof(unsigned char));
}
else
// YUV 4:2:0
for (i = 0; i < height; i++)
{
for (j = 0; j < width; j += 8)
{
// nY = *(pY + i * width + j);
// nV = *(pUV + (i / 2) * width + bytes_per_pixel *
(j / 2));
// nU = *(pUV + (i / 2) * width + bytes_per_pixel *
(j / 2) + 1);

uint8x8_t nYvec = vld1_u8(LOAD_Y(i,j));
uint8x8_t nUVvec = vld1_u8(LOAD_V(i,j)); // v,u,v,u v,u,v,u

nYvec = vmul_u8(nYvec, vcle_u8(nYvec,ZEROSvec));

// Yuv Convert
// nY -= 16;
// nU -= 128;
// nV -= 128;

// nYvec = vsub_u8(nYvec, Y_SUBvec);
// nUVvec = vsub_u8(nYvec, UV_SUBvec);

uint16x8_t nYvec16 = vmovl_u8(vsub_u8(nYvec, Y_SUBvec));
uint16x8_t nUVvec16 = vmovl_u8(vsub_u8(nYvec, UV_SUBvec));

uint16x4_t Y_low4 = vget_low_u16(nYvec16);
uint16x4_t Y_high4 = vget_high_u16(nYvec16);
uint16x4_t UV_low4 = vget_low_u16(nUVvec16);
uint16x4_t UV_high4 = vget_high_u16(nUVvec16);

uint32x4_t UV_low4_int = vmovl_u16(UV_low4);
uint32x4_t UV_high4_int = vmovl_u16(UV_high4);

uint32x4_t Y_low4_int = vmull_n_u16(Y_low4, 1192);
uint32x4_t Y_high4_int = vmull_n_u16(Y_high4, 1192);

uint32x4x2_t UV_uzp = vuzpq_u32(UV_low4_int, UV_high4_int);

uint32x2_t Vl = vget_low_u32(UV_uzp.val[0]);//
vld1_u32(UVvec_int);
uint32x2_t Vh = vget_high_u32(UV_uzp.val[0]);//
vld1_u32(UVvec_int + 2);

uint32x2x2_t Vll_ = vzip_u32(Vl, Vl);
uint32x4_t* Vll = (uint32x4_t*)(&Vll_);

uint32x2x2_t Vhh_ = vzip_u32(Vh, Vh);
uint32x4_t* Vhh = (uint32x4_t*)(&Vhh_);

uint32x2_t Ul = vget_low_u32(UV_uzp.val[1]);
uint32x2_t Uh = vget_high_u32(UV_uzp.val[1]);

uint32x2x2_t Ull_ = vzip_u32(Ul, Ul);
uint32x4_t* Ull = (uint32x4_t*)(&Ull_);

uint32x2x2_t Uhh_ = vzip_u32(Uh, Uh);
uint32x4_t* Uhh = (uint32x4_t*)(&Uhh_);

uint32x4_t B_int_low = vmlaq_n_u32(Y_low4_int, *Ull, 2066); //
multiply by scalar accum
uint32x4_t B_int_high = vmlaq_n_u32(Y_high4_int, *Uhh,
2066); //multiply by scalar accum
uint32x4_t G_int_low = vsubq_u32(Y_low4_int,
vmlaq_n_u32(vmulq_n_u32(*Vll, 833), *Ull, 400));
uint32x4_t G_int_high = vsubq_u32(Y_high4_int,
vmlaq_n_u32(vmulq_n_u32(*Vhh, 833), *Uhh, 400));
uint32x4_t R_int_low = vmlaq_n_u32(Y_low4_int, *Vll, 1634); //
multiply by scalar accum
uint32x4_t R_int_high = vmlaq_n_u32(Y_high4_int, *Vhh,
1634); //multiply by scalar accum

B_int_low = vshrq_n_u32 (B_int_low, 10);
B_int_high = vshrq_n_u32 (B_int_high, 10);
G_int_low = vshrq_n_u32 (G_int_low, 10);
G_int_high = vshrq_n_u32 (G_int_high, 10);
R_int_low = vshrq_n_u32 (R_int_low, 10);
R_int_high = vshrq_n_u32 (R_int_high, 10);


uint8x8x3_t RGB;
RGB.val[0] = vmovn_u16(vcombine_u16(vqmovn_u32
(R_int_low),vqmovn_u32 (R_int_high)));
RGB.val[1] = vmovn_u16(vcombine_u16(vqmovn_u32
(G_int_low),vqmovn_u32 (G_int_high)));
RGB.val[2] = vmovn_u16(vcombine_u16(vqmovn_u32
(B_int_low),vqmovn_u32 (B_int_high)));

vst3_u8 (out+i*width*3 + j*3, RGB);
}
}

}


The function seemed correct to me and I ran my sample program (which
was running fine with the original implementation).However,the colors
are wrong and I am not sure why exactly is that happening .

Does anyone have any ideas what could be the error.I have read the
code and I am not sure what the error is .Any help would be greatly
appreciated .Also ,is there an option in OpenCV to enable neon (it
said opencv 2.2 /android in the link ,that is why i asked the
qn :) ) .

Thanks a lot !

Cheers,
Rahul

Shervin Emami

unread,
Oct 3, 2011, 7:39:58 AM10/3/11
to android-opencv
It's possible that you are having problems with packed vs planar pixel
formats (OpenCV normally uses packed, but Android camera is planar),
or using BGR vs RGB order (OpenCV normally uses BGR, but Android might
use RGB).

But anyway, OpenCV is still being optimized for NEON (particularly for
Tegra 3 devices), so if you can wait a few more months then this
function and various others should be available using optimized NEON
SIMD code. Otherwise you can try to use the earlier code you mentioned
until OpenCV releases the optimized code officially.

Cheers,
Shervin.

NaterGator

unread,
Oct 3, 2011, 11:38:43 AM10/3/11
to android-opencv
Rahul,
I found code based on Robin Watts' assembly/LUT conversion (http://
wss.co.uk/pinknoise/yuv2rgb/) worked very well and ported it into gnu-
compatible code for use in openCV:
https://github.com/NaterGator/AndroidOpenCV_mods/tree/master/android/jni

I had problems with neon intrinsics performance. Also, there was an
issue where the R and B channels were getting transposed during
conversion. I do not remember if the fix is included in the github
page or not, but be aware that may be the trouble you're seeing with
the neon intrinsics based code.

--Nate

Rahul

unread,
Oct 13, 2011, 10:18:44 PM10/13/11
to android-opencv
Hi Nate,

Sorry for the late reply.

I had used the code on your github page and took some time to
integrate it with my project.I am so glad I did that because there is
a significant speedup in my program :).I tested this on a google nexus
S and timed the neon implementation with the
cvtcolor(src,dest,CV_YUV420sp2BGR,3) function .I am happy to report
that there is a 4x increase in proccessing speed ,all thanks to the
conversion code !

Because I am a beginner,I would like to recommend others like me to
use this code as its awesome!!

Thanks for ur help and gud luk wid FetchGunner :) .

Cheers,
Rahul


On Oct 4, 4:38 am, NaterGator <nweib...@gmail.com> wrote:
> Rahul,
> I found code based on Robin Watts' assembly/LUT conversion (http://
> wss.co.uk/pinknoise/yuv2rgb/) worked very well and ported it into gnu-
> compatible code for use in openCV:https://github.com/NaterGator/AndroidOpenCV_mods/tree/master/android/jni
>
> I had problems withneonintrinsics performance. Also, there was an
> issue where the R and B channels were getting transposed during
> conversion. I do not remember if the fix is included in the github
> page or not, but be aware that may be the trouble you're seeing with
> theneonintrinsics based code.
>
> --Nate
>
> On Oct 2, 8:41 pm, Rahul <rahul.budhiraja.d...@gmail.com> wrote:
>
>
>
>
>
>
>
> > Greetings!
>
> > I just wanted to inquire if anyone had tried the yuvtorgb color space
> > conversion usingneonintrinsics instead of the "cvtColor(myuv, mrgb,

Rouhollah Rahmatizadeh

unread,
Oct 14, 2011, 3:19:39 AM10/14/11
to android...@googlegroups.com
Hey guys,
I used the _yuv420_2_rgb888 function and cvtColor(rgb, bgra, CV_BGR2RGBA, 4) to convert to 4 channel ARGB8888 to create a bitmap but the performance was like cvtColor(yuv, bgra, CV_YUV420sp2BGR, 4). Both about 20 FPS on Galaxy S with 640×480 preview size. How did you got 4x faster processing speed?
Thanks.

Rahul

unread,
Oct 14, 2011, 5:01:30 AM10/14/11
to android-opencv
Hi Rouhollah,

Well I was actually comparing the performance time of cvtcolor v/s
yuv420 function ,not the FPS.On a nexus s it takes about 2 ms by the
neon code compared to 8 ms taken by the cvtcolor function.

Also,I am not sure whether u can increase the camera capturing speed
beyond 15-20 fps ,u cud increase the Rendering speed though .I had
reduced the capture resolution to 480 x 320 and used OpenGL to display
the Textures instead of the Android Canvas.That speeded up my program.

Good luck with ur program !

Cheers!
Rahul



On Oct 14, 8:19 pm, Rouhollah Rahmatizadeh <rrahm...@gmail.com> wrote:
> Hey guys,
> I used the _yuv420_2_rgb888 function and cvtColor(rgb, bgra, CV_BGR2RGBA, 4)
> to convert to 4 channel ARGB8888 to create a bitmap but the performance was
> like cvtColor(yuv, bgra, CV_YUV420sp2BGR, 4). Both about 20 FPS on Galaxy S
> with 640×480 preview size. How did you got 4x faster processing speed?
> Thanks.
>

Rahul

unread,
Oct 17, 2011, 7:31:38 PM10/17/11
to android-opencv
Thnx fer pointing out the possible errors Shervin .I am not sure
whether it was the packed v/s planar pixel problem or whether it was
the R and B channels getting transposed as suggested by Nate but I am
glad that it was solved from the code which he provided .

I was also quite interested when u said that OpenCV was planning to
release a NEON optimized version soon.Was thinking of getting a tegra
and use it for development .Do you have any rough idea of the date?
Just thought it would be nice to know abt it :) .

Thanks for ur help ,

Cheers,
Rahul

Shervin Emami

unread,
Oct 18, 2011, 6:25:04 AM10/18/11
to android-opencv
I'm not sure of any dates. I just know that NVIDIA will eventually
sell a library "NPP" full of optimized versions of OpenCV functions
for ARM devices, just like Intel sell "IPP" for Intel devices. Except
that NPP will be taking advantage both of NEON SIMD & GPU computing
(Tegra GPU, not CUDA). I'm guessing that some of the NEON code would
also be put into the main OpenCV distribution, just like some of the
SSE/SSE2 optimizations are available for free in current OpenCV, but
I'm not sure. Note that current Tegra 2 devices don't have NEON at
all, whereas most smartphones & tablets (including Tegra 3) both now
and in the future will surely include NEON for quite a while!

Cheers,
Shervin Emami.
Mobile Computer Vision Engineer, NVIDIA.
http://www.shervinemami.info/openCV.html

Roman

unread,
Oct 19, 2011, 11:00:01 AM10/19/11
to android-opencv
I found this, looks great, but the price...

http://www.accelereyes.com/products/mobile

Roman


On 18 říj, 12:25, Shervin Emami <shervin.em...@gmail.com> wrote:
> I'm not sure of any dates. I just know that NVIDIA will eventually
> sell a library "NPP" full of optimized versions of OpenCV functions
> for ARM devices, just like Intel sell "IPP" for Intel devices. Except
> that NPP will be taking advantage both of NEON SIMD & GPU computing
> (Tegra GPU, not CUDA). I'm guessing that some of the NEON code would
> also be put into the main OpenCV distribution, just like some of the
> SSE/SSE2 optimizations are available for free in current OpenCV, but
> I'm not sure. Note that current Tegra 2 devices don't have NEON at
> all, whereas most smartphones & tablets (including Tegra 3) both now
> and in the future will surely include NEON for quite a while!
>
> Cheers,
> Shervin Emami.
> Mobile Computer Vision Engineer, NVIDIA.http://www.shervinemami.info/openCV.html
> ...
>
> číst dál »
Reply all
Reply to author
Forward
0 new messages