StackOverflow2013

Note that there are some explanatory texts on larger screens.

plurals

PO
text
Body
copied!<p>Even though this question was asked 2 years ago, i think some working code would help here. In terms of the initial concerns about bad performance when directly accessing 8-bit values, it's better to perform 32-bit direct access when possible.</p> <p>Some time ago I've developed and used the following OpenCL kernel to convert ARGB (typical windows bitmap pixel layout) to the y-plane (full sized), u/v-half-plane (quarter sized) memory layout as input for libx264 encoding.</p> <pre class="lang-cpp prettyprint-override"><code>__kernel void ARGB2YUV ( __global unsigned int * sourceImage, __global unsigned int * destImage, unsigned int srcHeight, unsigned int srcWidth, unsigned int yuvStride // must be srcWidth/4 since we pack 4 pixels into 1 Y-unit (with 4 y-pixels) ) { int i,j; unsigned int RGBs [ 4 ]; unsigned int posSrc, RGB, Value4 = 0, Value, yuvStrideHalf, srcHeightHalf, yPlaneOffset, posOffset; unsigned char red, green, blue; unsigned int posX = get_global_id(0); unsigned int posY = get_global_id(1); if ( posX < yuvStride ) { // Y plane - pack 4 y's within each work item if ( posY >= srcHeight ) return; posSrc = (posY * srcWidth) + (posX * 4); RGBs [ 0 ] = sourceImage [ posSrc ]; RGBs [ 1 ] = sourceImage [ posSrc + 1 ]; RGBs [ 2 ] = sourceImage [ posSrc + 2 ]; RGBs [ 3 ] = sourceImage [ posSrc + 3 ]; for ( i=0; i<4; i++ ) { RGB = RGBs [ i ]; blue = RGB & 0xff; green = (RGB >> 8) & 0xff; red = (RGB >> 16) & 0xff; Value = ( ( 66 * red + 129 * green + 25 * blue ) >> 8 ) + 16; Value4 |= (Value << (i * 8)); } destImage [ (posY * yuvStride) + posX ] = Value4; return; } posX -= yuvStride; yuvStrideHalf = yuvStride >> 1; // U plane - pack 4 u's within each work item if ( posX >= yuvStrideHalf ) return; srcHeightHalf = srcHeight >> 1; if ( posY < srcHeightHalf ) { posSrc = ((posY * 2) * srcWidth) + (posX * 8); RGBs [ 0 ] = sourceImage [ posSrc ]; RGBs [ 1 ] = sourceImage [ posSrc + 2 ]; RGBs [ 2 ] = sourceImage [ posSrc + 4 ]; RGBs [ 3 ] = sourceImage [ posSrc + 6 ]; for ( i=0; i<4; i++ ) { RGB = RGBs [ i ]; blue = RGB & 0xff; green = (RGB >> 8) & 0xff; red = (RGB >> 16) & 0xff; Value = ( ( -38 * red + -74 * green + 112 * blue ) >> 8 ) + 128; Value4 |= (Value << (i * 8)); } yPlaneOffset = yuvStride * srcHeight; posOffset = (posY * yuvStrideHalf) + posX; destImage [ yPlaneOffset + posOffset ] = Value4; return; } posY -= srcHeightHalf; if ( posY >= srcHeightHalf ) return; // V plane - pack 4 v's within each work item posSrc = ((posY * 2) * srcWidth) + (posX * 8); RGBs [ 0 ] = sourceImage [ posSrc ]; RGBs [ 1 ] = sourceImage [ posSrc + 2 ]; RGBs [ 2 ] = sourceImage [ posSrc + 4 ]; RGBs [ 3 ] = sourceImage [ posSrc + 6 ]; for ( i=0; i<4; i++ ) { RGB = RGBs [ i ]; blue = RGB & 0xff; green = (RGB >> 8) & 0xff; red = (RGB >> 16) & 0xff; Value = ( ( 112 * red + -94 * green + -18 * blue ) >> 8 ) + 128; Value4 |= (Value << (i * 8)); } yPlaneOffset = yuvStride * srcHeight; posOffset = (posY * yuvStrideHalf) + posX; destImage [ yPlaneOffset + (yPlaneOffset >> 2) + posOffset ] = Value4; return; } </code></pre> <p>This code performs only global 32-bit memory access while 8-bit processing happens within each work item.</p> <p>Oh.. and the proper code to invoke the kernel</p> <pre class="lang-c prettyprint-override"><code>unsigned int width = 1024; unsigned int height = 768; unsigned int frameSize = width * height; const unsigned int argbSize = frameSize * 4; // ARGB pixels const unsigned int yuvSize = frameSize + (frameSize >> 1); // Y,U,V planes const unsigned int yuvStride = width >> 2; // since we pack 4 RGBs into "one" YYYY // Allocates ARGB buffer ocl_rgb_buffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, argbSize, 0, &error ); // ... error handling ... ocl_yuv_buffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, yuvSize, 0, &error ); // ... error handling ... error = clSetKernelArg ( kernel, 0, sizeof(cl_mem), &ocl_rgb_buffer ); error |= clSetKernelArg ( kernel, 1, sizeof(cl_mem), &ocl_yuv_buffer ); error |= clSetKernelArg ( kernel, 2, sizeof(unsigned int), &height); error |= clSetKernelArg ( kernel, 3, sizeof(unsigned int), &width); error |= clSetKernelArg ( kernel, 4, sizeof(unsigned int), &yuvStride); // ... error handling ... const size_t local_ws[] = { 16, 16 }; const size_t global_ws[] = { yuvStride + (yuvStride >> 1), height }; error = clEnqueueNDRangeKernel ( queue, kernel, 2, NULL, global_ws, local_ws, 0, NULL, NULL ); // ... error handling ... </code></pre> <p>Note: have a look at the work item calculations. Some additional code needs to be added (e.g. using mod so as to add sufficient spare items) to make sure that work item sizes fit to local work sizes.</p>

Querying!

Guidance

An individual column

Larger individual text columns get their own page to allow for proper reading.

SQuiL has stopped working due to an internal error.

If you are curious you may find further information in the browser console, which is accessible through the devtools (F12).

Reload