summaryrefslogtreecommitdiffstats
path: root/libhb/oclscale.c
diff options
context:
space:
mode:
Diffstat (limited to 'libhb/oclscale.c')
-rw-r--r--libhb/oclscale.c271
1 files changed, 271 insertions, 0 deletions
diff --git a/libhb/oclscale.c b/libhb/oclscale.c
new file mode 100644
index 000000000..904183340
--- /dev/null
+++ b/libhb/oclscale.c
@@ -0,0 +1,271 @@
+/* oclscale.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+ */
+
+#ifdef USE_OPENCL
+
+#include <math.h>
+#include "common.h"
+#include "openclwrapper.h"
+#define FILTER_LEN 4
+
+#define _A -0.5f
+
+cl_float cubic(cl_float x)
+{
+ if (x < 0)
+ x = -x;
+
+ if (x < 1)
+ return (_A + 2.0f) * (x * x * x) - (_A + 3.0f) * (x * x) + 0 + 1;
+ else if (x < 2)
+ return (_A) * (x * x * x) - (5.0f * _A) * (x * x) + (8.0f * _A) * x - (4.0f * _A);
+ else
+ return 0;
+}
+
+
+cl_float *hb_bicubic_weights(cl_float scale, int length)
+{
+ cl_float *weights = (cl_float*) malloc(length * sizeof(cl_float) * 4);
+
+ int i; // C rocks
+ cl_float *out = weights;
+ for (i = 0; i < length; ++i)
+ {
+ cl_float x = i / scale;
+ cl_float dx = x - (int)x;
+ *out++ = cubic(-dx - 1.0f);
+ *out++ = cubic(-dx);
+ *out++ = cubic(-dx + 1.0f);
+ *out++ = cubic(-dx + 2.0f);
+ }
+ return weights;
+}
+
+int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv);
+
+/**
+* executive scale using opencl
+* get filter args
+* create output buffer
+* create horizontal filter buffer
+* create vertical filter buffer
+* create kernels
+*/
+int hb_ocl_scale_func( void **data, KernelEnv *kenv )
+{
+ cl_int status;
+
+ cl_mem in_buf = data[0];
+ cl_mem out_buf = data[1];
+ int crop_top = data[2];
+ int crop_bottom = data[3];
+ int crop_left = data[4];
+ int crop_right = data[5];
+ int in_frame_w = (int)data[6];
+ int in_frame_h = (int)data[7];
+ int out_frame_w = (int)data[8];
+ int out_frame_h = (int)data[9];
+ hb_oclscale_t *os = data[10];
+ hb_buffer_t *in = data[11];
+ hb_buffer_t *out = data[12];
+
+ if (os->initialized == 0)
+ {
+ hb_log( "Scaling With OpenCL" );
+ if (kenv->isAMD != 0)
+ hb_log( "Using Zero Copy");
+ // create the block kernel
+ cl_int status;
+ os->m_kernel = clCreateKernel( kenv->program, "frame_scale", &status );
+
+ os->initialized = 1;
+ }
+
+ {
+ // Use the new kernel
+ cl_event events[5];
+ int eventCount = 0;
+
+ if (kenv->isAMD == 0) {
+ status = clEnqueueUnmapMemObject(kenv->command_queue, in->cl.buffer, in->data, 0, NULL, &events[eventCount++]);
+ status = clEnqueueUnmapMemObject(kenv->command_queue, out->cl.buffer, out->data, 0, NULL, &events[eventCount++]);
+ }
+
+ cl_int srcPlaneOffset0 = in->plane[0].data - in->data;
+ cl_int srcPlaneOffset1 = in->plane[1].data - in->data;
+ cl_int srcPlaneOffset2 = in->plane[2].data - in->data;
+ cl_int srcRowWords0 = in->plane[0].stride;
+ cl_int srcRowWords1 = in->plane[1].stride;
+ cl_int srcRowWords2 = in->plane[2].stride;
+ cl_int dstPlaneOffset0 = out->plane[0].data - out->data;
+ cl_int dstPlaneOffset1 = out->plane[1].data - out->data;
+ cl_int dstPlaneOffset2 = out->plane[2].data - out->data;
+ cl_int dstRowWords0 = out->plane[0].stride;
+ cl_int dstRowWords1 = out->plane[1].stride;
+ cl_int dstRowWords2 = out->plane[2].stride;
+
+ if (crop_top != 0 || crop_bottom != 0 || crop_left != 0 || crop_right != 0) {
+ srcPlaneOffset0 += crop_left + crop_top * srcRowWords0;
+ srcPlaneOffset1 += crop_left / 2 + (crop_top / 2) * srcRowWords1;
+ srcPlaneOffset2 += crop_left / 2 + (crop_top / 2) * srcRowWords2;
+ in_frame_w = in_frame_w - crop_right - crop_left;
+ in_frame_h = in_frame_h - crop_bottom - crop_top;
+ }
+
+ cl_float xscale = (out_frame_w * 1.0f) / in_frame_w;
+ cl_float yscale = (out_frame_h * 1.0f) / in_frame_h;
+ setupScaleWeights(xscale, yscale, out_frame_w, out_frame_h, os, kenv);
+
+ OCLCHECK( clSetKernelArg, os->m_kernel, 0, sizeof(cl_mem), &out_buf );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 1, sizeof(cl_mem), &in_buf );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 2, sizeof(cl_float), &xscale );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 3, sizeof(cl_float), &yscale );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 4, sizeof(cl_int), &srcPlaneOffset0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 5, sizeof(cl_int), &srcPlaneOffset1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 6, sizeof(cl_int), &srcPlaneOffset2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 7, sizeof(cl_int), &dstPlaneOffset0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 8, sizeof(cl_int), &dstPlaneOffset1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 9, sizeof(cl_int), &dstPlaneOffset2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 10, sizeof(cl_int), &srcRowWords0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 11, sizeof(cl_int), &srcRowWords1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 12, sizeof(cl_int), &srcRowWords2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 13, sizeof(cl_int), &dstRowWords0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 14, sizeof(cl_int), &dstRowWords1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 15, sizeof(cl_int), &dstRowWords2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 16, sizeof(int), &in_frame_w ); // FIXME: type mismatch
+ OCLCHECK( clSetKernelArg, os->m_kernel, 17, sizeof(int), &in_frame_h ); //
+ OCLCHECK( clSetKernelArg, os->m_kernel, 18, sizeof(int), &out_frame_w ); //
+ OCLCHECK( clSetKernelArg, os->m_kernel, 19, sizeof(int), &out_frame_h ); //
+ OCLCHECK( clSetKernelArg, os->m_kernel, 20, sizeof(cl_mem), &os->bicubic_x_weights );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 21, sizeof(cl_mem), &os->bicubic_y_weights );
+
+ size_t workOffset[] = { 0, 0, 0 };
+ size_t globalWorkSize[] = { 1, 1, 1 };
+ size_t localWorkSize[] = { 1, 1, 1 };
+
+ int xgroups = (out_frame_w + 63) / 64;
+ int ygroups = (out_frame_h + 15) / 16;
+
+ localWorkSize[0] = 64;
+ localWorkSize[1] = 1;
+ localWorkSize[2] = 1;
+ globalWorkSize[0] = xgroups * 64;
+ globalWorkSize[1] = ygroups;
+ globalWorkSize[2] = 3;
+
+ OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, os->m_kernel, 3, workOffset, globalWorkSize, localWorkSize, eventCount, (eventCount == 0) ? NULL : &events[0], &events[eventCount] );
+ ++eventCount;
+
+ if (kenv->isAMD == 0) {
+ in->data = clEnqueueMapBuffer(kenv->command_queue, in->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, in->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount], &status);
+ out->data = clEnqueueMapBuffer(kenv->command_queue, out->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, out->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount + 1], &status);
+ eventCount += 2;
+ }
+
+ clFlush(kenv->command_queue);
+ clWaitForEvents(eventCount, &events[0]);
+ int i;
+ for (i = 0; i < eventCount; ++i)
+ clReleaseEvent(events[i]);
+ }
+
+ return 1;
+}
+
+int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) {
+ cl_int status;
+ if (os->xscale != xscale || os->width < width) {
+ cl_float *xweights = hb_bicubic_weights(xscale, width);
+ CL_FREE(os->bicubic_x_weights);
+ CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4);
+ OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL );
+ os->width = width;
+ os->xscale = xscale;
+ free(xweights);
+ }
+
+ if ((os->yscale != yscale) || (os->height < height)) {
+ cl_float *yweights = hb_bicubic_weights(yscale, height);
+ CL_FREE(os->bicubic_y_weights);
+ CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4);
+ OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL );
+ os->height = height;
+ os->yscale = yscale;
+ free(yweights);
+ }
+ return 0;
+}
+
+
+/**
+* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm
+* parameter:
+* inputFrameBuffer: the source video frame opencl buffer
+* outputdata: the destination video frame buffer
+* inputWidth: the width of the source video frame
+* inputHeight: the height of the source video frame
+* outputWidth: the width of destination video frame
+* outputHeight: the height of destination video frame
+*/
+
+
+static int s_scale_init_flag = 0;
+
+int do_scale_init()
+{
+ if ( s_scale_init_flag==0 )
+ {
+ int st = hb_register_kernel_wrapper( "frame_scale", hb_ocl_scale_func );
+ if( !st )
+ {
+ hb_log( "register kernel[%s] failed", "frame_scale" );
+ return 0;
+ }
+ s_scale_init_flag++;
+ }
+ return 1;
+}
+
+
+int hb_ocl_scale(hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os)
+{
+ void *data[13];
+
+ if (do_scale_init() == 0)
+ return 0;
+
+ data[0] = in->cl.buffer;
+ data[1] = out->cl.buffer;
+ data[2] = (void*)(crop[0]);
+ data[3] = (void*)(crop[1]);
+ data[4] = (void*)(crop[2]);
+ data[5] = (void*)(crop[3]);
+ data[6] = (void*)(in->f.width);
+ data[7] = (void*)(in->f.height);
+ data[8] = (void*)(out->f.width);
+ data[9] = (void*)(out->f.height);
+ data[10] = os;
+ data[11] = in;
+ data[12] = out;
+
+ if( !hb_run_kernel( "frame_scale", data ) )
+ hb_log( "run kernel[%s] failed", "frame_scale" );
+ return 0;
+}
+
+
+
+
+
+#endif