diff options
author | sr55 <[email protected]> | 2013-09-21 20:16:51 +0000 |
---|---|---|
committer | sr55 <[email protected]> | 2013-09-21 20:16:51 +0000 |
commit | f69b7f1dfc98c90d454078f1a3aabef3bae36fd2 (patch) | |
tree | 4792942c10f6b1c4418228bcc53b648b773d5098 /libhb/oclscale.c | |
parent | 8bccfabca28d059978f1eb8e516592f4e2f06c1a (diff) |
Merging-in the OpenCL Scaling code from the OpenCL branch to trunk.
Patch originally by the Multicoreware Inc team, followed by improvements and fixes by Micheal Wootton from AMD Inc,
OpenCL:
This patch implements Bicubic Scaling in OpenCL.
Note that HandBrake currently uses Lanczos so the performance difference appears to be much more significant. We may offer an option of BiCubic in software later.
Bicubic scaling may appear a bit sharper than the equivalent Lanczos encode and may increase file size a bit. Quality may be better or worse depending on the scaling and content and personal preference towards sharpness.
When comparing performance with a custom HandBrake build that runs Software Bicubic to OpenCL Bicubic, performance increase is about 5~7% on average on a modern GPU.
Hardware Decode via DXVA:
We also have optional DXVA decoding which may come in useful for slower/lower end systems that have a capable GPU.
This is only available on input sources that use the libav decode path.
Most GPU hardware for decoding is designed for playback, so if you are running on a high end CPU, it will bottleneck the encode process.
Requires OpenCL 1.1 or later supporting GPU.
Front end changes and testing framework are not included in this patch. This will be resolved later.
Patch will be revised further before the UI is implemented.
git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@5792 b64f7644-9d1e-0410-96f1-a4d463321fa5
Diffstat (limited to 'libhb/oclscale.c')
-rw-r--r-- | libhb/oclscale.c | 271 |
1 files changed, 271 insertions, 0 deletions
diff --git a/libhb/oclscale.c b/libhb/oclscale.c new file mode 100644 index 000000000..904183340 --- /dev/null +++ b/libhb/oclscale.c @@ -0,0 +1,271 @@ +/* oclscale.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+ */
+
+#ifdef USE_OPENCL
+
+#include <math.h> +#include "common.h" +#include "openclwrapper.h" +#define FILTER_LEN 4 + +#define _A -0.5f + +cl_float cubic(cl_float x) +{ + if (x < 0) + x = -x; + + if (x < 1) + return (_A + 2.0f) * (x * x * x) - (_A + 3.0f) * (x * x) + 0 + 1; + else if (x < 2) + return (_A) * (x * x * x) - (5.0f * _A) * (x * x) + (8.0f * _A) * x - (4.0f * _A); + else + return 0; +} + + +cl_float *hb_bicubic_weights(cl_float scale, int length) +{ + cl_float *weights = (cl_float*) malloc(length * sizeof(cl_float) * 4); + + int i; // C rocks + cl_float *out = weights; + for (i = 0; i < length; ++i) + { + cl_float x = i / scale; + cl_float dx = x - (int)x; + *out++ = cubic(-dx - 1.0f); + *out++ = cubic(-dx); + *out++ = cubic(-dx + 1.0f); + *out++ = cubic(-dx + 2.0f); + } + return weights; +} + +int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv); + +/** +* executive scale using opencl +* get filter args +* create output buffer +* create horizontal filter buffer +* create vertical filter buffer +* create kernels +*/ +int hb_ocl_scale_func( void **data, KernelEnv *kenv ) +{ + cl_int status; + + cl_mem in_buf = data[0]; + cl_mem out_buf = data[1]; + int crop_top = data[2]; + int crop_bottom = data[3]; + int crop_left = data[4]; + int crop_right = data[5]; + int in_frame_w = (int)data[6]; + int in_frame_h = (int)data[7]; + int out_frame_w = (int)data[8]; + int out_frame_h = (int)data[9]; + hb_oclscale_t *os = data[10]; + hb_buffer_t *in = data[11]; + hb_buffer_t *out = data[12]; + + if (os->initialized == 0) + { + hb_log( "Scaling With OpenCL" ); + if (kenv->isAMD != 0) + hb_log( "Using Zero Copy"); + // create the block kernel + cl_int status; + os->m_kernel = clCreateKernel( kenv->program, "frame_scale", &status ); + + os->initialized = 1; + } + + { + // Use the new kernel + cl_event events[5]; + int eventCount = 0; + + if (kenv->isAMD == 0) { + status = clEnqueueUnmapMemObject(kenv->command_queue, in->cl.buffer, in->data, 0, NULL, &events[eventCount++]); + status = clEnqueueUnmapMemObject(kenv->command_queue, out->cl.buffer, out->data, 0, NULL, &events[eventCount++]); + } + + cl_int srcPlaneOffset0 = in->plane[0].data - in->data; + cl_int srcPlaneOffset1 = in->plane[1].data - in->data; + cl_int srcPlaneOffset2 = in->plane[2].data - in->data; + cl_int srcRowWords0 = in->plane[0].stride; + cl_int srcRowWords1 = in->plane[1].stride; + cl_int srcRowWords2 = in->plane[2].stride; + cl_int dstPlaneOffset0 = out->plane[0].data - out->data; + cl_int dstPlaneOffset1 = out->plane[1].data - out->data; + cl_int dstPlaneOffset2 = out->plane[2].data - out->data; + cl_int dstRowWords0 = out->plane[0].stride; + cl_int dstRowWords1 = out->plane[1].stride; + cl_int dstRowWords2 = out->plane[2].stride; + + if (crop_top != 0 || crop_bottom != 0 || crop_left != 0 || crop_right != 0) { + srcPlaneOffset0 += crop_left + crop_top * srcRowWords0; + srcPlaneOffset1 += crop_left / 2 + (crop_top / 2) * srcRowWords1; + srcPlaneOffset2 += crop_left / 2 + (crop_top / 2) * srcRowWords2; + in_frame_w = in_frame_w - crop_right - crop_left; + in_frame_h = in_frame_h - crop_bottom - crop_top; + } + + cl_float xscale = (out_frame_w * 1.0f) / in_frame_w; + cl_float yscale = (out_frame_h * 1.0f) / in_frame_h; + setupScaleWeights(xscale, yscale, out_frame_w, out_frame_h, os, kenv); + + OCLCHECK( clSetKernelArg, os->m_kernel, 0, sizeof(cl_mem), &out_buf ); + OCLCHECK( clSetKernelArg, os->m_kernel, 1, sizeof(cl_mem), &in_buf ); + OCLCHECK( clSetKernelArg, os->m_kernel, 2, sizeof(cl_float), &xscale ); + OCLCHECK( clSetKernelArg, os->m_kernel, 3, sizeof(cl_float), &yscale ); + OCLCHECK( clSetKernelArg, os->m_kernel, 4, sizeof(cl_int), &srcPlaneOffset0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 5, sizeof(cl_int), &srcPlaneOffset1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 6, sizeof(cl_int), &srcPlaneOffset2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 7, sizeof(cl_int), &dstPlaneOffset0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 8, sizeof(cl_int), &dstPlaneOffset1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 9, sizeof(cl_int), &dstPlaneOffset2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 10, sizeof(cl_int), &srcRowWords0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 11, sizeof(cl_int), &srcRowWords1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 12, sizeof(cl_int), &srcRowWords2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 13, sizeof(cl_int), &dstRowWords0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 14, sizeof(cl_int), &dstRowWords1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 15, sizeof(cl_int), &dstRowWords2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 16, sizeof(int), &in_frame_w ); // FIXME: type mismatch + OCLCHECK( clSetKernelArg, os->m_kernel, 17, sizeof(int), &in_frame_h ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 18, sizeof(int), &out_frame_w ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 19, sizeof(int), &out_frame_h ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 20, sizeof(cl_mem), &os->bicubic_x_weights ); + OCLCHECK( clSetKernelArg, os->m_kernel, 21, sizeof(cl_mem), &os->bicubic_y_weights ); + + size_t workOffset[] = { 0, 0, 0 }; + size_t globalWorkSize[] = { 1, 1, 1 }; + size_t localWorkSize[] = { 1, 1, 1 }; + + int xgroups = (out_frame_w + 63) / 64; + int ygroups = (out_frame_h + 15) / 16; + + localWorkSize[0] = 64; + localWorkSize[1] = 1; + localWorkSize[2] = 1; + globalWorkSize[0] = xgroups * 64; + globalWorkSize[1] = ygroups; + globalWorkSize[2] = 3; + + OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, os->m_kernel, 3, workOffset, globalWorkSize, localWorkSize, eventCount, (eventCount == 0) ? NULL : &events[0], &events[eventCount] ); + ++eventCount; + + if (kenv->isAMD == 0) { + in->data = clEnqueueMapBuffer(kenv->command_queue, in->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, in->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount], &status); + out->data = clEnqueueMapBuffer(kenv->command_queue, out->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, out->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount + 1], &status); + eventCount += 2; + } + + clFlush(kenv->command_queue); + clWaitForEvents(eventCount, &events[0]); + int i; + for (i = 0; i < eventCount; ++i) + clReleaseEvent(events[i]); + } + + return 1; +} + +int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) { + cl_int status; + if (os->xscale != xscale || os->width < width) { + cl_float *xweights = hb_bicubic_weights(xscale, width); + CL_FREE(os->bicubic_x_weights); + CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4); + OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL ); + os->width = width; + os->xscale = xscale; + free(xweights); + } + + if ((os->yscale != yscale) || (os->height < height)) { + cl_float *yweights = hb_bicubic_weights(yscale, height); + CL_FREE(os->bicubic_y_weights); + CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4); + OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL ); + os->height = height; + os->yscale = yscale; + free(yweights); + } + return 0; +} + + +/** +* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm +* parameter: +* inputFrameBuffer: the source video frame opencl buffer
+* outputdata: the destination video frame buffer
+* inputWidth: the width of the source video frame
+* inputHeight: the height of the source video frame
+* outputWidth: the width of destination video frame +* outputHeight: the height of destination video frame +*/ + + +static int s_scale_init_flag = 0; + +int do_scale_init() +{ + if ( s_scale_init_flag==0 ) + { + int st = hb_register_kernel_wrapper( "frame_scale", hb_ocl_scale_func ); + if( !st ) + { + hb_log( "register kernel[%s] failed", "frame_scale" ); + return 0; + } + s_scale_init_flag++; + } + return 1; +} + + +int hb_ocl_scale(hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os) +{ + void *data[13]; + + if (do_scale_init() == 0) + return 0; + + data[0] = in->cl.buffer; + data[1] = out->cl.buffer; + data[2] = (void*)(crop[0]); + data[3] = (void*)(crop[1]); + data[4] = (void*)(crop[2]); + data[5] = (void*)(crop[3]); + data[6] = (void*)(in->f.width); + data[7] = (void*)(in->f.height); + data[8] = (void*)(out->f.width); + data[9] = (void*)(out->f.height); + data[10] = os; + data[11] = in; + data[12] = out; + + if( !hb_run_kernel( "frame_scale", data ) ) + hb_log( "run kernel[%s] failed", "frame_scale" ); + return 0; +} + + + + + +#endif |