/* oclnv12toyuv.c

   Copyright (c) 2003-2012 HandBrake Team
   This file is part of the HandBrake source code
   Homepage: <http://handbrake.fr/>.
   It may be used under the terms of the GNU General Public License v2.
   For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html

   Authors: Peng Gao
            Li Cao
 */

#ifdef USE_OPENCL
#ifdef USE_HWD

#include "vadxva2.h"
#include "oclnv12toyuv.h"

/**
 * Creates the OpenCL buffers.
 * w is the input frame width, h is the input frame height.
 */
static int hb_nv12toyuv_create_cl_buf( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 );

/**
 * Creates the OpenCL kernel. The kernel name is nv12toyuv.
 */
static int hb_nv12toyuv_create_cl_kernel( KernelEnv *kenv, hb_va_dxva2_t *dxva2 );

/**
 * Sets the OpenCL kernel arguments: input data, output data,
 * input width, input height.
 */
static int hb_nv12toyuv_setkernelarg( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 );

/**
 * Initializes the nv12 to yuv kernel.
 */
static int hb_init_nv12toyuv_ocl( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 );

/**
 * Runs the nv12 to yuv kernel.
 */
static int hb_nv12toyuv( void **userdata, KernelEnv *kenv );

/**
 * Registers the nv12 to yuv kernel.
 */
static int hb_nv12toyuv_reg_kernel( void );

/**
 * Creates the two device buffers used by the conversion and stores their
 * handles in dxva2->cl_mem_nv12 (read-only, kernel input) and
 * dxva2->cl_mem_yuv (read-write, kernel output).
 *
 * Both buffers are w * h * 3 / 2 bytes and are requested with
 * CL_MEM_ALLOC_HOST_PTR.
 *
 * @param kenv   kernel environment; not referenced directly here, presumably
 *               used by the CREATEBUF macro - confirm against its definition
 * @param w      input frame width
 * @param h      input frame height
 * @param dxva2  decoder state that receives the buffer handles
 * @return 0 once both CREATEBUF invocations have completed.
 *         NOTE(review): CREATEBUF presumably returns early with an error
 *         value when clCreateBuffer fails - confirm against the macro.
 */
static int hb_nv12toyuv_create_cl_buf( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 )
{
    /* Not read in this function; kept (with this exact name) because the
     * CREATEBUF macro presumably assigns to it. */
    cl_int status = CL_SUCCESS;
    const int frame_bytes = w * h * 3 / 2;

    /* Input first, then output - the order is kept so a failure in the
     * macro leaves the same state behind as before. */
    CREATEBUF( dxva2->cl_mem_nv12, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, frame_bytes );
    CREATEBUF( dxva2->cl_mem_yuv, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, frame_bytes );

    return 0;
}

/**
 * It creates an OpenCL kernel.
kernel name is nv12toyuv */ static int hb_nv12toyuv_create_cl_kernel( KernelEnv *kenv, hb_va_dxva2_t *dxva2 ) { int ret; dxva2->nv12toyuv = clCreateKernel( kenv->program, "nv12toyuv", &ret ); return ret; } /** * It set opencl arg, input data,output data, input width, output height */ static int hb_nv12toyuv_setkernelarg( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ) { int arg = 0, status; kenv->kernel = dxva2->nv12toyuv; OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(cl_mem), &dxva2->cl_mem_nv12 ); OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(cl_mem), &dxva2->cl_mem_yuv ); OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(int), &w ); OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(int), &h ); return 0; } /** * It initialize nv12 to yuv kernel. */ static int hb_init_nv12toyuv_ocl( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ) { if( !dxva2->nv12toyuv ) { if( hb_nv12toyuv_create_cl_buf( kenv, w, h, dxva2 ) ) { hb_log( "OpenCL: nv12toyuv_create_cl_buf fail" ); return -1; } if (!dxva2->nv12toyuv_tmp_in) { dxva2->nv12toyuv_tmp_in = malloc (w*h*3/2); } if (!dxva2->nv12toyuv_tmp_out) { dxva2->nv12toyuv_tmp_out = malloc (w*h*3/2); } hb_nv12toyuv_create_cl_kernel( kenv, dxva2 ); } return 0; } /** * copy_plane * @param dst - * @param src - * @param dstride - * @param sstride - * @param h - */ static uint8_t *copy_plane( uint8_t *dst, uint8_t* src, int dstride, int sstride, int h ) { if ( dstride == sstride ) { memcpy( dst, src, dstride * h ); return dst + dstride * h; } int lbytes = dstride <= sstride? dstride : sstride; while ( --h >= 0 ) { memcpy( dst, src, lbytes ); src += sstride; dst += dstride; } return dst; } /** * Run nv12 to yuv kernel. 
*/
/*
 * Callback run for the "nv12toyuv" kernel.
 *
 * userdata[] layout, as unpacked below:
 *   [0] w            frame width  (int stored in a pointer slot)
 *   [1] h            frame height (int stored in a pointer slot)
 *   [2] bufi1        first source plane, copied as dxva2->height rows
 *   [3] crop         int[4]; [0]+[1] are removed from the height,
 *                    [2]+[3] from the width
 *   [4] dxva2        decoder state holding the CL objects and staging buffers
 *   [5] bufi2        second source plane, copied as dxva2->height / 2 rows
 *   [6] p            source row pitch in bytes
 *   [7] decomb       (int stored in a pointer slot)
 *   [8] detelecine   (int stored in a pointer slot)
 *
 * Steps:
 *   1. Initialize the CL objects if needed and bind the kernel arguments.
 *   2. Pack both source planes contiguously and upload them to
 *      dxva2->cl_mem_nv12 (mapped write on AMD, clEnqueueWriteBuffer otherwise).
 *   3. Enqueue the kernel over a (w/2, h/2) global range.
 *   4. If any crop value is non-zero and neither decomb nor detelecine is
 *      requested: read the result back, crop it on the host and write the
 *      cropped planes back into dxva2->cl_mem_yuv.
 *
 * NOTE(review): the packing loop uses dxva2->width / dxva2->height while the
 * CL buffers are sized from w / h; this assumes both describe the same
 * frame size - confirm against the caller.
 *
 * @return 0 on success, non-zero on failure.
 */
static int hb_nv12toyuv( void **userdata, KernelEnv *kenv )
{
    int status;
    /* Integers travel through void* slots; convert via intptr_t. */
    int w = (int)(intptr_t)userdata[0];
    int h = (int)(intptr_t)userdata[1];
    uint8_t *bufi1 = userdata[2];
    int *crop = userdata[3];
    hb_va_dxva2_t *dxva2 = userdata[4];
    uint8_t *bufi2 = userdata[5];
    int p = (int)(intptr_t)userdata[6];
    int decomb = (int)(intptr_t)userdata[7];
    int detelecine = (int)(intptr_t)userdata[8];
    int i;

    if( hb_init_nv12toyuv_ocl( kenv, w, h, dxva2 ) )
    {
        return -1;
    }

    if( hb_nv12toyuv_setkernelarg( kenv, w, h, dxva2 ) )
    {
        return -1;
    }

    int in_bytes = w*h*3/2;

    /* Destination for the packed frame: the mapped device buffer on AMD,
     * a temporary host buffer otherwise. */
    uint8_t *nv12;
    if( kenv->isAMD )
    {
        /* clEnqueueMapBuffer takes blocking_map BEFORE map_flags. */
        nv12 = clEnqueueMapBuffer( kenv->command_queue, dxva2->cl_mem_nv12, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, in_bytes, 0, NULL, NULL, &status );
        if( status != CL_SUCCESS || nv12 == NULL )
        {
            hb_log( "OpenCL: nv12toyuv map input buffer fail" );
            return -1;
        }
    }
    else
    {
        nv12 = malloc( dxva2->width * dxva2->height * 3 / 2 );
        if( nv12 == NULL )
        {
            hb_log( "OpenCL: nv12toyuv input staging allocation fail" );
            return -1;
        }
    }

    /* Strip the source pitch: first plane, then second plane right behind it. */
    for( i = 0; i < dxva2->height; i++ )
    {
        memcpy( nv12 + i * dxva2->width, bufi1 + i * p, dxva2->width );
        if( i < ( dxva2->height >> 1 ) )
        {
            memcpy( nv12 + ( dxva2->width * dxva2->height ) + i * dxva2->width, bufi2 + i * p, dxva2->width );
        }
    }

    if( kenv->isAMD )
    {
        status = clEnqueueUnmapMemObject( kenv->command_queue, dxva2->cl_mem_nv12, nv12, 0, NULL, NULL );
    }
    else
    {
        status = clEnqueueWriteBuffer( kenv->command_queue, dxva2->cl_mem_nv12, CL_TRUE, 0, in_bytes, nv12, 0, NULL, NULL );
        /* Blocking write has completed; free before looking at the result
         * so the error path does not leak. */
        free( nv12 );
    }
    if( status != CL_SUCCESS )
    {
        hb_log( "OpenCL: nv12toyuv upload input frame fail" );
        return -1;
    }

    size_t gdim[2] = {w>>1, h>>1};
    OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );

    if( (crop[0] || crop[1] || crop[2] || crop[3]) && (decomb == 0) && (detelecine == 0) )
    {
        AVPicture pic_in;
        AVPicture pic_crop;

        status = clEnqueueReadBuffer( kenv->command_queue, dxva2->cl_mem_yuv, CL_TRUE, 0, in_bytes, dxva2->nv12toyuv_tmp_out, 0, NULL, NULL );
        if( status != CL_SUCCESS )
        {
            hb_log( "OpenCL: nv12toyuv read back converted frame fail" );
            return -1;
        }

        hb_buffer_t *in = hb_video_buffer_init( w, h );
        if( in == NULL )
        {
            hb_log( "OpenCL: nv12toyuv crop buffer allocation fail" );
            return -1;
        }

        /* Spread the tightly packed planes into the frame buffer's strides. */
        int wmp = in->plane[0].stride;
        int hmp = in->plane[0].height;
        copy_plane( in->plane[0].data, dxva2->nv12toyuv_tmp_out, wmp, w, hmp );
        wmp = in->plane[1].stride;
        hmp = in->plane[1].height;
        copy_plane( in->plane[1].data, dxva2->nv12toyuv_tmp_out + w * h, wmp, w>>1, hmp );
        wmp = in->plane[2].stride;
        hmp = in->plane[2].height;
        copy_plane( in->plane[2].data, dxva2->nv12toyuv_tmp_out + w * h +( ( w * h )>>2 ), wmp, w>>1, hmp );

        hb_avpicture_fill( &pic_in, in );
        av_picture_crop( &pic_crop, &pic_in, in->f.fmt, crop[0], crop[2] );

        int ww = w - ( crop[2] + crop[3] ), hh = h - ( crop[0] + crop[1] );

        /* Repack the cropped picture tightly: two luma rows and one row of
         * each chroma plane per iteration. */
        for( i = 0; i < ( hh >> 1 ); i++ )
        {
            memcpy( dxva2->nv12toyuv_tmp_in + ( ( i << 1 ) + 0 ) * ww, pic_crop.data[0]+ ( ( i << 1 ) + 0 ) * pic_crop.linesize[0], ww );
            memcpy( dxva2->nv12toyuv_tmp_in + ( ( i << 1 ) + 1 ) * ww, pic_crop.data[0]+ ( ( i << 1 ) + 1 ) * pic_crop.linesize[0], ww );
            memcpy( dxva2->nv12toyuv_tmp_in + ( ww * hh ) + i * ( ww >> 1 ), pic_crop.data[1] + i * pic_crop.linesize[1], ww >> 1 );
            memcpy( dxva2->nv12toyuv_tmp_in + ( ww * hh ) + ( ( ww * hh )>>2 ) + i * ( ww >> 1 ), pic_crop.data[2] + i * pic_crop.linesize[2], ww >> 1 );
        }

        /* pic_crop points into 'in'; all reads from it are done, so release
         * it now - none of the exits below can leak it. */
        hb_buffer_close( &in );

        if( kenv->isAMD )
        {
            /* Same argument order as above: blocking_map, then map_flags. */
            uint8_t *yuv = clEnqueueMapBuffer( kenv->command_queue, dxva2->cl_mem_yuv, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, ww * hh * 3 / 2, 0, NULL, NULL, &status );
            if( status != CL_SUCCESS || yuv == NULL )
            {
                hb_log( "OpenCL: nv12toyuv map output buffer fail" );
                return -1;
            }
            memcpy( yuv, dxva2->nv12toyuv_tmp_in, ww * hh * 3 / 2 );
            status = clEnqueueUnmapMemObject( kenv->command_queue, dxva2->cl_mem_yuv, yuv, 0, NULL, NULL );
            if( status != CL_SUCCESS )
            {
                hb_log( "OpenCL: nv12toyuv unmap output buffer fail" );
                return -1;
            }
        }
        else
        {
            OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, dxva2->cl_mem_yuv, CL_TRUE, 0, in_bytes, dxva2->nv12toyuv_tmp_in, 0, NULL, NULL );
        }
    }
    return 0;
}

/**
 * register nv12 to yuv kernel.
*/ static int hb_nv12toyuv_reg_kernel( void ) { int st = hb_register_kernel_wrapper( "nv12toyuv", hb_nv12toyuv ); if( !st ) { hb_log( "OpenCL: register kernel[%s] failed", "nv12toyuv" ); return -1; } return 0; } /** * nv12 to yuv interface * bufi is input frame of nv12, w is input frame width, h is input frame height */ int hb_ocl_nv12toyuv( uint8_t *bufi[], int p, int w, int h, int *crop, hb_va_dxva2_t *dxva2, int decomb, int detelecine ) { void *userdata[9]; userdata[0] = (void*)w; userdata[1] = (void*)h; userdata[2] = bufi[0]; userdata[3] = crop; userdata[4] = dxva2; userdata[5] = bufi[1]; userdata[6] = (void*)p; userdata[7] = decomb; userdata[8] = detelecine; if( hb_nv12toyuv_reg_kernel() ) { return -1; } if( hb_run_kernel( "nv12toyuv", userdata ) ) { hb_log( "OpenCL: run kernel[nv12toyuv] failed" ); return -1; } return 0; } #endif #endif