author    | sr55 <[email protected]> | 2013-09-21 20:16:51 +0000
committer | sr55 <[email protected]> | 2013-09-21 20:16:51 +0000
commit    | f69b7f1dfc98c90d454078f1a3aabef3bae36fd2 (patch)
tree      | 4792942c10f6b1c4418228bcc53b648b773d5098 /libhb
parent    | 8bccfabca28d059978f1eb8e516592f4e2f06c1a (diff)
Merge in the OpenCL scaling code from the OpenCL branch to trunk.
Patch originally by the Multicoreware Inc team, followed by improvements and fixes by Micheal Wootton from AMD Inc.
OpenCL:
This patch implements Bicubic Scaling in OpenCL.
Note that HandBrake currently uses Lanczos for software scaling, so the apparent performance difference is larger than a like-for-like comparison would suggest, since bicubic is a cheaper filter than Lanczos even in software. We may offer bicubic as a software option later.
Bicubic scaling can look slightly sharper than the equivalent Lanczos encode and may increase file size somewhat. Whether the quality is better or worse depends on the scaling factor, the content, and personal preference for sharpness.
Comparing a custom HandBrake build that runs software bicubic against the OpenCL bicubic path, the performance increase is about 5~7% on average on a modern GPU.
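As an illustration of the filter involved (not the exact weights this patch uses; those are generated in libhb/oclscale.c and consumed by the kernels in libhb/openclkernels.h), a common Catmull-Rom bicubic kernel (a = -0.5) looks like this in C:

    /* Illustrative sketch only: Keys cubic / Catmull-Rom weights (a = -0.5).
     * The real weight tables live in libhb/oclscale.c and may differ. */
    #include <math.h>

    static float bicubic_weight( float x )
    {
        const float a = -0.5f; /* Catmull-Rom */
        x = fabsf( x );
        if( x <= 1.0f )
            return ( a + 2.0f ) * x * x * x - ( a + 3.0f ) * x * x + 1.0f;
        if( x < 2.0f )
            return a * x * x * x - 5.0f * a * x * x + 8.0f * a * x - 4.0f * a;
        return 0.0f;
    }

Each output sample is then a weighted sum over a 4x4 source neighbourhood, filtered separably along x and then y, which is what the horizontal and vertical scaling kernels implement.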
Hardware Decode via DXVA:
We also have optional DXVA decoding, which may be useful on slower or lower-end systems that have a capable GPU.
This is only available on input sources that use the libav decode path.
Most GPU decode hardware is designed for playback, so on a high-end CPU the GPU decoder will bottleneck the encode process.
Requires a GPU that supports OpenCL 1.1 or later.
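The patch's own device detection lives in libhb/openclwrapper.c; as a hedged, standalone sketch, a check for an OpenCL 1.1+ GPU using only standard OpenCL API calls could look like this:

    /* Standalone sketch of an "is an OpenCL 1.1+ GPU present" probe.
     * The detection HandBrake actually performs is in libhb/openclwrapper.c
     * and is more involved; this only shows the standard API calls. */
    #include <stdio.h>
    #ifdef __APPLE__
    #include <OpenCL/cl.h>
    #else
    #include <CL/cl.h>
    #endif

    static int have_opencl_1_1_gpu( void )
    {
        cl_platform_id platforms[8];
        cl_uint num_platforms = 0, i;

        if( clGetPlatformIDs( 8, platforms, &num_platforms ) != CL_SUCCESS )
            return 0;

        for( i = 0; i < num_platforms && i < 8; i++ )
        {
            cl_device_id device;
            cl_uint num_devices = 0;
            char version[64] = "";
            int major = 0, minor = 0;

            if( clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 1,
                                &device, &num_devices ) != CL_SUCCESS || !num_devices )
                continue;

            /* CL_DEVICE_VERSION is "OpenCL <major>.<minor> <vendor-specific>" */
            clGetDeviceInfo( device, CL_DEVICE_VERSION, sizeof( version ), version, NULL );
            if( sscanf( version, "OpenCL %d.%d", &major, &minor ) == 2 &&
                ( major > 1 || ( major == 1 && minor >= 1 ) ) )
                return 1;
        }
        return 0;
    }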
Front-end changes and the testing framework are not included in this patch; they will follow later.
The patch will be revised further before the UI is implemented.
git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@5792 b64f7644-9d1e-0410-96f1-a4d463321fa5
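Since the front end is not wired up in this patch, the following is only a hypothetical sketch of how a GUI might drive the new hb_job_s fields and the hb_use_dxva() helper added in the diff below; the eventual UI code may look quite different:

    /* Hypothetical front-end usage; 'h', 'title' and 'job' are assumed to come
     * from the scan/setup path the front end already has. Only use_opencl,
     * use_hwd and hb_use_dxva() are taken from this patch. */
    job->use_opencl = title->opencl_support;        /* OpenCL bicubic crop/scale     */
    job->use_hwd    = hb_use_dxva( title ) ? 1 : 0; /* DXVA decode (libav path only) */

    hb_add( h, job );
    hb_start( h );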
Diffstat (limited to 'libhb')
-rw-r--r-- | libhb/common.c | 13
-rw-r--r-- | libhb/common.h | 70
-rw-r--r-- | libhb/cropscale.c | 81
-rw-r--r-- | libhb/deca52.c | 6
-rw-r--r-- | libhb/decavcodec.c | 235
-rw-r--r-- | libhb/detelecine.c | 2
-rw-r--r-- | libhb/dxva2api.c | 38
-rw-r--r-- | libhb/dxva2api.h | 822
-rw-r--r-- | libhb/fifo.c | 83
-rw-r--r-- | libhb/hb.c | 5
-rw-r--r-- | libhb/hbffmpeg.h | 1
-rw-r--r-- | libhb/internal.h | 9
-rw-r--r-- | libhb/module.defs | 13
-rw-r--r-- | libhb/oclnv12toyuv.c | 285
-rw-r--r-- | libhb/oclnv12toyuv.h | 35
-rw-r--r-- | libhb/oclscale.c | 271
-rw-r--r-- | libhb/openclkernels.h | 771
-rw-r--r-- | libhb/openclwrapper.c | 1261
-rw-r--r-- | libhb/openclwrapper.h | 89
-rw-r--r-- | libhb/scale.c | 1020
-rw-r--r-- | libhb/scale.h | 324
-rw-r--r-- | libhb/scale_kernel.c | 223
-rw-r--r-- | libhb/scale_kernel.h | 20
-rw-r--r-- | libhb/stream.c | 56
-rw-r--r-- | libhb/vadxva2.c | 812
-rw-r--r-- | libhb/vadxva2.h | 213
-rw-r--r-- | libhb/work.c | 22
27 files changed, 6720 insertions, 60 deletions
diff --git a/libhb/common.c b/libhb/common.c index 18dd9fc53..3d5cf36fb 100644 --- a/libhb/common.c +++ b/libhb/common.c @@ -2829,6 +2829,7 @@ static void job_reset_for_mac_ui( hb_job_t * job, hb_title_t * title ) job->metadata = hb_metadata_copy( title->metadata ); } + static void job_setup( hb_job_t * job, hb_title_t * title ) { if ( job == NULL || title == NULL ) @@ -4129,3 +4130,15 @@ void hb_hexdump( hb_debug_level_t level, const char * label, const uint8_t * dat hb_deep_log( level, " %-50s%20s", line, ascii ); } } + +int hb_gui_use_hwd_flag = 0; +int hb_use_dxva( hb_title_t * title ) +{ + return ( (title->video_codec_param == AV_CODEC_ID_MPEG2VIDEO + || title->video_codec_param == AV_CODEC_ID_H264 + || title->video_codec_param == AV_CODEC_ID_VC1 + || title->video_codec_param == AV_CODEC_ID_WMV3 + || title->video_codec_param == AV_CODEC_ID_MPEG4 ) + && title->opaque_priv ); +} + diff --git a/libhb/common.h b/libhb/common.h index 19c1049cd..abc799ef1 100644 --- a/libhb/common.h +++ b/libhb/common.h @@ -20,7 +20,13 @@ #include <sys/types.h> #include <sys/stat.h> #include <dirent.h> - +#ifdef USE_OPENCL +#if defined(__APPLE__) +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif +#endif /* * It seems WinXP doesn't align the stack of new threads to 16 bytes. * To prevent crashes in SSE functions, we need to force stack alignement @@ -154,6 +160,7 @@ int hb_subtitle_can_burn( int source ); int hb_subtitle_can_pass( int source, int mux ); hb_attachment_t *hb_attachment_copy(const hb_attachment_t *src); + hb_list_t *hb_attachment_list_copy(const hb_list_t *src); void hb_attachment_close(hb_attachment_t **attachment); @@ -353,6 +360,8 @@ struct hb_title_set_s int feature; // Detected DVD feature title }; +extern int hb_gui_use_hwd_flag; + /****************************************************************************** * hb_job_t: settings to be filled by the UI * Update win/CS/HandBrake.Interop/HandBrakeInterop/HbLib/hb_job_s.cs when changing this struct @@ -370,7 +379,7 @@ struct hb_job_s int chapter_start; int chapter_end; - /* Include chapter marker track in mp4? */ + /* Include chapter marker track in mp4? */ int chapter_markers; /* Picture settings: @@ -518,6 +527,10 @@ struct hb_job_s uint32_t frames_to_skip; // decode but discard this many frames // initially (for frame accurate positioning // to non-I frames). + int use_opencl;/* 0 is disable use of opencl. 1 is enable use of opencl */ + int use_hwd; + int use_decomb; + int use_detelecine; #ifdef USE_QSV // QSV-specific settings struct @@ -895,6 +908,8 @@ struct hb_title_s uint32_t flags; // set if video stream doesn't have IDR frames + int opencl_support; + int hwd_support; #define HBTF_NO_IDR (1 << 0) #define HBTF_SCAN_COMPLETE (1 << 0) }; @@ -1070,6 +1085,52 @@ extern hb_work_object_t hb_reader; #define HB_FILTER_DROP 3 #define HB_FILTER_DONE 4 +typedef struct hb_oclscale_s +{ +#ifdef USE_OPENCL + int initialized; + // bicubic scale weights + cl_mem bicubic_x_weights; + cl_mem bicubic_y_weights; + cl_float xscale; + cl_float yscale; + int width; + int height; + // horizontal scaling and vertical scaling kernel handle + cl_kernel m_kernel; + int use_ocl_mem; // 0 use host memory. 
1 use gpu oclmem +#endif +} hb_oclscale_t; + +#ifdef USE_OPENCL +int hb_ocl_scale( hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os ); +#endif + +#ifdef USE_OPENCL +int hb_use_dxva( hb_title_t * title ); +// create opencl buffer +#define CREATEBUF( out, flags, size )\ + {\ + out = clCreateBuffer( kenv->context, (flags), (size), NULL, &status );\ + if( CL_SUCCESS != status ) return -1;\ + } + +#define OCLCHECK( method, ... )\ + status = method( __VA_ARGS__ ); if( status != CL_SUCCESS ) {\ + printf( # method " error '%d'\n", status ); return status; } + +#define CL_FREE( buf )\ +{\ + if( buf )\ + {\ + { clReleaseMemObject( buf ); }\ + buf = NULL;\ + }\ +} + + +#endif + typedef struct hb_filter_init_s { hb_job_t * job; @@ -1082,6 +1143,10 @@ typedef struct hb_filter_init_s int vrate_base; int vrate; int cfr; +#ifdef USE_OPENCL + int use_dxva; +#endif + } hb_filter_init_t; typedef struct hb_filter_info_s @@ -1140,6 +1205,7 @@ enum HB_FILTER_DENOISE, HB_FILTER_RENDER_SUB, HB_FILTER_CROP_SCALE, + // Finally filters that don't care what order they are in, // except that they must be after the above filters HB_FILTER_ROTATE, diff --git a/libhb/cropscale.c b/libhb/cropscale.c index d217393d1..c7d7d9948 100644 --- a/libhb/cropscale.c +++ b/libhb/cropscale.c @@ -9,9 +9,12 @@ #include "hb.h" #include "hbffmpeg.h" +#include "common.h" + struct hb_filter_private_s { + hb_job_t *job; int width_in; int height_in; int pix_fmt; @@ -19,6 +22,13 @@ struct hb_filter_private_s int width_out; int height_out; int crop[4]; + +#ifdef USE_OPENCL + int use_dxva; + int use_decomb; + int use_detelecine; + hb_oclscale_t *os; //ocl scaler handler +#endif struct SwsContext * context; }; @@ -53,11 +63,23 @@ static int hb_crop_scale_init( hb_filter_object_t * filter, hb_filter_private_t * pv = filter->private_data; // TODO: add pix format option to settings + pv->job = init->job; pv->pix_fmt_out = init->pix_fmt; pv->width_in = init->width; pv->height_in = init->height; pv->width_out = init->width - (init->crop[2] + init->crop[3]); pv->height_out = init->height - (init->crop[0] + init->crop[1]); +#ifdef USE_OPENCL + pv->use_dxva = init->use_dxva; + pv->use_decomb = init->job->use_decomb; + pv->use_detelecine = init->job->use_detelecine; + + if( pv->job->use_opencl ) + { + pv->os = ( hb_oclscale_t * )malloc( sizeof( hb_oclscale_t ) ); + memset( pv->os, 0, sizeof( hb_oclscale_t ) ); + } +#endif memcpy( pv->crop, init->crop, sizeof( int[4] ) ); if( filter->settings ) { @@ -71,6 +93,9 @@ static int hb_crop_scale_init( hb_filter_object_t * filter, init->width = pv->width_out; init->height = pv->height_out; memcpy( init->crop, pv->crop, sizeof( int[4] ) ); +#ifdef USE_OPENCL + pv->use_dxva = init->use_dxva; +#endif return 0; } @@ -111,8 +136,16 @@ static void hb_crop_scale_close( hb_filter_object_t * filter ) { return; } +#ifdef USE_OPENCL - if ( pv->context ) + if( pv->job->use_opencl && pv->os ) + { + CL_FREE( pv->os->bicubic_x_weights ); + CL_FREE( pv->os->bicubic_y_weights ); + free( pv->os ); + } +#endif + if( pv->context ) { sws_freeContext( pv->context ); } @@ -121,6 +154,25 @@ static void hb_crop_scale_close( hb_filter_object_t * filter ) filter->private_data = NULL; } +#ifdef USE_OPENCL +static uint8_t *copy_plane( uint8_t *dst, uint8_t* src, int dstride, int sstride, int h ) +{ + if( dstride == sstride ) + { + memcpy( dst, src, dstride * h ); + return dst + dstride * h; + } + int lbytes = dstride <= sstride ? 
dstride : sstride; + while( --h >= 0 ) + { + memcpy( dst, src, lbytes ); + src += sstride; + dst += dstride; + } + return dst; +} +#endif + static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in ) { AVPicture pic_in; @@ -137,6 +189,15 @@ static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in ) av_picture_crop( &pic_crop, &pic_in, in->f.fmt, pv->crop[0], pv->crop[2] ); +#ifdef USE_OPENCL + // Use bicubic OpenCL scaling when selected and when downsampling < 4:1; + if ((pv->job->use_opencl) && (pv->width_out * 4 > pv->width_in) && (in->cl.buffer != NULL) && (out->cl.buffer != NULL)) + { + hb_ocl_scale(in, out, pv->crop, pv->os); + } + else + { +#endif if ( !pv->context || pv->width_in != in->f.width || pv->height_in != in->f.height || @@ -164,7 +225,9 @@ static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in ) pic_crop.linesize, 0, in->f.height - (pv->crop[0] + pv->crop[1]), pic_out.data, pic_out.linesize); - +#ifdef USE_OPENCL + } +#endif out->s = in->s; hb_buffer_move_subs( out, in ); return out; @@ -198,6 +261,18 @@ static int hb_crop_scale_work( hb_filter_object_t * filter, pv->width_out = in->f.width - (pv->crop[2] + pv->crop[3]); pv->height_out = in->f.height - (pv->crop[0] + pv->crop[1]); } +#ifdef USE_OPENCL + if ( (in->f.fmt == pv->pix_fmt_out && + !pv->crop[0] && !pv->crop[1] && !pv->crop[2] && !pv->crop[3] && + in->f.width == pv->width_out && in->f.height == pv->height_out) && + (pv->use_decomb == 0) && (pv->use_detelecine == 0) || + (pv->use_dxva && in->f.width == pv->width_out && in->f.height == pv->height_out) ) + { + *buf_out = in; + *buf_in = NULL; + return HB_FILTER_OK; + } +#else if ( in->f.fmt == pv->pix_fmt_out && !pv->crop[0] && !pv->crop[1] && !pv->crop[2] && !pv->crop[3] && in->f.width == pv->width_out && in->f.height == pv->height_out ) @@ -206,6 +281,8 @@ static int hb_crop_scale_work( hb_filter_object_t * filter, *buf_in = NULL; return HB_FILTER_OK; } +#endif + *buf_out = crop_scale( pv, in ); return HB_FILTER_OK; diff --git a/libhb/deca52.c b/libhb/deca52.c index 3c78d157c..f5dac1f96 100644 --- a/libhb/deca52.c +++ b/libhb/deca52.c @@ -336,7 +336,11 @@ static hb_buffer_t* Decode(hb_work_object_t *w) } double frame_dur = (6. * 256. * 90000.) / pv->rate; - double pts = (ipts != -1) ? (double)ipts : pv->next_expected_pts; + double pts; + if (hb_gui_use_hwd_flag == 1 && ipts != -1) + pts = ((double)ipts >= pv->next_expected_pts) ? (double)ipts : pv->next_expected_pts; + else + pts = (ipts != -1) ? 
(double)ipts : pv->next_expected_pts; /* AC3 passthrough: don't decode the AC3 frame */ if (audio->config.out.codec == HB_ACODEC_AC3_PASS) diff --git a/libhb/decavcodec.c b/libhb/decavcodec.c index 0e760b8db..84ad5103c 100644 --- a/libhb/decavcodec.c +++ b/libhb/decavcodec.c @@ -42,6 +42,10 @@ #include "hbffmpeg.h" #include "audio_resample.h" +#ifdef USE_HWD +#include "vadxva2.h" +#endif + #ifdef USE_QSV #include "enc_qsv.h" #include "qsv_common.h" @@ -104,7 +108,11 @@ struct hb_work_private_s int sws_pix_fmt; int cadence[12]; int wait_for_keyframe; - +#ifdef USE_HWD + hb_va_dxva2_t *dxva2; + uint8_t *dst_frame; + hb_oclscale_t *opencl_scale; +#endif hb_audio_resample_t *resample; #ifdef USE_QSV @@ -381,6 +389,22 @@ static void closePrivData( hb_work_private_t ** ppv ) hb_list_empty( &pv->list ); } hb_audio_resample_free(pv->resample); + +#ifdef USE_HWD + if ( pv->opencl_scale ) + { + free( pv->opencl_scale ); + } + + if ( pv->dxva2 ) + { +#ifdef USE_OPENCL + CL_FREE( pv->dxva2->cl_mem_nv12 ); +#endif + hb_va_close( pv->dxva2 ); + } +#endif + #ifdef USE_QSV_PTS_WORKAROUND if (pv->qsv.decode && pv->qsv.pts_list != NULL) @@ -394,6 +418,7 @@ static void closePrivData( hb_work_private_t ** ppv ) hb_list_close(&pv->qsv.pts_list); } #endif + free( pv ); } *ppv = NULL; @@ -402,7 +427,9 @@ static void closePrivData( hb_work_private_t ** ppv ) static void decavcodecClose( hb_work_object_t * w ) { hb_work_private_t * pv = w->private_data; - +#ifdef USE_HWD + if( pv->dst_frame ) free( pv->dst_frame ); +#endif if ( pv ) { closePrivData( &pv ); @@ -686,11 +713,47 @@ static hb_buffer_t *copy_frame( hb_work_private_t *pv, AVFrame *frame ) } else { - w = pv->job->title->width; - h = pv->job->title->height; + w = pv->job->title->width; + h = pv->job->title->height; } - hb_buffer_t *buf = hb_video_buffer_init( w, h ); +#ifdef USE_HWD + if (pv->dxva2 && pv->job) + { + hb_buffer_t *buf; + int ww, hh; + + buf = hb_video_buffer_init( w, h ); + ww = w; + hh = h; + + if( !pv->dst_frame ) + { + pv->dst_frame = malloc( ww * hh * 3 / 2 ); + } + if( hb_va_extract( pv->dxva2, pv->dst_frame, frame, pv->job->width, pv->job->height, pv->job->title->crop, pv->opencl_scale, pv->job->use_opencl, pv->job->use_decomb, pv->job->use_detelecine ) == HB_WORK_ERROR ) + { + hb_log( "hb_va_Extract failed!!!!!!" 
); + } + w = buf->plane[0].stride; + h = buf->plane[0].height; + uint8_t *dst = buf->plane[0].data; + copy_plane( dst, pv->dst_frame, w, ww, h ); + w = buf->plane[1].stride; + h = buf->plane[1].height; + dst = buf->plane[1].data; + copy_plane( dst, pv->dst_frame + ww * hh, w, ww >> 1, h ); + w = buf->plane[2].stride; + h = buf->plane[2].height; + dst = buf->plane[2].data; + copy_plane( dst, pv->dst_frame + ww * hh +( ( ww * hh ) >> 2 ), w, ww >> 1, h ); + return buf; + } + else +#endif + { + hb_buffer_t *buf = hb_video_buffer_init( w, h ); + #ifdef USE_QSV // no need to copy the frame data when decoding with QSV to opaque memory if (pv->qsv.decode && @@ -701,52 +764,90 @@ static hb_buffer_t *copy_frame( hb_work_private_t *pv, AVFrame *frame ) } #endif - uint8_t *dst = buf->data; + uint8_t *dst = buf->data; - if (context->pix_fmt != AV_PIX_FMT_YUV420P || w != context->width || - h != context->height) - { - // have to convert to our internal color space and/or rescale - AVPicture dstpic; - hb_avpicture_fill(&dstpic, buf); - - if (pv->sws_context == NULL || - pv->sws_width != context->width || - pv->sws_height != context->height || - pv->sws_pix_fmt != context->pix_fmt) + if (context->pix_fmt != AV_PIX_FMT_YUV420P || w != context->width || + h != context->height) + { + // have to convert to our internal color space and/or rescale + AVPicture dstpic; + hb_avpicture_fill(&dstpic, buf); + + if (pv->sws_context == NULL || + pv->sws_width != context->width || + pv->sws_height != context->height || + pv->sws_pix_fmt != context->pix_fmt) + { + if (pv->sws_context != NULL) + sws_freeContext(pv->sws_context); + pv->sws_context = hb_sws_get_context(context->width, + context->height, + context->pix_fmt, + w, h, AV_PIX_FMT_YUV420P, + SWS_LANCZOS|SWS_ACCURATE_RND); + pv->sws_width = context->width; + pv->sws_height = context->height; + pv->sws_pix_fmt = context->pix_fmt; + } + sws_scale(pv->sws_context, + (const uint8_t* const *)frame->data, frame->linesize, + 0, context->height, dstpic.data, dstpic.linesize); + } + else { - if (pv->sws_context != NULL) - sws_freeContext(pv->sws_context); - pv->sws_context = hb_sws_get_context(context->width, - context->height, - context->pix_fmt, - w, h, AV_PIX_FMT_YUV420P, - SWS_LANCZOS|SWS_ACCURATE_RND); - pv->sws_width = context->width; - pv->sws_height = context->height; - pv->sws_pix_fmt = context->pix_fmt; + w = buf->plane[0].stride; + h = buf->plane[0].height; + dst = buf->plane[0].data; + copy_plane( dst, frame->data[0], w, frame->linesize[0], h ); + w = buf->plane[1].stride; + h = buf->plane[1].height; + dst = buf->plane[1].data; + copy_plane( dst, frame->data[1], w, frame->linesize[1], h ); + w = buf->plane[2].stride; + h = buf->plane[2].height; + dst = buf->plane[2].data; + copy_plane( dst, frame->data[2], w, frame->linesize[2], h ); } - sws_scale(pv->sws_context, - (const uint8_t* const *)frame->data, frame->linesize, - 0, context->height, dstpic.data, dstpic.linesize); + return buf; + } +} + +#ifdef USE_HWD + +static int get_frame_buf_hwd( AVCodecContext *context, AVFrame *frame ) +{ + + hb_work_private_t *pv = (hb_work_private_t*)context->opaque; + if ( (pv != NULL) && pv->dxva2 ) + { + int result = HB_WORK_ERROR; + hb_work_private_t *pv = (hb_work_private_t*)context->opaque; + result = hb_va_get_frame_buf( pv->dxva2, context, frame ); + if( result == HB_WORK_ERROR ) + return avcodec_default_get_buffer( context, frame ); + return 0; } else + return avcodec_default_get_buffer( context, frame ); +} + +static void hb_ffmpeg_release_frame_buf( struct 
AVCodecContext *p_context, AVFrame *frame ) +{ + hb_work_private_t *p_dec = (hb_work_private_t*)p_context->opaque; + int i; + if( p_dec->dxva2 ) { - w = buf->plane[0].stride; - h = buf->plane[0].height; - dst = buf->plane[0].data; - copy_plane( dst, frame->data[0], w, frame->linesize[0], h ); - w = buf->plane[1].stride; - h = buf->plane[1].height; - dst = buf->plane[1].data; - copy_plane( dst, frame->data[1], w, frame->linesize[1], h ); - w = buf->plane[2].stride; - h = buf->plane[2].height; - dst = buf->plane[2].data; - copy_plane( dst, frame->data[2], w, frame->linesize[2], h ); + hb_va_release( p_dec->dxva2, frame ); + } + else if( !frame->opaque ) + { + if( frame->type == FF_BUFFER_TYPE_INTERNAL ) + avcodec_default_release_buffer( p_context, frame ); } - return buf; + for( i = 0; i < 4; i++ ) + frame->data[i] = NULL; } +#endif static void log_chapter( hb_work_private_t *pv, int chap_num, int64_t pts ) { @@ -979,16 +1080,27 @@ static int decodeFrame( hb_work_object_t *w, uint8_t *data, int size, int sequen if ( !pv->frame_duration_set ) compute_frame_duration( pv ); + double pts; double frame_dur = pv->duration; if ( frame.repeat_pict ) { frame_dur += frame.repeat_pict * pv->field_duration; } - +#ifdef USE_HWD + if( pv->dxva2 && pv->dxva2->do_job == HB_WORK_OK ) + { + if( avp.pts>0 ) + { + if( pv->dxva2->input_pts[0] != 0 && pv->dxva2->input_pts[1] == 0 ) + frame.pkt_pts = pv->dxva2->input_pts[0]; + else + frame.pkt_pts = pv->dxva2->input_pts[0]<pv->dxva2->input_pts[1] ? pv->dxva2->input_pts[0] : pv->dxva2->input_pts[1]; + } + } +#endif // If there was no pts for this frame, assume constant frame rate // video & estimate the next frame time from the last & duration. - double pts; - if (frame.pkt_pts == AV_NOPTS_VALUE) + if (frame.pkt_pts == AV_NOPTS_VALUE || hb_gui_use_hwd_flag == 1) { pts = pv->pts_next; } @@ -996,6 +1108,7 @@ static int decodeFrame( hb_work_object_t *w, uint8_t *data, int size, int sequen { pts = frame.pkt_pts; } + pv->pts_next = pts + frame_dur; if ( frame.top_field_first ) @@ -1252,6 +1365,25 @@ static int decavcodecvInit( hb_work_object_t * w, hb_job_t * job ) pv->context->workaround_bugs = FF_BUG_AUTODETECT; pv->context->err_recognition = AV_EF_CRCCHECK; pv->context->error_concealment = FF_EC_GUESS_MVS|FF_EC_DEBLOCK; +#ifdef USE_HWD + if ( pv->job && job->use_hwd && hb_use_dxva( pv->title ) ) + { + pv->dxva2 = hb_va_create_dxva2( pv->dxva2, w->codec_param ); + if( pv->dxva2 && pv->dxva2->do_job == HB_WORK_OK ) + { + hb_va_new_dxva2( pv->dxva2, pv->context ); + pv->context->slice_flags |= SLICE_FLAG_ALLOW_FIELD; + pv->context->opaque = pv; + pv->context->get_buffer = get_frame_buf_hwd; + pv->context->release_buffer = hb_ffmpeg_release_frame_buf; + pv->context->get_format = hb_ffmpeg_get_format; + pv->opencl_scale = ( hb_oclscale_t * )malloc( sizeof( hb_oclscale_t ) ); + memset( pv->opencl_scale, 0, sizeof( hb_oclscale_t ) ); + pv->threads = 1; + } + } +#endif + #ifdef USE_QSV if (pv->qsv.decode) @@ -1488,6 +1620,16 @@ static int decavcodecvWork( hb_work_object_t * w, hb_buffer_t ** buf_in, pv->new_chap = in->s.new_chap; pv->chap_time = pts >= 0? 
pts : pv->pts_next; } +#ifdef USE_HWD + if( pv->dxva2 && pv->dxva2->do_job == HB_WORK_OK ) + { + if( pv->dxva2->input_pts[0] <= pv->dxva2->input_pts[1] ) + pv->dxva2->input_pts[0] = pts; + else if( pv->dxva2->input_pts[0] > pv->dxva2->input_pts[1] ) + pv->dxva2->input_pts[1] = pts; + pv->dxva2->input_dts = dts; + } +#endif decodeVideo( w, in->data, in->size, in->sequence, pts, dts, in->s.frametype ); hb_buffer_close( &in ); *buf_out = link_buf_list( pv ); @@ -1746,7 +1888,6 @@ hb_work_object_t hb_decavcodecv = .info = decavcodecvInfo, .bsinfo = decavcodecvBSInfo }; - static void decodeAudio(hb_audio_t *audio, hb_work_private_t *pv, uint8_t *data, int size, int64_t pts) { diff --git a/libhb/detelecine.c b/libhb/detelecine.c index 91c97e878..2fdc48038 100644 --- a/libhb/detelecine.c +++ b/libhb/detelecine.c @@ -485,7 +485,7 @@ static void pullup_print_aff_and_breaks(struct pullup_context * c, int i; struct pullup_field * f0 = f; const char aff_l[] = "+..", aff_r[] = "..+"; - printf( "\naffinity: " ); + hb_log( "affinity: " ); for( i = 0; i < 4; i++ ) { printf( "%c%d%c", diff --git a/libhb/dxva2api.c b/libhb/dxva2api.c new file mode 100644 index 000000000..004cd681a --- /dev/null +++ b/libhb/dxva2api.c @@ -0,0 +1,38 @@ +/* dxva2api.c + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + */ +#ifdef USE_HWD +#include "dxva2api.h" + +__inline float hb_dx_fixedtofloat( const DXVA2_Fixed32 _fixed_ ) +{ + return (FLOAT)_fixed_.Value + (FLOAT)_fixed_.Fraction / 0x10000; +} + +__inline const DXVA2_Fixed32 hb_dx_fixed32_opaque_alpha() +{ + DXVA2_Fixed32 _fixed_; + _fixed_.Fraction = 0; + _fixed_.Value = 0; + _fixed_.ll = 1; + return _fixed_; +} + + +__inline DXVA2_Fixed32 hb_dx_floattofixed( const float _float_ ) +{ + DXVA2_Fixed32 _fixed_; + _fixed_.Fraction = LOWORD( _float_ * 0x10000 ); + _fixed_.Value = HIWORD( _float_ * 0x10000 ); + return _fixed_; +} +#endif diff --git a/libhb/dxva2api.h b/libhb/dxva2api.h new file mode 100644 index 000000000..915fc0085 --- /dev/null +++ b/libhb/dxva2api.h @@ -0,0 +1,822 @@ +/* dxva2api.h + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + */ + + +#ifndef _DXVA2API_H +#define _DXVA2API_H +#ifdef USE_HWD +#define MINGW_DXVA2API_H_VERSION (2) + +#if __GNUC__ >= 3 +#pragma GCC system_header +#endif + +#include <objbase.h> +#include <d3d9.h> + +/* Define it to allow using nameless struct/union (non C99 compliant) to match + * the official documentation. 
*/ +//#define DXVA2API_USE_BITFIELDS + +/****************STRUCTURES******************/ +#pragma pack(push, 1) + +#define DXVA2API_USE_BITFIELDS + +typedef struct _DXVA2_ExtendedFormat { +#ifdef DXVA2API_USE_BITFIELDS + union { + struct { + UINT SampleFormat : 8; + UINT VideoChromaSubsampling : 4; + UINT NominalRange : 3; + UINT VideoTransferMatrix : 3; + UINT VideoLighting : 4; + UINT VideoPrimaries : 5; + UINT VideoTransferFunction : 5; + }; + UINT value; + }; +#else + UINT value; +#endif +} DXVA2_ExtendedFormat; + +typedef struct _DXVA2_Frequency { + UINT Numerator; + UINT Denominator; +} DXVA2_Frequency; + +typedef struct _DXVA2_VideoDesc { + UINT SampleWidth; + UINT SampleHeight; + DXVA2_ExtendedFormat SampleFormat; + D3DFORMAT Format; + DXVA2_Frequency InputSampleFreq; + DXVA2_Frequency OutputFrameFreq; + UINT UABProtectionLevel; + UINT Reserved; +} DXVA2_VideoDesc; + +typedef struct _DXVA2_ConfigPictureDecode { + GUID guidConfigBitstreamEncryption; + GUID guidConfigMBcontrolEncryption; + GUID guidConfigResidDiffEncryption; + UINT ConfigBitstreamRaw; + UINT ConfigMBcontrolRasterOrder; + UINT ConfigResidDiffHost; + UINT ConfigSpatialResid8; + UINT ConfigResid8Subtraction; + UINT ConfigSpatialHost8or9Clipping; + UINT ConfigSpatialResidInterleaved; + UINT ConfigIntraResidUnsigned; + UINT ConfigResidDiffAccelerator; + UINT ConfigHostInverseScan; + UINT ConfigSpecificIDCT; + UINT Config4GroupedCoefs; + USHORT ConfigMinRenderTargetBuffCount; + USHORT ConfigDecoderSpecific; +} DXVA2_ConfigPictureDecode; + +typedef struct _DXVA2_DecodeBufferDesc { + DWORD CompressedBufferType; + UINT BufferIndex; + UINT DataOffset; + UINT DataSize; + UINT FirstMBaddress; + UINT NumMBsInBuffer; + UINT Width; + UINT Height; + UINT Stride; + UINT ReservedBits; + PVOID pvPVPState; +} DXVA2_DecodeBufferDesc; + +typedef struct _DXVA2_DecodeExtensionData { + UINT Function; + PVOID pPrivateInputData; + UINT PrivateInputDataSize; + PVOID pPrivateOutputData; + UINT PrivateOutputDataSize; +} DXVA2_DecodeExtensionData; + +typedef struct _DXVA2_DecodeExecuteParams { + UINT NumCompBuffers; + DXVA2_DecodeBufferDesc *pCompressedBuffers; + DXVA2_DecodeExtensionData *pExtensionData; +} DXVA2_DecodeExecuteParams; + +enum { + DXVA2_VideoDecoderRenderTarget = 0, + DXVA2_VideoProcessorRenderTarget= 1, + DXVA2_VideoSoftwareRenderTarget = 2 +}; + +enum { + DXVA2_PictureParametersBufferType = 0, + DXVA2_MacroBlockControlBufferType = 1, + DXVA2_ResidualDifferenceBufferType = 2, + DXVA2_DeblockingControlBufferType = 3, + DXVA2_InverseQuantizationMatrixBufferType = 4, + DXVA2_SliceControlBufferType = 5, + DXVA2_BitStreamDateBufferType = 6, + DXVA2_MotionVectorBuffer = 7, + DXVA2_FilmGrainBuffer = 8 +}; + +/* DXVA MPEG-I/II and VC-1 */ +typedef struct _DXVA_PictureParameters { + USHORT wDecodedPictureIndex; + USHORT wDeblockedPictureIndex; + USHORT wForwardRefPictureIndex; + USHORT wBackwardRefPictureIndex; + USHORT wPicWidthInMBminus1; + USHORT wPicHeightInMBminus1; + UCHAR bMacroblockWidthMinus1; + UCHAR bMacroblockHeightMinus1; + UCHAR bBlockWidthMinus1; + UCHAR bBlockHeightMinus1; + UCHAR bBPPminus1; + UCHAR bPicStructure; + UCHAR bSecondField; + UCHAR bPicIntra; + UCHAR bPicBackwardPrediction; + UCHAR bBidirectionalAveragingMode; + UCHAR bMVprecisionAndChromaRelation; + UCHAR bChromaFormat; + UCHAR bPicScanFixed; + UCHAR bPicScanMethod; + UCHAR bPicReadbackRequests; + UCHAR bRcontrol; + UCHAR bPicSpatialResid8; + UCHAR bPicOverflowBlocks; + UCHAR bPicExtrapolation; + UCHAR bPicDeblocked; + UCHAR bPicDeblockConfined; + UCHAR 
bPic4MVallowed; + UCHAR bPicOBMC; + UCHAR bPicBinPB; + UCHAR bMV_RPS; + UCHAR bReservedBits; + USHORT wBitstreamFcodes; + USHORT wBitstreamPCEelements; + UCHAR bBitstreamConcealmentNeed; + UCHAR bBitstreamConcealmentMethod; +} DXVA_PictureParameters, *LPDXVA_PictureParameters; + +typedef struct _DXVA_QmatrixData { + BYTE bNewQmatrix[4]; + WORD Qmatrix[4][8 * 8]; +} DXVA_QmatrixData, *LPDXVA_QmatrixData; + +typedef struct _DXVA_SliceInfo { + USHORT wHorizontalPosition; + USHORT wVerticalPosition; + UINT dwSliceBitsInBuffer; + UINT dwSliceDataLocation; + UCHAR bStartCodeBitOffset; + UCHAR bReservedBits; + USHORT wMBbitOffset; + USHORT wNumberMBsInSlice; + USHORT wQuantizerScaleCode; + USHORT wBadSliceChopping; +} DXVA_SliceInfo, *LPDXVA_SliceInfo; + +/* DXVA H264 */ +typedef struct { +#ifdef DXVA2API_USE_BITFIELDS + union { + struct { + UCHAR Index7Bits : 7; + UCHAR AssociatedFlag : 1; + }; + UCHAR bPicEntry; + }; +#else + UCHAR bPicEntry; +#endif +} DXVA_PicEntry_H264; + + +typedef struct { + USHORT wFrameWidthInMbsMinus1; + USHORT wFrameHeightInMbsMinus1; + DXVA_PicEntry_H264 CurrPic; + UCHAR num_ref_frames; +#ifdef DXVA2API_USE_BITFIELDS + union { + struct { + USHORT field_pic_flag : 1; + USHORT MbaffFrameFlag : 1; + USHORT residual_colour_transform_flag : 1; + USHORT sp_for_switch_flag : 1; + USHORT chroma_format_idc : 2; + USHORT RefPicFlag : 1; + USHORT constrained_intra_pred_flag : 1; + USHORT weighted_pred_flag : 1; + USHORT weighted_bipred_idc : 2; + USHORT MbsConsecutiveFlag : 1; + USHORT frame_mbs_only_flag : 1; + USHORT transform_8x8_mode_flag : 1; + USHORT MinLumaBipredSize8x8Flag : 1; + USHORT IntraPicFlag : 1; + }; + USHORT wBitFields; + }; +#else + USHORT wBitFields; +#endif + UCHAR bit_depth_luma_minus8; + UCHAR bit_depth_chroma_minus8; + USHORT Reserved16Bits; + UINT StatusReportFeedbackNumber; + DXVA_PicEntry_H264 RefFrameList[16]; + INT CurrFieldOrderCnt[2]; + INT FieldOrderCntList[16][2]; + CHAR pic_init_qs_minus26; + CHAR chroma_qp_index_offset; + CHAR second_chroma_qp_index_offset; + UCHAR ContinuationFlag; + CHAR pic_init_qp_minus26; + UCHAR num_ref_idx_l0_active_minus1; + UCHAR num_ref_idx_l1_active_minus1; + UCHAR Reserved8BitsA; + USHORT FrameNumList[16]; + + UINT UsedForReferenceFlags; + USHORT NonExistingFrameFlags; + USHORT frame_num; + UCHAR log2_max_frame_num_minus4; + UCHAR pic_order_cnt_type; + UCHAR log2_max_pic_order_cnt_lsb_minus4; + UCHAR delta_pic_order_always_zero_flag; + UCHAR direct_8x8_inference_flag; + UCHAR entropy_coding_mode_flag; + UCHAR pic_order_present_flag; + UCHAR num_slice_groups_minus1; + UCHAR slice_group_map_type; + UCHAR deblocking_filter_control_present_flag; + UCHAR redundant_pic_cnt_present_flag; + UCHAR Reserved8BitsB; + USHORT slice_group_change_rate_minus1; + UCHAR SliceGroupMap[810]; +} DXVA_PicParams_H264; + +typedef struct { + UCHAR bScalingLists4x4[6][16]; + UCHAR bScalingLists8x8[2][64]; +} DXVA_Qmatrix_H264; + + +typedef struct { + UINT BSNALunitDataLocation; + UINT SliceBytesInBuffer; + USHORT wBadSliceChopping; + USHORT first_mb_in_slice; + USHORT NumMbsForSlice; + USHORT BitOffsetToSliceData; + UCHAR slice_type; + UCHAR luma_log2_weight_denom; + UCHAR chroma_log2_weight_denom; + + UCHAR num_ref_idx_l0_active_minus1; + UCHAR num_ref_idx_l1_active_minus1; + CHAR slice_alpha_c0_offset_div2; + CHAR slice_beta_offset_div2; + UCHAR Reserved8Bits; + DXVA_PicEntry_H264 RefPicList[2][32]; + SHORT Weights[2][32][3][2]; + CHAR slice_qs_delta; + CHAR slice_qp_delta; + UCHAR redundant_pic_cnt; + UCHAR direct_spatial_mv_pred_flag; + 
UCHAR cabac_init_idc; + UCHAR disable_deblocking_filter_idc; + USHORT slice_id; +} DXVA_Slice_H264_Long; + +typedef struct { + UINT BSNALunitDataLocation; + UINT SliceBytesInBuffer; + USHORT wBadSliceChopping; +} DXVA_Slice_H264_Short; + +typedef struct { + USHORT wFrameWidthInMbsMinus1; + USHORT wFrameHeightInMbsMinus1; + DXVA_PicEntry_H264 InPic; + DXVA_PicEntry_H264 OutPic; + USHORT PicOrderCnt_offset; + INT CurrPicOrderCnt; + UINT StatusReportFeedbackNumber; + UCHAR model_id; + UCHAR separate_colour_description_present_flag; + UCHAR film_grain_bit_depth_luma_minus8; + UCHAR film_grain_bit_depth_chroma_minus8; + UCHAR film_grain_full_range_flag; + UCHAR film_grain_colour_primaries; + UCHAR film_grain_transfer_characteristics; + UCHAR film_grain_matrix_coefficients; + UCHAR blending_mode_id; + UCHAR log2_scale_factor; + UCHAR comp_model_present_flag[4]; + UCHAR num_intensity_intervals_minus1[4]; + UCHAR num_model_values_minus1[4]; + UCHAR intensity_interval_lower_bound[3][16]; + UCHAR intensity_interval_upper_bound[3][16]; + SHORT comp_model_value[3][16][8]; +} DXVA_FilmGrainChar_H264; + +typedef struct { + union { + struct { + USHORT Fraction; + SHORT Value; + }; + LONG ll; + }; +}DXVA2_Fixed32; + +typedef struct { + UCHAR Cr; + UCHAR Cb; + UCHAR Y; + UCHAR Alpha; +}DXVA2_AYUVSample8; + +typedef struct { + USHORT Cr; + USHORT Cb; + USHORT Y; + USHORT Alpha; +}DXVA2_AYUVSample16; + +typedef struct { + DXVA2_Fixed32 MinValue; + DXVA2_Fixed32 MaxValue; + DXVA2_Fixed32 DefaultValue; + DXVA2_Fixed32 StepSize; +}DXVA2_ValueRange; + +typedef struct { + DXVA2_Fixed32 Brightness; + DXVA2_Fixed32 Contrast; + DXVA2_Fixed32 Hue; + DXVA2_Fixed32 Saturation; +}DXVA2_ProcAmpValues; + +typedef struct { + DXVA2_Fixed32 Level; + DXVA2_Fixed32 Threshold; + DXVA2_Fixed32 Radius; +}DXVA2_FilterValues; + +typedef struct { + UINT DeviceCaps; + D3DPOOL InputPool; + UINT NumForwardRefSamples; + UINT NumBackwardRefSamples; + UINT Reserved; + UINT DeinterlaceTechnology; + UINT ProcAmpControlCaps; + UINT VideoProcessorOperations; + UINT NoiseFilterTechnology; + UINT DetailFilterTechnology; +}DXVA2_VideoProcessorCaps; + +#ifndef _REFERENCE_TIME_ +#define _REFERENCE_TIME_ +typedef long long int64_t; +typedef int64_t REFERENCE_TIME; +#endif + +typedef struct { + REFERENCE_TIME Start; + REFERENCE_TIME End; + DXVA2_ExtendedFormat SampleFormat; + IDirect3DSurface9 *SrcSurface; + RECT SrcRect; + RECT DstRect; + DXVA2_AYUVSample8 Pal[16]; + DXVA2_Fixed32 PlanarAlpha; + DWORD SampleData; +}DXVA2_VideoSample; + + +typedef struct { + REFERENCE_TIME TargetFrame; + RECT TargetRect; + SIZE ConstrictionSize; + UINT StreamingFlags; + DXVA2_AYUVSample16 BackgroundColor; + DXVA2_ExtendedFormat DestFormat; + DXVA2_ProcAmpValues ProcAmpValues; + DXVA2_Fixed32 Alpha; + DXVA2_FilterValues NoiseFilterLuma; + DXVA2_FilterValues NoiseFilterChroma; + DXVA2_FilterValues DetailFilterLuma; + DXVA2_FilterValues DetailFilterChroma; + DWORD DestData; +} DXVA2_VideoProcessBltParams; + +#pragma pack(pop) + +/*************INTERFACES************/ +#ifdef __cplusplus +extern "C" { +#endif +#define _COM_interface struct +typedef _COM_interface IDirectXVideoDecoderService IDirectXVideoDecoderService; +typedef _COM_interface IDirectXVideoDecoder IDirectXVideoDecoder; + +#undef INTERFACE +#define INTERFACE IDirectXVideoDecoder +DECLARE_INTERFACE_( IDirectXVideoDecoder, IUnknown ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( 
GetVideoDecoderService ) ( THIS_ IDirectXVideoDecoderService** ) PURE; + STDMETHOD( GetCreationParameters ) ( THIS_ GUID*, DXVA2_VideoDesc*, DXVA2_ConfigPictureDecode*, IDirect3DSurface9***, UINT* ) PURE; + STDMETHOD( GetBuffer ) ( THIS_ UINT, void**, UINT* ) PURE; + STDMETHOD( ReleaseBuffer ) ( THIS_ UINT ) PURE; + STDMETHOD( BeginFrame ) ( THIS_ IDirect3DSurface9 *, void* ) PURE; + STDMETHOD( EndFrame ) ( THIS_ HANDLE * ) PURE; + STDMETHOD( Execute ) ( THIS_ const DXVA2_DecodeExecuteParams* ) PURE; + + +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoDecoder_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirectXVideoDecoder_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirectXVideoDecoder_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirectXVideoDecoder_BeginFrame( p, a, b ) (p)->lpVtbl->BeginFrame( p, a, b ) +#define IDirectXVideoDecoder_EndFrame( p, a ) (p)->lpVtbl->EndFrame( p, a ) +#define IDirectXVideoDecoder_Execute( p, a ) (p)->lpVtbl->Execute( p, a ) +#define IDirectXVideoDecoder_GetBuffer( p, a, b, c ) (p)->lpVtbl->GetBuffer( p, a, b, c ) +#define IDirectXVideoDecoder_GetCreationParameters( p, a, b, c, d, e ) (p)->lpVtbl->GetCreationParameters( p, a, b, c, d, e ) +#define IDirectXVideoDecoder_GetVideoDecoderService( p, a ) (p)->lpVtbl->GetVideoDecoderService( p, a ) +#define IDirectXVideoDecoder_ReleaseBuffer( p, a ) (p)->lpVtbl->ReleaseBuffer( p, a ) +#else +#define IDirectXVideoDecoder_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirectXVideoDecoder_AddRef( p ) (p)->AddRef() +#define IDirectXVideoDecoder_Release( p ) (p)->Release() +#define IDirectXVideoDecoder_BeginFrame( p, a, b ) (p)->BeginFrame( a, b ) +#define IDirectXVideoDecoder_EndFrame( p, a ) (p)->EndFrame( a ) +#define IDirectXVideoDecoder_Execute( p, a ) (p)->Execute( a ) +#define IDirectXVideoDecoder_GetBuffer( p, a, b, c ) (p)->GetBuffer( a, b, c ) +#define IDirectXVideoDecoder_GetCreationParameters( p, a, b, c, d, e ) (p)->GetCreationParameters( a, b, c, d, e ) +#define IDirectXVideoDecoder_GetVideoDecoderService( p, a ) (p)->GetVideoDecoderService( a ) +#define IDirectXVideoDecoder_ReleaseBuffer( p, a ) (p)->ReleaseBuffer( a ) +#endif + +#undef INTERFACE +#define INTERFACE IDirectXVideoAccelerationService +DECLARE_INTERFACE_( IDirectXVideoAccelerationService, IUnknown ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( CreateSurface ) ( THIS_ UINT, UINT, UINT, D3DFORMAT, D3DPOOL, DWORD, DWORD, IDirect3DSurface9**, HANDLE* ) PURE; + +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoAccelerationService_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirectXVideoAccelerationService_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirectXVideoAccelerationService_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirectXVideoAccelerationService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->lpVtbl->CreateSurface( p, a, b, c, d, e, f, g, h, i ) +#else +#define IDirectXVideoAccelerationService_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirectXVideoAccelerationService_AddRef( p ) (p)->AddRef() +#define IDirectXVideoAccelerationService_Release( p ) (p)->Release() +#define IDirectXVideoAccelerationService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->CreateSurface( a, b, c, d, e, f, g, h, i ) +#endif + +#undef INTERFACE +#define INTERFACE 
IDirectXVideoDecoderService +DECLARE_INTERFACE_( IDirectXVideoDecoderService, IDirectXVideoAccelerationService ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( CreateSurface ) ( THIS_ UINT, UINT, UINT, D3DFORMAT, D3DPOOL, DWORD, DWORD, IDirect3DSurface9**, HANDLE* ) PURE; + STDMETHOD( GetDecoderDeviceGuids ) ( THIS_ UINT*, GUID ** ) PURE; + STDMETHOD( GetDecoderRenderTargets ) ( THIS_ REFGUID, UINT*, D3DFORMAT** ) PURE; + STDMETHOD( GetDecoderConfigurations ) ( THIS_ REFGUID, const DXVA2_VideoDesc*, IUnknown*, UINT*, DXVA2_ConfigPictureDecode** ) PURE; + STDMETHOD( CreateVideoDecoder ) ( THIS_ REFGUID, const DXVA2_VideoDesc*, DXVA2_ConfigPictureDecode*, IDirect3DSurface9**, UINT, IDirectXVideoDecoder** ) PURE; +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoDecoderService_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirectXVideoDecoderService_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirectXVideoDecoderService_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirectXVideoDecoderService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->lpVtbl->CreateSurface( p, a, b, c, d, e, f, g, h, i ) +#define IDirectXVideoDecoderService_CreateVideoDecoder( p, a, b, c, d, e, f ) (p)->lpVtbl->CreateVideoDecoder( p, a, b, c, d, e, f ) +#define IDirectXVideoDecoderService_GetDecoderConfigurations( p, a, b, c, d, e ) (p)->lpVtbl->GetDecoderConfigurations( p, a, b, c, d, e ) +#define IDirectXVideoDecoderService_GetDecoderDeviceGuids( p, a, b ) (p)->lpVtbl->GetDecoderDeviceGuids( p, a, b ) +#define IDirectXVideoDecoderService_GetDecoderRenderTargets( p, a, b, c ) (p)->lpVtbl->GetDecoderRenderTargets( p, a, b, c ) +#else +#define IDirectXVideoDecoderService_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirectXVideoDecoderService_AddRef( p ) (p)->AddRef() +#define IDirectXVideoDecoderService_Release( p ) (p)->Release() +#define IDirectXVideoDecoderService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->CreateSurface( a, b, c, d, e, f, g, h, i ) +#define IDirectXVideoDecoderService_CreateVideoDecoder( p, a, b, c, d, e, f ) (p)->CreateVideoDecoder( a, b, c, d, e, f ) +#define IDirectXVideoDecoderService_GetDecoderConfigurations( p, a, b, c, d, e ) (p)->GetDecoderConfigurations( a, b, c, d, e ) +#define IDirectXVideoDecoderService_GetDecoderDeviceGuids( p, a, b ) (p)->GetDecoderDeviceGuids( a, b ) +#define IDirectXVideoDecoderService_GetDecoderRenderTargets( p, a, b, c ) (p)->GetDecoderRenderTargets( a, b, c ) +#endif + +#undef INTERFACE +#define INTERFACE IDirect3DDeviceManager9 +DECLARE_INTERFACE_( IDirect3DDeviceManager9, IUnknown ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( ResetDevice ) ( THIS_ IDirect3DDevice9*, UINT ) PURE; + STDMETHOD( OpenDeviceHandle ) ( THIS_ HANDLE* ) PURE; + STDMETHOD( CloseDeviceHandle ) ( THIS_ HANDLE ) PURE; + STDMETHOD( TestDevice ) ( THIS_ HANDLE ) PURE; + STDMETHOD( LockDevice ) ( THIS_ HANDLE, IDirect3DDevice9**, BOOL ) PURE; + STDMETHOD( UnlockDevice ) ( THIS_ HANDLE, BOOL ) PURE; + STDMETHOD( GetVideoService ) ( THIS_ HANDLE, REFIID, void** ) PURE; +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirect3DDeviceManager9_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirect3DDeviceManager9_AddRef( p ) 
(p)->lpVtbl->AddRef( p ) +#define IDirect3DDeviceManager9_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirect3DDeviceManager9_ResetDevice( p, a, b ) (p)->lpVtbl->ResetDevice( p, a, b ) +#define IDirect3DDeviceManager9_OpenDeviceHandle( p, a ) (p)->lpVtbl->OpenDeviceHandle( p, a ) +#define IDirect3DDeviceManager9_CloseDeviceHandle( p, a ) (p)->lpVtbl->CloseDeviceHandle( p, a ) +#define IDirect3DDeviceManager9_TestDevice( p, a ) (p)->lpVtbl->TestDevice( p, a ) +#define IDirect3DDeviceManager9_LockDevice( p, a, b, c ) (p)->lpVtbl->LockDevice( p, a, b, c ) +#define IDirect3DDeviceManager9_UnlockDevice( p, a, b ) (p)->lpVtbl->UnlockDevice( p, a, b ) +#define IDirect3DDeviceManager9_GetVideoService( p, a, b, c ) (p)->lpVtbl->GetVideoService( p, a, b, c ) +#else +#define IDirect3DDeviceManager9_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirect3DDeviceManager9_AddRef( p ) (p)->AddRef() +#define IDirect3DDeviceManager9_Release( p ) (p)->Release() +#define IDirect3DDeviceManager9_ResetDevice( p, a, b ) (p)->ResetDevice( a, b ) +#define IDirect3DDeviceManager9_OpenDeviceHandle( p, a ) (p)->OpenDeviceHandle( a ) +#define IDirect3DDeviceManager9_CloseDeviceHandle( p, a ) (p)->CloseDeviceHandle( a ) +#define IDirect3DDeviceManager9_TestDevice( p, a ) (p)->TestDevice( a ) +#define IDirect3DDeviceManager9_LockDevice( p, a, b, c ) (p)->LockDevice( a, b, c ) +#define IDirect3DDeviceManager9_UnlockDevice( p, a, b ) (p)->UnlockDevice( a, b ) +#define IDirect3DDeviceManager9_GetVideoService( p, a, b, c ) (p)->GetVideoService( a, b, c ) +#endif + +typedef _COM_interface IDirectXVideoProcessorService IDirectXVideoProcessorService; +typedef _COM_interface IDirectXVideoProcessor IDirectXVideoProcessor; + +#undef INTERFACE +#define INTERFACE IDirectXVideoProcessor +DECLARE_INTERFACE_( IDirectXVideoProcessor, IUnknown ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( GetVideoProcessorService ) ( THIS_ IDirectXVideoProcessorService** ) PURE; + STDMETHOD( GetCreationParameters ) ( THIS_ GUID*, DXVA2_VideoDesc*, D3DFORMAT*, UINT* ) PURE; + STDMETHOD( GetVideoProcessorCaps ) ( THIS_ DXVA2_VideoProcessorCaps* ) PURE; + STDMETHOD( GetProcAmpRange ) ( THIS_ UINT, DXVA2_ValueRange* ) PURE; + STDMETHOD( GetFilterPropertyRange ) ( THIS_ UINT, DXVA2_ValueRange* ) PURE; + STDMETHOD( VideoProcessBlt ) ( THIS_ IDirect3DSurface9*, DXVA2_VideoProcessBltParams*, DXVA2_VideoSample*, UINT, HANDLE* ) PURE; +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoProcessor_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirectXVideoProcessor_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirectXVideoProcessor_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirectXVideoProcessor_GetVideoProcessorService( p, a ) (p)->lpVtbl->GetVideoProcessorService( p, a ) +#define IDirectXVideoProcessor_GetCreationParameters( p, a, b, c, d ) (p)->lpVtbl->GetCreationParameters( p, a, b, c, d ) +#define IDirectXVideoProcessor_GetVideoProcessorCaps( p, a ) (p)->lpVtbl->GetVideoProcessorCaps( p, a ) +#define IDirectXVideoProcessor_GetProcAmpRange( p, a, b ) (p)->lpVtbl->GetProcAmpRange( p, a, b ) +#define IDirectXVideoProcessor_GetFilterPropertyRange( p, a, b ) (p)->lpVtbl->GetFilterPropertyRange( p, a, b ) +#define IDirectXVideoProcessor_VideoProcessBlt( p, a, b, c, d, e ) (p)->lpVtbl->VideoProcessBlt( p, a, b, c, d, e ) +#else +#define 
IDirectXVideoProcessor_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirectXVideoProcessor_AddRef( p ) (p)->AddRef() +#define IDirectXVideoProcessor_Release( p ) (p)->Release() +#define IDirectXVideoProcessor_GetVideoProcessorService( p, a ) (p)->GetVideoProcessorService( a ) +#define IDirectXVideoProcessor_GetCreationParameters( p, a, b, c, d ) (p)->GetCreationParameters( a, b, c, d ) +#define IDirectXVideoProcessor_GetVideoProcessorCaps( p, a ) (p)->GetVideoProcessorCaps( a ) +#define IDirectXVideoProcessor_GetProcAmpRange( p, a, b ) (p)->GetProcAmpRange( a, b ) +#define IDirectXVideoProcessor_GetFilterPropertyRange( p, a, b ) (p)->GetFilterPropertyRange( a, b ) +#define IDirectXVideoProcessor_VideoProcessBlt( p, a, b, c, d, e ) (p)->VideoProcessBlt( a, b, c, d, e ) +#endif + + +#undef INTERFACE +#define INTERFACE IDirectXVideoProcessorService +DECLARE_INTERFACE_( IDirectXVideoProcessorService, IDirectXVideoAccelerationService ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( CreateSurface ) ( THIS_ UINT, UINT, UINT, D3DFORMAT, D3DPOOL, DWORD, DWORD, IDirect3DSurface9**, HANDLE* ) PURE; + STDMETHOD( RegisterVideoProcessorSoftwareDevice ) ( THIS_ void* ) PURE; + STDMETHOD( GetVideoProcessorDeviceGuids ) ( THIS_ DXVA2_VideoDesc*, UINT, GUID** ) PURE; + STDMETHOD( GetVideoProcessorRenderTargets ) ( THIS_ REFGUID, DXVA2_VideoDesc*, UINT*, D3DFORMAT** ) PURE; + STDMETHOD( GetVideoProcessorSubStreamFormats ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT*, D3DFORMAT** ) PURE; + STDMETHOD( GetVideoProcessorCaps ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, DXVA2_VideoProcessorCaps* ) PURE; + STDMETHOD( GetProcAmpRange ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT, DXVA2_ValueRange* ) PURE; + STDMETHOD( GetFilterPropertyRange ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT, DXVA2_ValueRange* ) PURE; + STDMETHOD( CreateVideoProcessor ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT, IDirectXVideoProcessor** ) PURE; +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoProcessorService_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirectXVideoProcessorService_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirectXVideoProcessorService_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirectXVideoProcessorService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->lpVtbl->CreateSurface( p, a, b, c, d, e, f, g, h, i ) +#define IDirectXVideoProcessorService_RegisterVideoProcessorSoftwareDevice( p, a ) (p)->lpVtbl->RegisterVideoProcessorSoftwareDevice( p, a ) +#define IDirectXVideoProcessorService_GetVideoProcessorDeviceGuids( p, a, b, c ) (p)->lpVtbl->GetVideoProcessorDeviceGuids( p, a, b, c ) +#define IDirectXVideoProcessorService_GetVideoProcessorRenderTargets( p, a, b, c, d ) (p)->lpVtbl->GetVideoProcessorRenderTargets( p, a, b, c, d ) +#define IDirectXVideoProcessorService_GetVideoProcessorSubStreamFormats( p, a, b, c, d, e ) (p)->lpVtbl->GetVideoProcessorSubStreamFormats( p, a, b, c, d, e ) +#define IDirectXVideoProcessorService_GetVideoProcessorCaps( p, a, b, c, d ) (p)->lpVtbl->GetVideoProcessorCaps( p, a, b, c, d ) +#define IDirectXVideoProcessorService_GetProcAmpRange( p, a, b, c, d, e ) (p)->lpVtbl->GetProcAmpRange( p, a, b, c, d, e ) +#define IDirectXVideoProcessorService_GetFilterPropertyRange( p, a, b, c, d, e ) (p)->lpVtbl->GetFilterPropertyRange( p, a, b, c, d, e ) +#define 
IDirectXVideoProcessorService_CreateVideoProcessor( p, a, b, c, d, e ) (p)->lpVtbl->CreateVideoProcessor( p, a, b, c, d, e ) +#else +#define IDirectXVideoProcessorService_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirectXVideoProcessorService_AddRef( p ) (p)->AddRef() +#define IDirectXVideoProcessorService_Release( p ) (p)->Release() +#define IDirectXVideoProcessorService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->CreateSurface( a, b, c, d, e, f, g, h, i ) +#define IDirectXVideoProcessorService_RegisterVideoProcessorSoftwareDevice( p, a ) (p)->RegisterVideoProcessorSoftwareDevice( a ) +#define IDirectXVideoProcessorService_GetVideoProcessorDeviceGuids( p, a, b, c ) (p)->GetVideoProcessorDeviceGuids( a, b, c ) +#define IDirectXVideoProcessorService_GetVideoProcessorRenderTargets( p, a, b, c, d ) (p)->GetVideoProcessorRenderTargets( a, b, c, d ) +#define IDirectXVideoProcessorService_GetVideoProcessorSubStreamFormats( p, a, b, c, d, e ) (p)->GetVideoProcessorSubStreamFormats( a, b, c, d, e ) +#define IDirectXVideoProcessorService_GetVideoProcessorCaps( p, a, b, c, d ) (p)->GetVideoProcessorCaps( a, b, c, d ) +#define IDirectXVideoProcessorService_GetProcAmpRange( p, a, b, c, d, e ) (p)->GetProcAmpRange( a, b, c, d, e ) +#define IDirectXVideoProcessorService_GetFilterPropertyRange( p, a, b, c, d, e ) (p)->GetFilterPropertyRange( a, b, c, d, e ) +#define IDirectXVideoProcessorService_CreateVideoProcessor( p, a, b, c, d, e ) (p)->CreateVideoProcessor( a, b, c, d, e ) +#endif + + +/***************************************************************************************************** +************************DXVA Video Processor******************************************************** +*******************************************************************************************************/ + + + +/*#undef INTERFACE +#define INTERFACE IDirectXVideoService +DECLARE_INTERFACE_(IDirectXVideoService,IUnknown) +{ + STDMETHOD(DXVA2CreateVideoService)(IDirect3DDevice9*, REFIID, void**) PURE; +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoService_DXVA2CreateVideoService(a,b,c) DXVA2CreateVideoService(a,b,c) +#else +#define IDirectXVideoService_DXVA2CreateVideoService(a,b,c) DXVA2CreateVideoService(a,b,c) +#endif*/ + + +#ifdef __cplusplus +}; +#endif + +#ifdef __cplusplus +extern "C" HRESULT WINAPI DXVA2CreateVideoService( IDirect3DDevice9 *, + REFIID riid, + void **ppService ); +#else +extern HRESULT WINAPI DXVA2CreateVideoService( IDirect3DDevice9 *, + REFIID riid, + void **ppService ); +#endif + +typedef +enum _DXVA2_VideoChromaSubSampling +{ DXVA2_VideoChromaSubsamplingMask = 0xf, + DXVA2_VideoChromaSubsampling_Unknown = 0, + DXVA2_VideoChromaSubsampling_ProgressiveChroma = 0x8, + DXVA2_VideoChromaSubsampling_Horizontally_Cosited = 0x4, + DXVA2_VideoChromaSubsampling_Vertically_Cosited = 0x2, + DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes = 0x1, + DXVA2_VideoChromaSubsampling_MPEG2 = ( DXVA2_VideoChromaSubsampling_Horizontally_Cosited | DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes ), + DXVA2_VideoChromaSubsampling_MPEG1 = DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes, + DXVA2_VideoChromaSubsampling_DV_PAL = ( DXVA2_VideoChromaSubsampling_Horizontally_Cosited | DXVA2_VideoChromaSubsampling_Vertically_Cosited ), + DXVA2_VideoChromaSubsampling_Cosited = ( ( DXVA2_VideoChromaSubsampling_Horizontally_Cosited | DXVA2_VideoChromaSubsampling_Vertically_Cosited ) | 
DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes )} DXVA2_VideoChromaSubSampling; + +typedef +enum _DXVA2_NominalRange +{ DXVA2_NominalRangeMask = 0x7, + DXVA2_NominalRange_Unknown = 0, + DXVA2_NominalRange_Normal = 1, + DXVA2_NominalRange_Wide = 2, + DXVA2_NominalRange_0_255 = 1, + DXVA2_NominalRange_16_235 = 2, + DXVA2_NominalRange_48_208 = 3} DXVA2_NominalRange; + +typedef +enum _DXVA2_VideoLighting +{ DXVA2_VideoLightingMask = 0xf, + DXVA2_VideoLighting_Unknown = 0, + DXVA2_VideoLighting_bright = 1, + DXVA2_VideoLighting_office = 2, + DXVA2_VideoLighting_dim = 3, + DXVA2_VideoLighting_dark = 4} DXVA2_VideoLighting; + +typedef +enum _DXVA2_VideoPrimaries +{ DXVA2_VideoPrimariesMask = 0x1f, + DXVA2_VideoPrimaries_Unknown = 0, + DXVA2_VideoPrimaries_reserved = 1, + DXVA2_VideoPrimaries_BT709 = 2, + DXVA2_VideoPrimaries_BT470_2_SysM = 3, + DXVA2_VideoPrimaries_BT470_2_SysBG = 4, + DXVA2_VideoPrimaries_SMPTE170M = 5, + DXVA2_VideoPrimaries_SMPTE240M = 6, + DXVA2_VideoPrimaries_EBU3213 = 7, + DXVA2_VideoPrimaries_SMPTE_C = 8} DXVA2_VideoPrimaries; + +typedef +enum _DXVA2_VideoTransferFunction +{ DXVA2_VideoTransFuncMask = 0x1f, + DXVA2_VideoTransFunc_Unknown = 0, + DXVA2_VideoTransFunc_10 = 1, + DXVA2_VideoTransFunc_18 = 2, + DXVA2_VideoTransFunc_20 = 3, + DXVA2_VideoTransFunc_22 = 4, + DXVA2_VideoTransFunc_709 = 5, + DXVA2_VideoTransFunc_240M = 6, + DXVA2_VideoTransFunc_sRGB = 7, + DXVA2_VideoTransFunc_28 = 8} DXVA2_VideoTransferFunction; + +typedef +enum _DXVA2_SampleFormat +{ DXVA2_SampleFormatMask = 0xff, + DXVA2_SampleUnknown = 0, + DXVA2_SampleProgressiveFrame = 2, + DXVA2_SampleFieldInterleavedEvenFirst = 3, + DXVA2_SampleFieldInterleavedOddFirst = 4, + DXVA2_SampleFieldSingleEven = 5, + DXVA2_SampleFieldSingleOdd = 6, + DXVA2_SampleSubStream = 7} DXVA2_SampleFormat; + +typedef +enum _DXVA2_VideoTransferMatrix +{ DXVA2_VideoTransferMatrixMask = 0x7, + DXVA2_VideoTransferMatrix_Unknown = 0, + DXVA2_VideoTransferMatrix_BT709 = 1, + DXVA2_VideoTransferMatrix_BT601 = 2, + DXVA2_VideoTransferMatrix_SMPTE240M = 3} DXVA2_VideoTransferMatrix; + +enum __MIDL___MIDL_itf_dxva2api_0000_0000_0004 +{ DXVA2_NoiseFilterLumaLevel = 1, + DXVA2_NoiseFilterLumaThreshold = 2, + DXVA2_NoiseFilterLumaRadius = 3, + DXVA2_NoiseFilterChromaLevel = 4, + DXVA2_NoiseFilterChromaThreshold = 5, + DXVA2_NoiseFilterChromaRadius = 6, + DXVA2_DetailFilterLumaLevel = 7, + DXVA2_DetailFilterLumaThreshold = 8, + DXVA2_DetailFilterLumaRadius = 9, + DXVA2_DetailFilterChromaLevel = 10, + DXVA2_DetailFilterChromaThreshold = 11, + DXVA2_DetailFilterChromaRadius = 12}; + +enum __MIDL___MIDL_itf_dxva2api_0000_0000_0008 +{ DXVA2_VideoProcess_None = 0, + DXVA2_VideoProcess_YUV2RGB = 0x1, + DXVA2_VideoProcess_StretchX = 0x2, + DXVA2_VideoProcess_StretchY = 0x4, + DXVA2_VideoProcess_AlphaBlend = 0x8, + DXVA2_VideoProcess_SubRects = 0x10, + DXVA2_VideoProcess_SubStreams = 0x20, + DXVA2_VideoProcess_SubStreamsExtended = 0x40, + DXVA2_VideoProcess_YUV2RGBExtended = 0x80, + DXVA2_VideoProcess_AlphaBlendExtended = 0x100, + DXVA2_VideoProcess_Constriction = 0x200, + DXVA2_VideoProcess_NoiseFilter = 0x400, + DXVA2_VideoProcess_DetailFilter = 0x800, + DXVA2_VideoProcess_PlanarAlpha = 0x1000, + DXVA2_VideoProcess_LinearScaling = 0x2000, + DXVA2_VideoProcess_GammaCompensated = 0x4000, + DXVA2_VideoProcess_MaintainsOriginalFieldData = 0x8000, + DXVA2_VideoProcess_Mask = 0xffff}; + + + +__inline float hb_dx_fixedtofloat( const DXVA2_Fixed32 _fixed_ ); + +__inline const DXVA2_Fixed32 hb_dx_fixed32_opaque_alpha(); + +__inline 
DXVA2_Fixed32 hb_dx_floattofixed( const float _float_ ); +#endif +#endif //_DXVA2API_H diff --git a/libhb/fifo.c b/libhb/fifo.c index be69616f1..bd1a45e40 100644 --- a/libhb/fifo.c +++ b/libhb/fifo.c @@ -242,7 +242,14 @@ void hb_buffer_pool_free( void ) if( b->data ) { freed += b->alloc; - free( b->data ); +#ifdef USE_OPENCL + if (b->cl.buffer != NULL) { + if (hb_cl_free_mapped_buffer(b->cl.buffer, b->data) == 0) + hb_log("bad free: %.16x -> buffer %.16x map %.16x", b, b->cl.buffer, b->data); + } + else +#endif + free( b->data ); } free( b ); count++; @@ -273,7 +280,7 @@ static hb_fifo_t *size_to_pool( int size ) return NULL; } -hb_buffer_t * hb_buffer_init( int size ) +hb_buffer_t * hb_buffer_init_internal( int size , int needsMapped ) { hb_buffer_t * b; // Certain libraries (hrm ffmpeg) expect buffers passed to them to @@ -288,6 +295,18 @@ hb_buffer_t * hb_buffer_init( int size ) { b = hb_fifo_get( buffer_pool ); +#ifdef USE_OPENCL + if (b && (needsMapped != 0) && (b->cl.buffer == NULL)) + { + // We need a mapped OpenCL buffer and that is not what we got out of the pool. + // Ditch it. It will get replaced with what we need. + if (b->data) + free(b->data); + free(b); + b = NULL; + } +#endif + if( b ) { /* @@ -295,6 +314,12 @@ hb_buffer_t * hb_buffer_init( int size ) * didn't have to do this. */ uint8_t *data = b->data; +#ifdef USE_OPENCL + cl_mem buffer = b->cl.buffer; + cl_event last_event = b->cl.last_event; + int loc = b->cl.buffer_location; +#endif + memset( b, 0, sizeof(hb_buffer_t) ); b->alloc = buffer_pool->buffer_size; b->size = size; @@ -302,6 +327,11 @@ hb_buffer_t * hb_buffer_init( int size ) b->s.start = -1; b->s.stop = -1; b->s.renderOffset = -1; +#ifdef USE_OPENCL + b->cl.buffer = buffer; + b->cl.last_event = last_event; + b->cl.buffer_location = loc; +#endif return( b ); } } @@ -320,6 +350,20 @@ hb_buffer_t * hb_buffer_init( int size ) if (size) { +#ifdef USE_OPENCL + b->cl.last_event = NULL; + b->cl.buffer_location = HOST; + + if (needsMapped != 0) + { + int status; + status = hb_cl_create_mapped_buffer(&b->cl.buffer, &b->data, b->alloc); + //hb_log("buf: %.16x -> buffer %.16x map %.16x size %d", b, b->cl.buffer, b->data, size); + } + else { + b->cl.buffer = NULL; +#endif + #if defined( SYS_DARWIN ) || defined( SYS_FREEBSD ) || defined( SYS_MINGW ) b->data = malloc( b->alloc ); #elif defined( SYS_CYGWIN ) @@ -328,6 +372,10 @@ hb_buffer_t * hb_buffer_init( int size ) #else b->data = memalign( 16, b->alloc ); #endif +#ifdef USE_OPENCL + } +#endif + if( !b->data ) { hb_log( "out of memory" ); @@ -344,6 +392,11 @@ hb_buffer_t * hb_buffer_init( int size ) return b; } +hb_buffer_t * hb_buffer_init( int size ) +{ + return hb_buffer_init_internal(size, 0); +} + void hb_buffer_realloc( hb_buffer_t * b, int size ) { if ( size > b->alloc || b->data == NULL ) @@ -361,6 +414,7 @@ void hb_buffer_realloc( hb_buffer_t * b, int size ) void hb_buffer_reduce( hb_buffer_t * b, int size ) { + if ( size < b->alloc / 8 || b->data == NULL ) { hb_buffer_t * tmp = hb_buffer_init( size ); @@ -374,6 +428,7 @@ void hb_buffer_reduce( hb_buffer_t * b, int size ) hb_buffer_t * hb_buffer_dup( const hb_buffer_t * src ) { + hb_buffer_t * buf; if ( src == NULL ) @@ -470,8 +525,11 @@ hb_buffer_t * hb_frame_buffer_init( int pix_fmt, int width, int height ) hb_image_height_stride( pix_fmt, height, p ); } } - +#ifdef USE_OPENCL + buf = hb_buffer_init_internal( size , hb_use_buffers() ); +#else buf = hb_buffer_init( size ); +#endif if( buf == NULL ) return NULL; @@ -524,12 +582,22 @@ void hb_buffer_swap_copy( 
hb_buffer_t *src, hb_buffer_t *dst ) uint8_t *data = dst->data; int size = dst->size; int alloc = dst->alloc; +#ifdef USE_OPENCL + cl_mem buffer = dst->cl.buffer; + cl_event last_event = dst->cl.last_event; + int loc = dst->cl.buffer_location; +#endif *dst = *src; src->data = data; src->size = size; src->alloc = alloc; +#ifdef USE_OPENCL + src->cl.buffer = buffer; + src->cl.last_event = last_event; + src->cl.buffer_location = loc; +#endif } // Frees the specified buffer list. @@ -557,7 +625,14 @@ void hb_buffer_close( hb_buffer_t ** _b ) // free the buf if( b->data ) { - free( b->data ); +#ifdef USE_OPENCL + if (b->cl.buffer != NULL) { + if (hb_cl_free_mapped_buffer(b->cl.buffer, b->data) == 0) + hb_log("bad free2: %.16x -> buffer %.16x map %.16x", b, b->cl.buffer, b->data); + } + else +#endif + free( b->data ); hb_lock(buffers.lock); buffers.allocated -= b->alloc; hb_unlock(buffers.lock); diff --git a/libhb/hb.c b/libhb/hb.c index a9d9e417b..f6e3cb0e0 100644 --- a/libhb/hb.c +++ b/libhb/hb.c @@ -440,6 +440,11 @@ hb_handle_t * hb_init( int verbose, int update_check ) h->interjob = calloc( sizeof( hb_interjob_t ), 1 ); + /* opencl */ +#ifdef USE_OPENCL + //hb_opencl_init(); // FIXME: Ensure gui instances call this or hb_get_opencl_env() during startup if needed. +#endif + /* Start library thread */ hb_log( "hb_init: starting libhb thread" ); h->die = 0; diff --git a/libhb/hbffmpeg.h b/libhb/hbffmpeg.h index 36319de09..776eec61a 100644 --- a/libhb/hbffmpeg.h +++ b/libhb/hbffmpeg.h @@ -15,6 +15,7 @@ #include "libavutil/opt.h" #include "libswscale/swscale.h" #include "libavresample/avresample.h" +#include "common.h" #define HB_FFMPEG_THREADS_AUTO (-1) // let hb_avcodec_open() decide thread_count diff --git a/libhb/internal.h b/libhb/internal.h index 86878eb17..e4d5ed71c 100644 --- a/libhb/internal.h +++ b/libhb/internal.h @@ -121,6 +121,15 @@ struct hb_buffer_s void *filter_details; } qsv_details; +#ifdef USE_OPENCL + struct cl_data + { + cl_mem buffer; + cl_event last_event; + enum { HOST, DEVICE } buffer_location; + } cl; +#endif + // PICTURESUB subtitle packets: // Video packets (after processing by the hb_sync_video work-object): diff --git a/libhb/module.defs b/libhb/module.defs index b161f4c7f..e06c6331b 100644 --- a/libhb/module.defs +++ b/libhb/module.defs @@ -45,6 +45,14 @@ endif ifeq (1,$(FEATURE.fdk_aac)) LIBHB.GCC.D += USE_FDK_AAC endif +ifeq (1,$(FEATURE.opencl)) +LIBHB.GCC.D += USE_OPENCL +endif + +ifeq (1,$(FEATURE.hwd)) +LIBHB.GCC.D += USE_HWD +endif + ifeq (1,$(FEATURE.libav_aac)) LIBHB.GCC.D += USE_LIBAV_AAC endif @@ -63,6 +71,11 @@ endif LIBHB.GCC.D += __LIBHB__ USE_PTHREAD LIBHB.GCC.I += $(LIBHB.build/) $(CONTRIB.build/)include +ifeq (1,$(FEATURE.opencl)) +LIBHB.GCC.I += $(AMDAPPSDKROOT)/include +LIBHB.GCC.l += OpenCL +endif + ifeq ($(BUILD.system),cygwin) LIBHB.GCC.D += SYS_CYGWIN else ifeq ($(BUILD.system),darwin) diff --git a/libhb/oclnv12toyuv.c b/libhb/oclnv12toyuv.c new file mode 100644 index 000000000..19188e5a3 --- /dev/null +++ b/libhb/oclnv12toyuv.c @@ -0,0 +1,285 @@ +/* oclnv12toyuv.c + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. 
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + */ + +#ifdef USE_OPENCL +#ifdef USE_HWD +#include "vadxva2.h" +#include "oclnv12toyuv.h" + +/** + * It creates are opencl bufs w is input frame width, h is input frame height +*/ +static int hb_nv12toyuv_create_cl_buf( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ); + +/** + * It creates are opencl kernel. kernel name is nv12toyuv +*/ +static int hb_nv12toyuv_create_cl_kernel( KernelEnv *kenv, hb_va_dxva2_t *dxva2 ); + +/** + * It set opencl arg, input data,output data, input width, output height +*/ +static int hb_nv12toyuv_setkernelarg( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ); + +/** + * It initialize nv12 to yuv kernel. +*/ +static int hb_init_nv12toyuv_ocl( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ); + +/** + * Run nv12 to yuv kernel. + */ +static int hb_nv12toyuv( void **userdata, KernelEnv *kenv ); + +/** + * register nv12 to yuv kernel. + */ +static int hb_nv12toyuv_reg_kernel( void ); + +/** + * It creates are opencl bufs w is input frame width, h is input frame height + */ +static int hb_nv12toyuv_create_cl_buf( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ) +{ + cl_int status = CL_SUCCESS; + int in_bytes = w*h*3/2; + CREATEBUF( dxva2->cl_mem_nv12, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, in_bytes ); + CREATEBUF( dxva2->cl_mem_yuv, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, in_bytes ); + return 0; +} + +/** + * It creates are opencl kernel. kernel name is nv12toyuv + */ +static int hb_nv12toyuv_create_cl_kernel( KernelEnv *kenv, hb_va_dxva2_t *dxva2 ) +{ + int ret; + dxva2->nv12toyuv = clCreateKernel( kenv->program, "nv12toyuv", &ret ); + return ret; +} + +/** + * It set opencl arg, input data,output data, input width, output height + */ +static int hb_nv12toyuv_setkernelarg( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ) +{ + int arg = 0, status; + kenv->kernel = dxva2->nv12toyuv; + OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(cl_mem), &dxva2->cl_mem_nv12 ); + OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(cl_mem), &dxva2->cl_mem_yuv ); + OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(int), &w ); + OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(int), &h ); + return 0; +} + +/** + * It initialize nv12 to yuv kernel. + */ +static int hb_init_nv12toyuv_ocl( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ) +{ + if( !dxva2->nv12toyuv ) + { + if( hb_nv12toyuv_create_cl_buf( kenv, w, h, dxva2 ) ) + { + hb_log( "OpenCL: nv12toyuv_create_cl_buf fail" ); + return -1; + } + if (!dxva2->nv12toyuv_tmp_in) + { + dxva2->nv12toyuv_tmp_in = malloc (w*h*3/2); + } + + if (!dxva2->nv12toyuv_tmp_out) + { + dxva2->nv12toyuv_tmp_out = malloc (w*h*3/2); + } + + hb_nv12toyuv_create_cl_kernel( kenv, dxva2 ); + } + return 0; +} + +/** + * copy_plane + * @param dst - + * @param src - + * @param dstride - + * @param sstride - + * @param h - + */ +static uint8_t *copy_plane( uint8_t *dst, uint8_t* src, int dstride, int sstride, + int h ) +{ + if ( dstride == sstride ) + { + memcpy( dst, src, dstride * h ); + return dst + dstride * h; + } + + int lbytes = dstride <= sstride? dstride : sstride; + while ( --h >= 0 ) + { + memcpy( dst, src, lbytes ); + src += sstride; + dst += dstride; + } + + return dst; +} + +/** + * Run nv12 to yuv kernel. 
+ */ +static int hb_nv12toyuv( void **userdata, KernelEnv *kenv ) +{ + int status; + int w = (int)userdata[0]; + int h = (int)userdata[1]; + uint8_t *bufi1 = userdata[2]; + int *crop = userdata[3]; + hb_va_dxva2_t *dxva2 = userdata[4]; + + uint8_t *bufi2 = userdata[5]; + int p = (int)userdata[6]; + int decomb = (int)userdata[7]; + int detelecine = (int)userdata[8]; + int i; + if( hb_init_nv12toyuv_ocl( kenv, w, h, dxva2 ) ) + { + return -1; + } + + if( hb_nv12toyuv_setkernelarg( kenv, w, h, dxva2 ) ) + { + return -1; + } + + int in_bytes = w*h*3/2; + if( kenv->isAMD ) + { + void *data = clEnqueueMapBuffer( kenv->command_queue, dxva2->cl_mem_nv12, CL_MAP_WRITE_INVALIDATE_REGION, CL_TRUE, 0, in_bytes, 0, NULL, NULL, NULL ); + + for ( i = 0; i < dxva2->height; i++ ) + { + memcpy( data + i * dxva2->width, bufi1 + i * p, dxva2->width ); + if ( i < dxva2->height >> 1 ) + { + memcpy( data + ( dxva2->width * dxva2->height ) + i * dxva2->width, bufi2 + i * p, dxva2->width ); + } + } + clEnqueueUnmapMemObject( kenv->command_queue, dxva2->cl_mem_nv12, data, 0, NULL, NULL ); + } + else + { + uint8_t *tmp = (uint8_t*)malloc( dxva2->width * dxva2->height * 3 / 2 ); + for( i = 0; i < dxva2->height; i++ ) + { + memcpy( tmp + i * dxva2->width, bufi1 + i * p, dxva2->width ); + if( i < dxva2->height >> 1 ) + { + memcpy( tmp + (dxva2->width * dxva2->height) + i * dxva2->width, bufi2 + i * p, dxva2->width ); + } + } + OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, dxva2->cl_mem_nv12, CL_TRUE, 0, in_bytes, tmp, 0, NULL, NULL ); + free( tmp ); + } + + size_t gdim[2] = {w>>1, h>>1}; + OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, gdim, NULL, 0, NULL, NULL ); + + if( (crop[0] || crop[1] || crop[2] || crop[3]) && (decomb == 0) && (detelecine == 0) ) + { + AVPicture pic_in; + AVPicture pic_crop; + clEnqueueReadBuffer( kenv->command_queue, dxva2->cl_mem_yuv, CL_TRUE, 0, in_bytes, dxva2->nv12toyuv_tmp_out, 0, NULL, NULL ); + hb_buffer_t *in = hb_video_buffer_init( w, h ); + + int wmp = in->plane[0].stride; + int hmp = in->plane[0].height; + copy_plane( in->plane[0].data, dxva2->nv12toyuv_tmp_out, wmp, w, hmp ); + wmp = in->plane[1].stride; + hmp = in->plane[1].height; + copy_plane( in->plane[1].data, dxva2->nv12toyuv_tmp_out + w * h, wmp, w>>1, hmp ); + wmp = in->plane[2].stride; + hmp = in->plane[2].height; + copy_plane( in->plane[2].data, dxva2->nv12toyuv_tmp_out + w * h +( ( w * h )>>2 ), wmp, w>>1, hmp ); + + hb_avpicture_fill( &pic_in, in ); + av_picture_crop( &pic_crop, &pic_in, in->f.fmt, crop[0], crop[2] ); + int i, ww = w - ( crop[2] + crop[3] ), hh = h - ( crop[0] + crop[1] ); + for( i = 0; i< hh >> 1; i++ ) + { + memcpy( dxva2->nv12toyuv_tmp_in + ( ( i << 1 ) + 0 ) * ww, pic_crop.data[0]+ ( ( i << 1 ) + 0 ) * pic_crop.linesize[0], ww ); + memcpy( dxva2->nv12toyuv_tmp_in + ( ( i << 1 ) + 1 ) * ww, pic_crop.data[0]+ ( ( i << 1 ) + 1 ) * pic_crop.linesize[0], ww ); + memcpy( dxva2->nv12toyuv_tmp_in + ( ww * hh ) + i * ( ww >> 1 ), pic_crop.data[1] + i * pic_crop.linesize[1], ww >> 1 ); + memcpy( dxva2->nv12toyuv_tmp_in + ( ww * hh ) + ( ( ww * hh )>>2 ) + i * ( ww >> 1 ), pic_crop.data[2] + i * pic_crop.linesize[2], ww >> 1 ); + } + + if( kenv->isAMD ) + { + void *data = clEnqueueMapBuffer( kenv->command_queue, dxva2->cl_mem_yuv, CL_MAP_WRITE_INVALIDATE_REGION, CL_TRUE, 0, ww * hh * 3 / 2, 0, NULL, NULL, NULL ); + memcpy( data, dxva2->nv12toyuv_tmp_in, ww * hh * 3 / 2 ); + clEnqueueUnmapMemObject( kenv->command_queue, dxva2->cl_mem_yuv, data, 0, NULL, NULL ); + } + else + 
{ + OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, dxva2->cl_mem_yuv, CL_TRUE, 0, in_bytes, dxva2->nv12toyuv_tmp_in, 0, NULL, NULL ); + } + + hb_buffer_close( &in ); + } + return 0; +} +/** + * register nv12 to yuv kernel. + */ +static int hb_nv12toyuv_reg_kernel( void ) +{ + int st = hb_register_kernel_wrapper( "nv12toyuv", hb_nv12toyuv ); + if( !st ) + { + hb_log( "OpenCL: register kernel[%s] failed", "nv12toyuv" ); + return -1; + } + return 0; +} +/** + * nv12 to yuv interface + * bufi is input frame of nv12, w is input frame width, h is input frame height + */ +int hb_ocl_nv12toyuv( uint8_t *bufi[], int p, int w, int h, int *crop, hb_va_dxva2_t *dxva2, int decomb, int detelecine ) +{ + void *userdata[9]; + userdata[0] = (void*)w; + userdata[1] = (void*)h; + userdata[2] = bufi[0]; + userdata[3] = crop; + userdata[4] = dxva2; + userdata[5] = bufi[1]; + userdata[6] = (void*)p; + userdata[7] = decomb; + userdata[8] = detelecine; + + if( hb_nv12toyuv_reg_kernel() ) + { + return -1; + } + + if( hb_run_kernel( "nv12toyuv", userdata ) ) + { + hb_log( "OpenCL: run kernel[nv12toyuv] failed" ); + return -1; + } + return 0; +} +#endif +#endif diff --git a/libhb/oclnv12toyuv.h b/libhb/oclnv12toyuv.h new file mode 100644 index 000000000..86f7673bb --- /dev/null +++ b/libhb/oclnv12toyuv.h @@ -0,0 +1,35 @@ +/* oclnv12toyuv.h + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + */ + +#ifdef USE_OPENCL +#ifndef RENDER_CL_H +#define RENDER_CL_H + +#if defined(__APPLE__) +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +#include "common.h" +#include "openclwrapper.h" + +/** + * nv12 to yuv interface + * bufi is input frame of nv12, w is input frame width, h is input frame height + */ +#ifdef USE_HWD +int hb_ocl_nv12toyuv( uint8_t *bufi[], int p, int w, int h, int *crop, hb_va_dxva2_t *dxva2, int decomb, int detelecine ); +#endif +#endif +#endif diff --git a/libhb/oclscale.c b/libhb/oclscale.c new file mode 100644 index 000000000..904183340 --- /dev/null +++ b/libhb/oclscale.c @@ -0,0 +1,271 @@ +/* oclscale.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+ */
+
+#ifdef USE_OPENCL
+
+#include <math.h> +#include "common.h" +#include "openclwrapper.h" +#define FILTER_LEN 4 + +#define _A -0.5f + +cl_float cubic(cl_float x) +{ + if (x < 0) + x = -x; + + if (x < 1) + return (_A + 2.0f) * (x * x * x) - (_A + 3.0f) * (x * x) + 0 + 1; + else if (x < 2) + return (_A) * (x * x * x) - (5.0f * _A) * (x * x) + (8.0f * _A) * x - (4.0f * _A); + else + return 0; +} + + +cl_float *hb_bicubic_weights(cl_float scale, int length) +{ + cl_float *weights = (cl_float*) malloc(length * sizeof(cl_float) * 4); + + int i; // C rocks + cl_float *out = weights; + for (i = 0; i < length; ++i) + { + cl_float x = i / scale; + cl_float dx = x - (int)x; + *out++ = cubic(-dx - 1.0f); + *out++ = cubic(-dx); + *out++ = cubic(-dx + 1.0f); + *out++ = cubic(-dx + 2.0f); + } + return weights; +} + +int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv); + +/** +* executive scale using opencl +* get filter args +* create output buffer +* create horizontal filter buffer +* create vertical filter buffer +* create kernels +*/ +int hb_ocl_scale_func( void **data, KernelEnv *kenv ) +{ + cl_int status; + + cl_mem in_buf = data[0]; + cl_mem out_buf = data[1]; + int crop_top = data[2]; + int crop_bottom = data[3]; + int crop_left = data[4]; + int crop_right = data[5]; + int in_frame_w = (int)data[6]; + int in_frame_h = (int)data[7]; + int out_frame_w = (int)data[8]; + int out_frame_h = (int)data[9]; + hb_oclscale_t *os = data[10]; + hb_buffer_t *in = data[11]; + hb_buffer_t *out = data[12]; + + if (os->initialized == 0) + { + hb_log( "Scaling With OpenCL" ); + if (kenv->isAMD != 0) + hb_log( "Using Zero Copy"); + // create the block kernel + cl_int status; + os->m_kernel = clCreateKernel( kenv->program, "frame_scale", &status ); + + os->initialized = 1; + } + + { + // Use the new kernel + cl_event events[5]; + int eventCount = 0; + + if (kenv->isAMD == 0) { + status = clEnqueueUnmapMemObject(kenv->command_queue, in->cl.buffer, in->data, 0, NULL, &events[eventCount++]); + status = clEnqueueUnmapMemObject(kenv->command_queue, out->cl.buffer, out->data, 0, NULL, &events[eventCount++]); + } + + cl_int srcPlaneOffset0 = in->plane[0].data - in->data; + cl_int srcPlaneOffset1 = in->plane[1].data - in->data; + cl_int srcPlaneOffset2 = in->plane[2].data - in->data; + cl_int srcRowWords0 = in->plane[0].stride; + cl_int srcRowWords1 = in->plane[1].stride; + cl_int srcRowWords2 = in->plane[2].stride; + cl_int dstPlaneOffset0 = out->plane[0].data - out->data; + cl_int dstPlaneOffset1 = out->plane[1].data - out->data; + cl_int dstPlaneOffset2 = out->plane[2].data - out->data; + cl_int dstRowWords0 = out->plane[0].stride; + cl_int dstRowWords1 = out->plane[1].stride; + cl_int dstRowWords2 = out->plane[2].stride; + + if (crop_top != 0 || crop_bottom != 0 || crop_left != 0 || crop_right != 0) { + srcPlaneOffset0 += crop_left + crop_top * srcRowWords0; + srcPlaneOffset1 += crop_left / 2 + (crop_top / 2) * srcRowWords1; + srcPlaneOffset2 += crop_left / 2 + (crop_top / 2) * srcRowWords2; + in_frame_w = in_frame_w - crop_right - crop_left; + in_frame_h = in_frame_h - crop_bottom - crop_top; + } + + cl_float xscale = (out_frame_w * 1.0f) / in_frame_w; + cl_float yscale = (out_frame_h * 1.0f) / in_frame_h; + setupScaleWeights(xscale, yscale, out_frame_w, out_frame_h, os, kenv); + + OCLCHECK( clSetKernelArg, os->m_kernel, 0, sizeof(cl_mem), &out_buf ); + OCLCHECK( clSetKernelArg, os->m_kernel, 1, sizeof(cl_mem), &in_buf ); + OCLCHECK( clSetKernelArg, os->m_kernel, 2, 
sizeof(cl_float), &xscale ); + OCLCHECK( clSetKernelArg, os->m_kernel, 3, sizeof(cl_float), &yscale ); + OCLCHECK( clSetKernelArg, os->m_kernel, 4, sizeof(cl_int), &srcPlaneOffset0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 5, sizeof(cl_int), &srcPlaneOffset1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 6, sizeof(cl_int), &srcPlaneOffset2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 7, sizeof(cl_int), &dstPlaneOffset0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 8, sizeof(cl_int), &dstPlaneOffset1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 9, sizeof(cl_int), &dstPlaneOffset2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 10, sizeof(cl_int), &srcRowWords0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 11, sizeof(cl_int), &srcRowWords1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 12, sizeof(cl_int), &srcRowWords2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 13, sizeof(cl_int), &dstRowWords0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 14, sizeof(cl_int), &dstRowWords1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 15, sizeof(cl_int), &dstRowWords2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 16, sizeof(int), &in_frame_w ); // FIXME: type mismatch + OCLCHECK( clSetKernelArg, os->m_kernel, 17, sizeof(int), &in_frame_h ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 18, sizeof(int), &out_frame_w ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 19, sizeof(int), &out_frame_h ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 20, sizeof(cl_mem), &os->bicubic_x_weights ); + OCLCHECK( clSetKernelArg, os->m_kernel, 21, sizeof(cl_mem), &os->bicubic_y_weights ); + + size_t workOffset[] = { 0, 0, 0 }; + size_t globalWorkSize[] = { 1, 1, 1 }; + size_t localWorkSize[] = { 1, 1, 1 }; + + int xgroups = (out_frame_w + 63) / 64; + int ygroups = (out_frame_h + 15) / 16; + + localWorkSize[0] = 64; + localWorkSize[1] = 1; + localWorkSize[2] = 1; + globalWorkSize[0] = xgroups * 64; + globalWorkSize[1] = ygroups; + globalWorkSize[2] = 3; + + OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, os->m_kernel, 3, workOffset, globalWorkSize, localWorkSize, eventCount, (eventCount == 0) ? NULL : &events[0], &events[eventCount] ); + ++eventCount; + + if (kenv->isAMD == 0) { + in->data = clEnqueueMapBuffer(kenv->command_queue, in->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, in->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount], &status); + out->data = clEnqueueMapBuffer(kenv->command_queue, out->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, out->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? 
NULL : &events[eventCount - 1], &events[eventCount + 1], &status); + eventCount += 2; + } + + clFlush(kenv->command_queue); + clWaitForEvents(eventCount, &events[0]); + int i; + for (i = 0; i < eventCount; ++i) + clReleaseEvent(events[i]); + } + + return 1; +} + +int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) { + cl_int status; + if (os->xscale != xscale || os->width < width) { + cl_float *xweights = hb_bicubic_weights(xscale, width); + CL_FREE(os->bicubic_x_weights); + CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4); + OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL ); + os->width = width; + os->xscale = xscale; + free(xweights); + } + + if ((os->yscale != yscale) || (os->height < height)) { + cl_float *yweights = hb_bicubic_weights(yscale, height); + CL_FREE(os->bicubic_y_weights); + CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4); + OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL ); + os->height = height; + os->yscale = yscale; + free(yweights); + } + return 0; +} + + +/** +* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm +* parameter: +* inputFrameBuffer: the source video frame opencl buffer
+* outputdata: the destination video frame buffer
+* inputWidth: the width of the source video frame
+* inputHeight: the height of the source video frame
+* outputWidth: the width of destination video frame +* outputHeight: the height of destination video frame +*/ + + +static int s_scale_init_flag = 0; + +int do_scale_init() +{ + if ( s_scale_init_flag==0 ) + { + int st = hb_register_kernel_wrapper( "frame_scale", hb_ocl_scale_func ); + if( !st ) + { + hb_log( "register kernel[%s] failed", "frame_scale" ); + return 0; + } + s_scale_init_flag++; + } + return 1; +} + + +int hb_ocl_scale(hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os) +{ + void *data[13]; + + if (do_scale_init() == 0) + return 0; + + data[0] = in->cl.buffer; + data[1] = out->cl.buffer; + data[2] = (void*)(crop[0]); + data[3] = (void*)(crop[1]); + data[4] = (void*)(crop[2]); + data[5] = (void*)(crop[3]); + data[6] = (void*)(in->f.width); + data[7] = (void*)(in->f.height); + data[8] = (void*)(out->f.width); + data[9] = (void*)(out->f.height); + data[10] = os; + data[11] = in; + data[12] = out; + + if( !hb_run_kernel( "frame_scale", data ) ) + hb_log( "run kernel[%s] failed", "frame_scale" ); + return 0; +} + + + + + +#endif diff --git a/libhb/openclkernels.h b/libhb/openclkernels.h new file mode 100644 index 000000000..f324aafdd --- /dev/null +++ b/libhb/openclkernels.h @@ -0,0 +1,771 @@ +/* openclkernels.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+ */
+
+#ifndef USE_EXTERNAL_KERNEL
+
+#define KERNEL( ... )# __VA_ARGS__
+
+
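+/*
+ * Note: KERNEL( ... ) stringifies its argument list, so each kernel body below
+ * ends up as a plain C string that the wrapper code (openclwrapper.c) passes to
+ * the OpenCL runtime for compilation at run time.
+ */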
+char *kernel_src_hscale = KERNEL (
+
+ typedef unsigned char fixed8;
+
+/*******************************************************************************************************
+dst: Horizontal scale destination;
+src: YUV content in opencl buf;
+hf_Y: Horizontal filter coefficients for Y planes;
+hf_UV: Horizontal filter coefficients for UV planes;
+hi_Y: Horizontal filter index for Y planes;
+hi_UV: Horizontal filter index for UV planes;
+stride: Src width;
+filter_len: Length of filter;
+********************************************************************************************************/
+ kernel void frame_h_scale (
+ global fixed8 *src,
+ global float *hf_Y,
+ global float *hf_UV,
+ global int *hi_Y,
+ global int *hi_UV,
+ global fixed8 *dst,
+ int stride, //src_width
+ int filter_len
+ )
+ {
+ int x = get_global_id( 0 );
+ int y = get_global_id( 1 );
+ int width = get_global_size( 0 );
+ int height = get_global_size( 1 );
+ float result_Y = 0, result_U = 0, result_V = 0;
+ int i = 0;
+
+ global fixed8 *src_Y = src;
+ global fixed8 *src_U = src_Y + stride * height;
+ global fixed8 *src_V = src_U + (stride >> 1) * (height >> 1);
+
+ global fixed8 *dst_Y = dst;
+ global fixed8 *dst_U = dst_Y + width * height;
+ global fixed8 *dst_V = dst_U + (width >> 1) * (height >> 1);
+
+ int xy = y * width + x;
+ global fixed8 *rowdata_Y = src_Y + (y * stride);
+ for( int i = 0; i < filter_len; i++ )
+ {
+ result_Y += ( hf_Y[x + i * width] * rowdata_Y[hi_Y[x] + i]);
+ }
+ dst_Y[xy] = result_Y;
+
+ if( y < (height >> 1) && x < (width >> 1) )
+ {
+ int xy = y * (width >> 1) + x;
+ global fixed8 *rowdata_U = src_U + (y * (stride >> 1));
+ global fixed8 *rowdata_V = src_V + (y * (stride >> 1));
+ for( i = 0; i < filter_len; i++ )
+ {
+ result_U += ( hf_UV[x + i * (width >> 1)] * rowdata_U[hi_UV[x] + i]);
+ result_V += ( hf_UV[x + i * (width >> 1)] * rowdata_V[hi_UV[x] + i]);
+ }
+ dst_U[xy] = result_U;
+ dst_V[xy] = result_V;
+ }
+ }
+ );
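+/*
+ * frame_h_scale runs one work item per destination luma pixel; work items in
+ * the top-left quadrant of the grid additionally produce the corresponding
+ * U and V samples (the chroma planes are half width and half height).
+ */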
+
+/*******************************************************************************************************
+dst: Vertical scale destination;
+src: YUV content in opencl buf;
+vf_Y: Vertical filter coefficients for Y planes;
+vf_UV: Vertical filter coefficients for UV planes;
+vi_Y: Vertical filter index for Y planes;
+vi_UV: Vertical filter index for UV planes;
+src_height: Src height;
+filter_len: Length of filter;
+********************************************************************************************************/
+char *kernel_src_vscale = KERNEL (
+
+ kernel void frame_v_scale (
+ global fixed8 *src,
+ global float *vf_Y,
+ global float *vf_UV,
+ global int *vi_Y,
+ global int *vi_UV,
+ global fixed8 *dst,
+ int src_height,
+ int filter_len
+ )
+ {
+ int x = get_global_id( 0 );
+ int y = get_global_id( 1 );
+ int width = get_global_size( 0 );
+ int height = get_global_size( 1 );
+ float result_Y = 0, result_U = 0, result_V = 0;
+ int i = 0;
+
+ global fixed8 *src_Y = src;
+ global fixed8 *src_U = src_Y + src_height * width;
+ global fixed8 *src_V = src_U + (src_height >> 1) * (width >> 1);
+
+ global fixed8 *dst_Y = dst;
+ global fixed8 *dst_U = dst_Y + height * width;
+ global fixed8 *dst_V = dst_U + (height >> 1) * (width >> 1);
+
+ int xy = y * width + x;
+ for( i = 0; i < filter_len; i++ )
+ {
+ result_Y += vf_Y[y + i * height] * src_Y[(vi_Y[y] + i) * width + x];
+ }
+ dst_Y[xy] = result_Y;
+
+ if( y < (height >> 1) && x < (width >> 1) )
+ {
+ int xy = y * (width >> 1) + x;
+ for( i = 0; i < filter_len; i++ )
+ {
+ result_U += vf_UV[y + i * (height >> 1)] * src_U[(vi_UV[y] + i) * (width >> 1) + x];
+ result_V += vf_UV[y + i * (height >> 1)] * src_V[(vi_UV[y] + i) * (width >> 1) + x];
+ }
+ dst_U[xy] = result_U;
+ dst_V[xy] = result_V;
+ }
+ }
+ );
+
+/*******************************************************************************************************
+input: Input buffer;
+output: Output buffer;
+w: Width of frame;
+h: Height of frame;
+********************************************************************************************************/
+char *kernel_src_nvtoyuv = KERNEL (
+
+ kernel void nv12toyuv ( global char *input, global char* output, int w, int h )
+ {
+ int x = get_global_id( 0 );
+ int y = get_global_id( 1 );
+ int idx = y * (w >> 1) + x;
+ vstore4((vload4( 0, input + (idx << 2))), 0, output + (idx << 2)); //Y
+ char2 uv = vload2( 0, input + (idx << 1) + w * h );
+ output[idx + w * h] = uv.s0;
+ output[idx + w * h + ((w * h) >> 2)] = uv.s1;
+ }
+ );
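+/*
+ * nv12toyuv splits NV12 (a full Y plane followed by interleaved U/V bytes)
+ * into planar YUV: each work item copies four luma bytes with vstore4 and
+ * scatters one U/V pair into the separate U and V planes.
+ */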
+
+/*******************************************************************************************************
+dst: Horizontal scale destination;
+src: YUV content in opencl buf;
+yfilter: Opencl memory of horizontal filter coefficients for luma/alpha planes;
+yfilterPos: Opencl memory of horizontal filter starting positions for each dst[i] for luma/alpha planes;
+yfilterSize: Horizontal filter size for luma/alpha pixels;
+cfilter: Opencl memory of horizontal filter coefficients for chroma planes;
+cfilterPos: Opencl memory of horizontal filter starting positions for each dst[i] for chroma planes;
+cfilterSize: Horizontal filter size for chroma pixels;
+dstStride: Width of destination luma/alpha planes;
+dstChrStride: Width of destination chroma planes;
+********************************************************************************************************/
+
+char *kernel_src_hscaleall = KERNEL (
+
+ kernel void hscale_all_opencl (
+ global short *dst,
+ const global unsigned char *src,
+ const global short *yfilter,
+ const global int *yfilterPos,
+ int yfilterSize,
+ const global short *cfilter,
+ const global int *cfilterPos,
+ int cfilterSize,
+ int dstWidth,
+ int dstHeight,
+ int srcWidth,
+ int srcHeight,
+ int dstStride,
+ int dstChrStride,
+ int srcStride,
+ int srcChrStride)
+ {
+ int w = get_global_id(0);
+ int h = get_global_id(1);
+
+ int chrWidth = get_global_size(0);
+ int chrHeight = get_global_size(1);
+
+ int srcPos1 = h * srcStride + yfilterPos[w];
+ int srcPos2 = h * srcStride + yfilterPos[w + chrWidth];
+ int srcPos3 = (h + (srcHeight >> 1)) * srcStride + yfilterPos[w];
+ int srcPos4 = (h + (srcHeight >> 1)) * srcStride + yfilterPos[w + chrWidth];
+ int srcc1Pos = srcStride * srcHeight + (h) * (srcChrStride) + cfilterPos[w];
+ int srcc2Pos = srcc1Pos + ((srcChrStride)*(chrHeight));
+
+ int val1 = 0;
+ int val2 = 0;
+ int val3 = 0;
+ int val4 = 0;
+ int val5 = 0;
+ int val6 = 0;
+
+ int filterPos1 = yfilterSize * w;
+ int filterPos2 = yfilterSize * (w + chrWidth);
+ int cfilterPos1 = cfilterSize * w;
+
+ int j;
+ for (j = 0; j < yfilterSize; j++)
+ {
+ val1 += src[srcPos1 + j] * yfilter[filterPos1+ j];
+ val2 += src[srcPos2 + j] * yfilter[filterPos2 + j];
+ val3 += src[srcPos3 + j] * yfilter[filterPos1 + j];
+ val4 += src[srcPos4 + j] * yfilter[filterPos2 + j];
+ val5 += src[srcc1Pos+j] * cfilter[cfilterPos1 + j];
+ val6 += src[srcc2Pos+j] * cfilter[cfilterPos1 + j];
+ }
+ int dstPos1 = h *dstStride;
+ int dstPos2 = (h + chrHeight) * dstStride;
+
+ dst[dstPos1 + w] = ((val1 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val1 >> 7));
+ dst[dstPos1 + w + chrWidth] = ((val2 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val2 >> 7));
+ dst[dstPos2 + w] = ((val3 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val3 >> 7));
+ dst[dstPos2 + w + chrWidth] = ((val4 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val4 >> 7));
+
+ int dstPos3 = h * (dstChrStride) + w + dstStride * dstHeight;
+ int dstPos4 = h * (dstChrStride) + w + dstStride * dstHeight + ((dstChrStride) * chrHeight);
+ dst[dstPos3] = ((val5 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val5 >> 7));
+ dst[dstPos4] = ((val6 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val6 >> 7));
+ }
+ );
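+/*
+ * hscale_all_opencl writes one horizontally scaled sample into each quadrant
+ * of the luma plane plus one U and one V sample per work item. Results stay
+ * in 15-bit intermediates (the >> 7 with the (1 << 15) - 1 clamp) so the
+ * vertical pass can finish the filtering at higher precision.
+ */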
+
+char *kernel_src_hscalefast = KERNEL (
+
+ kernel void hscale_fast_opencl (
+ global short *dst,
+ const global unsigned char *src,
+ int xInc,
+ int chrXInc,
+ int dstWidth,
+ int dstHeight,
+ int srcWidth,
+ int srcHeight,
+ int dstStride,
+ int dstChrStride,
+ int srcStride,
+ int srcChrStride)
+ {
+
+ int w = get_global_id(0);
+ int h = get_global_id(1);
+
+ int chrWidth = get_global_size(0);
+ int chrHeight = get_global_size(1);
+ int xpos1 = 0;
+ int xpos2 = 0;
+ int xx = xpos1 >> 16;
+ int xalpha = (xpos1 & 0xFFFF) >> 9;
+ dst[h * dstStride + w] = (src[h * srcStride + xx] << 7) + (src[h * srcStride + xx + 1] -src[h * srcStride + xx]) * xalpha;
+ int lowpart = h + (chrHeight);
+ dst[lowpart * dstStride + w] = (src[lowpart * srcStride + xx] << 7) + (src[lowpart * srcStride + xx + 1] - src[lowpart * srcStride + xx]) * xalpha;
+
+ int inv_i = w * xInc >> 16;
+ if( inv_i >= srcWidth - 1)
+ {
+ dst[h*dstStride + w] = src[h*srcStride + srcWidth-1]*128;
+ dst[lowpart*dstStride + w] = src[lowpart*srcStride + srcWidth - 1] * 128;
+ }
+
+ int rightpart = w + (chrWidth);
+ xx = xpos2 >> 16;
+ xalpha = (xpos2 & 0xFFFF) >> 9;
+ dst[h * dstStride + rightpart] = (src[h *srcStride + xx] << 7) + (src[h * srcStride + xx + 1] - src[h * srcStride + xx]) * xalpha;
+ dst[lowpart * dstStride + rightpart] = (src[lowpart * srcStride + xx] << 7) + (src[lowpart * srcStride + xx + 1] - src[lowpart * srcStride + xx]) * xalpha;
+ inv_i = rightpart * xInc >> 16;
+ if( inv_i >= srcWidth - 1)
+ {
+ dst[h * dstStride + rightpart] = src[h * srcStride + srcWidth - 1] * 128;
+ dst[lowpart * dstStride + rightpart] = src[lowpart * srcStride + srcWidth - 1] * 128;
+ }
+
+ int xpos = 0;
+ xpos = chrXInc * w;
+ xx = xpos >> 16;
+ xalpha = (xpos & 0xFFFF) >> 9;
+ src += srcStride * srcHeight;
+ dst += dstStride * dstHeight;
+ dst[h * (dstChrStride) + w] = (src[h * (srcChrStride) + xx] * (xalpha^127) + src[h * (srcChrStride) + xx + 1] * xalpha);
+ inv_i = w * xInc >> 16;
+ if( inv_i >= (srcWidth >> 1) - 1)
+ {
+ dst[h * (dstChrStride) + w] = src[h * (srcChrStride) + (srcWidth >> 1) -1]*128;
+ }
+
+ xpos = chrXInc * (w);
+ xx = xpos >> 16;
+ src += srcChrStride * srcHeight >> 1;
+ dst += (dstChrStride * chrHeight);
+ dst[h * (dstChrStride) + w] = (src[h * (srcChrStride) + xx] * (xalpha^127) + src[h * (srcChrStride) + xx + 1 ] * xalpha);
+
+ if( inv_i >= (srcWidth >> 1) - 1)
+ {
+ //v channel:
+ dst[h * (dstChrStride) + w] = src[h * (srcChrStride) + (srcWidth >> 1) -1] * 128;
+ }
+ }
+ );
+
+char *kernel_src_vscalealldither = KERNEL (
+
+ kernel void vscale_all_dither_opencl (
+ global unsigned char *dst,
+ const global short *src,
+ const global short *yfilter,
+ int yfilterSize,
+ const global short *cfilter,
+ int cfilterSize,
+ const global int *yfilterPos,
+ const global int *cfilterPos,
+ int dstWidth,
+ int dstHeight,
+ int srcWidth,
+ int srcHeight,
+ int dstStride,
+ int dstChrStride,
+ int srcStride,
+ int srcChrStride)
+ {
+ const unsigned char hb_dither_8x8_128[8][8] = {
+ { 36, 68, 60, 92, 34, 66, 58, 90, },
+ { 100, 4, 124, 28, 98, 2, 122, 26, },
+ { 52, 84, 44, 76, 50, 82, 42, 74, },
+ { 116, 20, 108, 12, 114, 18, 106, 10, },
+ { 32, 64, 56, 88, 38, 70, 62, 94, },
+ { 96, 0, 120, 24, 102, 6, 126, 30, },
+ { 48, 80, 40, 72, 54, 86, 46, 78, },
+ { 112, 16, 104, 8, 118, 22, 110, 14, },
+ };
+
+
+ int w = get_global_id(0);
+ int h = get_global_id(1);
+
+ int chrWidth = get_global_size(0);
+ int chrHeight = get_global_size(1);
+ const unsigned char *local_up_dither;
+ const unsigned char *local_down_dither;
+
+ local_up_dither = hb_dither_8x8_128[h & 7];
+ local_down_dither = hb_dither_8x8_128[(h + chrHeight) & 7];
+
+ //yscale;
+ int srcPos1 = (yfilterPos[h]) * srcStride + w;
+ int srcPos2 = (yfilterPos[h]) * srcStride + w + (chrWidth);
+ int srcPos3 = (yfilterPos[h + chrHeight]) * srcStride + w;
+ int srcPos4 = (yfilterPos[h + chrHeight]) * srcStride + w + chrWidth;
+ int src1Pos = dstStride * srcHeight + (cfilterPos[h]) * dstChrStride + (w);
+ int src2Pos = dstStride * srcHeight + (dstChrStride*(srcHeight>>1)) + (cfilterPos[h]) * dstChrStride + w;
+
+ int val1 = (local_up_dither[w & 7] << 12); //y offset is 0;
+ int val2 = (local_up_dither[(w + chrWidth) & 7] << 12);
+ int val3 = (local_down_dither[w &7] << 12);
+ int val4 = (local_down_dither[(w + chrWidth) & 7] << 12);
+ int val5 = (local_up_dither[w & 7] << 12);
+        int val6 = (local_up_dither[(w + 3) & 7] << 12); // 3 is the offset of the chroma channel.
+
+ int j;
+ int filterPos1 = h * yfilterSize;
+ int filterPos2 = ( h + chrHeight ) * yfilterSize;
+ for(j = 0; j < yfilterSize; j++)
+ {
+ val1 += src[srcPos1] * yfilter[filterPos1 + j];
+ srcPos1 += srcStride;
+ val2 += src[srcPos2] * yfilter[filterPos1 + j];
+ srcPos2 += srcStride;
+ val3 += src[srcPos3] * yfilter[filterPos2 + j];
+ srcPos3 += srcStride;
+ val4 += src[srcPos4] * yfilter[filterPos2 + j];
+ srcPos4 += srcStride;
+ val5 += src[src1Pos] * cfilter[filterPos1 + j];
+ val6 += src[src2Pos] * cfilter[filterPos1 + j];
+ src1Pos += dstChrStride;
+ src2Pos += dstChrStride;
+ }
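+        // The expressions below are a branchless clamp of (val >> 19) to [0,255]:
+        // for negative values the arithmetic shift of the negation yields 0, and
+        // for values above 255 it yields -1, which stores as 0xFF in the uchar dst.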
+ dst[h * dstStride + w] = (((val1 >> 19)&(~0xFF)) ? ((-(val1 >> 19)) >> 31) : (val1 >> 19));
+ dst[h * dstStride + w + chrWidth] = (((val2 >> 19)&(~0xFF)) ? ((-(val2 >> 19)) >> 31) : (val2 >> 19));
+ dst[(h + chrHeight) * dstStride + w] = (((val3 >> 19)&(~0xFF)) ? ((-(val3 >> 19)) >> 31) : (val3 >> 19));
+ dst[(h + chrHeight) * dstStride + w + chrWidth] = (((val4 >> 19)&(~0xFF)) ? ((-(val4 >> 19)) >> 31) : (val4 >> 19));
+
+ int dst1Pos = dstStride * dstHeight + h*(dstChrStride)+(w);
+ int dst2Pos = (dstChrStride * chrHeight) + dst1Pos;
+ dst[dst1Pos] = (((val5 >> 19)&(~0xFF)) ? ((-(val5 >> 19)) >> 31) : (val5 >> 19));
+ dst[dst2Pos] = (((val6 >> 19)&(~0xFF)) ? ((-(val6 >> 19)) >> 31) : (val6 >> 19));
+ }
+ );
+
+char *kernel_src_vscaleallnodither = KERNEL (
+
+ kernel void vscale_all_nodither_opencl (
+ global unsigned char *dst,
+ const global short *src,
+ const global short *yfilter,
+ int yfilterSize,
+ const global short *cfilter,
+ int cfilterSize,
+ const global int *yfilterPos,
+ const global int *cfilterPos,
+ int dstWidth,
+ int dstHeight,
+ int srcWidth,
+ int srcHeight,
+ int dstStride,
+ int dstChrStride,
+ int srcStride,
+ int srcChrStride)
+ {
+ const unsigned char hb_sws_pb_64[8] = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+ };
+
+ int w = get_global_id(0);
+ int h = get_global_id(1);
+
+ int chrWidth = get_global_size(0);
+ int chrHeight = get_global_size(1);
+ const unsigned char *local_up_dither;
+ const unsigned char *local_down_dither;
+
+ local_up_dither = hb_sws_pb_64;
+ local_down_dither = hb_sws_pb_64;
+
+
+ //yscale;
+ int srcPos1 = (yfilterPos[h]) * srcStride + w;
+ int srcPos2 = (yfilterPos[h]) * srcStride + w + (chrWidth);
+ int srcPos3 = (yfilterPos[h + chrHeight]) * srcStride + w;
+ int srcPos4 = (yfilterPos[h + chrHeight]) * srcStride + w + chrWidth;
+ int src1Pos = dstStride * srcHeight + (cfilterPos[h]) * dstChrStride + (w);
+ int src2Pos = dstStride * srcHeight + (dstChrStride*(srcHeight>>1)) + (cfilterPos[h]) * dstChrStride + w;
+
+ int val1 = (local_up_dither[w & 7] << 12); //y offset is 0;
+ int val2 = (local_up_dither[(w + chrWidth) & 7] << 12);
+ int val3 = (local_down_dither[w &7] << 12);
+ int val4 = (local_down_dither[(w + chrWidth) & 7] << 12);
+ int val5 = (local_up_dither[w & 7] << 12);
+        int val6 = (local_up_dither[(w + 3) & 7] << 12); // 3 is the offset of the chroma channel.
+
+
+ int j;
+ int filterPos1 = h * yfilterSize;
+ int filterPos2 = ( h + chrHeight ) * yfilterSize;
+ for(j = 0; j < yfilterSize; j++)
+ {
+ val1 += src[srcPos1] * yfilter[filterPos1 + j];
+ srcPos1 += srcStride;
+ val2 += src[srcPos2] * yfilter[filterPos1 + j];
+ srcPos2 += srcStride;
+ val3 += src[srcPos3] * yfilter[filterPos2 + j];
+ srcPos3 += srcStride;
+ val4 += src[srcPos4] * yfilter[filterPos2 + j];
+ srcPos4 += srcStride;
+ val5 += src[src1Pos] * cfilter[filterPos1 + j];
+ val6 += src[src2Pos] * cfilter[filterPos1 + j];
+ src1Pos += dstChrStride;
+ src2Pos += dstChrStride;
+ }
+ dst[h * dstStride + w] = (((val1 >> 19)&(~0xFF)) ? ((-(val1 >> 19)) >> 31) : (val1 >> 19));
+ dst[h * dstStride + w + chrWidth] = (((val2 >> 19)&(~0xFF)) ? ((-(val2 >> 19)) >> 31) : (val2 >> 19));
+ dst[(h + chrHeight) * dstStride + w] = (((val3 >> 19)&(~0xFF)) ? ((-(val3 >> 19)) >> 31) : (val3 >> 19));
+        dst[(h + chrHeight) * dstStride + w + chrWidth] = (((val4 >> 19)&(~0xFF)) ? ((-(val4 >> 19)) >> 31) : (val4 >> 19));
+
+ int dst1Pos = dstStride * dstHeight + h * (dstChrStride) + (w);
+ int dst2Pos = (dstChrStride * chrHeight) + dst1Pos;
+ dst[dst1Pos] = (((val5 >> 19)&(~0xFF)) ? ((-(val5 >> 19)) >> 31) : (val5 >> 19));
+ dst[dst2Pos] = (((val6 >> 19)&(~0xFF)) ? ((-(val6 >> 19)) >> 31) : (val6 >> 19));
+ }
+ );
+
+char *kernel_src_vscalefast = KERNEL (
+
+ kernel void vscale_fast_opencl (
+ global unsigned char *dst,
+ const global short *src,
+ const global int *yfilterPos,
+ const global int *cfilterPos,
+ int dstWidth,
+ int dstHeight,
+ int srcWidth,
+ int srcHeight,
+ int dstStride,
+ int dstChrStride,
+ int srcStride,
+ int srcChrStride)
+ {
+ const unsigned char hb_sws_pb_64[8] = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+ };
+
+ int w = get_global_id(0);
+ int h = get_global_id(1);
+
+ int chrWidth = get_global_size(0);
+ int chrHeight = get_global_size(1);
+
+ const unsigned char *local_up_dither;
+ const unsigned char *local_down_dither;
+
+ local_up_dither = hb_sws_pb_64;
+ local_down_dither = hb_sws_pb_64;
+
+
+ int rightpart = w + chrWidth;
+ int bh = h + chrHeight; // bottom part
+ short val1 = (src[(yfilterPos[h]) * dstStride + w] + local_up_dither[(w + 0) & 7]) >> 7; //lum offset is 0;
+ short val2 = (src[(yfilterPos[h]) * dstStride + rightpart] + local_up_dither[rightpart & 7]) >> 7;
+ short val3 = (src[(yfilterPos[bh]) * dstStride + w] + local_down_dither[w & 7]) >> 7;
+ short val4 = (src[(yfilterPos[bh]) * dstStride + rightpart] + local_down_dither[rightpart & 7]) >> 7;
+ dst[h * dstStride + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));
+ dst[h * dstStride + rightpart] = ((val2&(~0xFF)) ? ((-val2) >> 31) : (val2));
+ dst[bh * dstStride + w] = ((val3&(~0xFF)) ? ((-val3) >> 31) : (val3));
+ dst[bh * dstStride + rightpart] = ((val4&(~0xFF)) ? ((-val4) >> 31) : (val4));
+
+ src += dstStride * srcHeight;
+ dst += dstStride * dstHeight;
+ val1 = (src[cfilterPos[h] * (dstChrStride) + w] + local_up_dither[ w & 7]) >> 7;
+ dst[h * (dstChrStride) + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));
+
+ src += dstChrStride * (srcHeight >> 1);
+ dst += dstChrStride * chrHeight;
+ val1 = (src[cfilterPos[h] * dstChrStride + w] + local_up_dither[ (w + 3) & 7] ) >> 7;
+ dst[h * dstChrStride + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));
+
+ } + ); + +char *kernel_src_scale = KERNEL ( + +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void frame_scale(__global uchar *dst, + __global const uchar *src, + const float xscale, + const float yscale, + const int srcPlaneOffset0, + const int srcPlaneOffset1, + const int srcPlaneOffset2, + const int dstPlaneOffset0, + const int dstPlaneOffset1, + const int dstPlaneOffset2, + const int srcRowWords0, + const int srcRowWords1, + const int srcRowWords2, + const int dstRowWords0, + const int dstRowWords1, + const int dstRowWords2, + const int srcWidth, + const int srcHeight, + const int dstWidth, + const int dstHeight, + __global const float4* restrict xweights, + __global const float4* restrict yweights + ) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Abort work items outside the dst image bounds. + + if ((get_group_id(0) * 64 >= (dstWidth >> ((z == 0) ? 0 : 1))) || (get_group_id(1) * 16 >= (dstHeight >> ((z == 0) ? 0 : 1)))) + return; + + const int srcPlaneOffset = (z == 0) ? srcPlaneOffset0 : ((z == 1) ? srcPlaneOffset1 : srcPlaneOffset2); + const int dstPlaneOffset = (z == 0) ? dstPlaneOffset0 : ((z == 1) ? dstPlaneOffset1 : dstPlaneOffset2); + const int srcRowWords = (z == 0) ? srcRowWords0: ((z == 1) ? srcRowWords1 : srcRowWords2); + const int dstRowWords = (z == 0) ? dstRowWords0: ((z == 1) ? dstRowWords1 : dstRowWords2); + + __local uchar pixels[64 * 36]; + const int localRowPixels = 64; + const int groupHeight = 16; // src pixel height output by the workgroup + const int ypad = 2; + const int localx = get_local_id(0); + + const int globalStartRow = floor((get_group_id(1) * groupHeight) / yscale); + const int globalRowCount = ceil(groupHeight / yscale) + 2 * ypad; + + float4 weights = xweights[x]; + int4 woffs = floor(x / xscale); + woffs += (int4)(-1, 0, 1, 2); + woffs = clamp(woffs, 0, (srcWidth >> ((z == 0) ? 0 : 1)) - 1); + const int maxy = (srcHeight >> ((z == 0) ? 0 : 1)) - 1; + + // Scale x from global into LDS + + for (int i = 0; i <= globalRowCount; ++i) { + int4 offs = srcPlaneOffset + clamp(globalStartRow - ypad + i, 0, maxy) * srcRowWords; + offs += woffs; + pixels[localx + i * localRowPixels] = convert_uchar(clamp(round(dot(weights, + (float4)(src[offs.x], src[offs.y], src[offs.z], src[offs.w]))), 0.0f, 255.0f)); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scale y from LDS into global + + if (x >= dstWidth >> ((z == 0) ? 0 : 1)) + return; + + int off = dstPlaneOffset + x + (get_group_id(1) * groupHeight) * dstRowWords; + + for (int i = 0; i < groupHeight; ++i) { + if (y >= dstHeight >> ((z == 0) ? 0 : 1)) + break; + int localy = floor((get_group_id(1) * groupHeight + i) / yscale); + localy = localy - globalStartRow + ypad; + int loff = localx + localy * localRowPixels; + dst[off] = convert_uchar(clamp(round(dot(yweights[get_group_id(1) * groupHeight + i], + (float4)(pixels[loff - localRowPixels], pixels[loff], pixels[loff + localRowPixels] + , pixels[loff + localRowPixels * 2]))), 0.0f, 255.0f)); + off += dstRowWords; + } +} +); + + +char *kernel_src_yadif_filter = KERNEL( + void filter_v6( + global unsigned char *dst, + global unsigned char *prev,
+ global unsigned char *cur,
+ global unsigned char *next,
+ int x,
+ int y,
+ int width,
+ int height,
+ int parity,
+ int inlinesize,
+ int outlinesize,
+ int inmode,
+ int uvflag
+ )
+ {
+
+ int flag = uvflag * (y >=height) * height;
+ int prefs = select(-(inlinesize), inlinesize,((y+1) - flag) <height);
+ int mrefs = select(inlinesize, -(inlinesize),y - flag);
+ int mode = select(inmode,2,(y - flag==1) || (y - flag + 2==height));
+ int score;
+
+ global unsigned char *prev2 = parity ? prev : cur ;
+ global unsigned char *next2 = parity ? cur : next;
+ int index = x + y * inlinesize;
+ int outindex = x + y * outlinesize;
+ int c = cur[index + mrefs];
+ int d = (prev2[index] + next2[index])>>1;
+ int e = cur[index + prefs];
+ int temporal_diff0 = abs((prev2[index]) - (next2[index]));
+ int temporal_diff1 =(abs(prev[index + mrefs] - c) + abs(prev[index + prefs] - e) )>>1;
+ int temporal_diff2 =(abs(next[index + mrefs] - c) + abs(next[index + prefs] - e) )>>1;
+ int diff = max(max(temporal_diff0>>1, temporal_diff1), temporal_diff2);
+ int spatial_pred = (c+e)>>1;
+ int spatial_score = abs(cur[index + mrefs-1] - cur[index + prefs-1]) + abs(c-e) + abs(cur[index + mrefs+1] - cur[index + prefs+1]) - 1;
+ //check -1
+ score = abs(cur[index + mrefs-2] - cur[index + prefs])
+ + abs(cur[index + mrefs-1] - cur[index + prefs+1])
+ + abs(cur[index + mrefs] - cur[index + prefs+2]);
+ if (score < spatial_score)
+ {
+ spatial_score= score;
+ spatial_pred= (cur[index + mrefs-1] + cur[index + prefs+1])>>1;
+ }
+ //check -2
+ score = abs(cur[index + mrefs-3] - cur[index + prefs+1])
+ + abs(cur[index + mrefs-2] - cur[index + prefs+2])
+ + abs(cur[index + mrefs-1] - cur[index + prefs+3]);
+ if (score < spatial_score)
+ {
+ spatial_score= score;
+ spatial_pred= (cur[index + mrefs-2] + cur[index + prefs+2])>>1;
+ }
+ //check 1
+ score = abs(cur[index + mrefs] - cur[index + prefs-2])
+ + abs(cur[index + mrefs+1] - cur[index + prefs-1])
+ + abs(cur[index + mrefs+2] - cur[index + prefs]);
+ if (score < spatial_score)
+ {
+ spatial_score= score;
+ spatial_pred= (cur[index + mrefs+1] + cur[index + prefs-1])>>1;
+ }
+ //check 2
+ score = abs(cur[index + mrefs+1] - cur[index + prefs-3])
+ + abs(cur[index + mrefs+2] - cur[index + prefs-2])
+ + abs(cur[index + mrefs+3] - cur[index + prefs-1]);
+ if (score < spatial_score)
+ {
+ spatial_score= score;
+ spatial_pred= (cur[index + mrefs+2] + cur[index + prefs-2])>>1;
+ }
+ if (mode < 2)
+ {
+ int b = (prev2[index + (mrefs<<1)] + next2[index + (mrefs<<1)])>>1;
+ int f = (prev2[index + (prefs<<1)] + next2[index + (prefs<<1)])>>1;
+ int diffmax = max(max(d-e, d-c), min(b-c, f-e));
+ int diffmin = min(min(d-e, d-c), max(b-c, f-e));
+
+ diff = max(max(diff, diffmin), -diffmax);
+ }
+ if (spatial_pred > d + diff)
+ {
+ spatial_pred = d + diff;
+ }
+ else if (spatial_pred < d - diff)
+ {
+ spatial_pred = d - diff;
+ }
+
+ dst[outindex] = spatial_pred;
+ }
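+    // The yadif_filter kernel below maps the first two thirds of the global X
+    // range onto the luma plane and the remaining third onto the half-width
+    // chroma planes; filter_v6 then picks the best edge-directed spatial
+    // prediction and clamps it against the temporal difference for each pixel
+    // of the selected field.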
+
+ kernel void yadif_filter(
+ global unsigned char *dst,
+ global unsigned char *prev,
+ global unsigned char *cur,
+ global unsigned char *next,
+ int parity,
+ int inlinesizeY,
+ int inlinesizeUV,
+ int outlinesizeY,
+ int outlinesizeUV,
+ int mode)
+ {
+ int x=get_global_id(0);
+ int y=(get_global_id(1)<<1) + (!parity);
+ int width=(get_global_size(0)<<1)/3;
+ int height=get_global_size(1)<<1;
+
+
+ global unsigned char *dst_Y=dst;
+ global unsigned char *dst_U=dst_Y+height*outlinesizeY;
+
+ global unsigned char *prev_Y=prev;
+ global unsigned char *prev_U=prev_Y+height*inlinesizeY;
+
+ global unsigned char *cur_Y=cur;
+ global unsigned char *cur_U=cur_Y+height*inlinesizeY;
+
+ global unsigned char *next_Y=next;
+ global unsigned char *next_U=next_Y+height*inlinesizeY;
+
+ if(x < width)
+ {
+ filter_v6(dst_Y,prev_Y,cur_Y,next_Y,x,y,width,height,parity,inlinesizeY,outlinesizeY,mode,0);
+ }
+ else
+ {
+ x = x - width;
+ filter_v6(dst_U,prev_U,cur_U,next_U,x,y,width>>1,height>>1,parity,inlinesizeUV,outlinesizeUV,mode,1);
+ }
+ }
+ );
+
+#endif
diff --git a/libhb/openclwrapper.c b/libhb/openclwrapper.c new file mode 100644 index 000000000..9a7e9888d --- /dev/null +++ b/libhb/openclwrapper.c @@ -0,0 +1,1261 @@ +/* openclwrapper.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+ */
+
+#ifdef USE_OPENCL
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "openclwrapper.h"
+#include "openclkernels.h"
+
+//#define USE_EXTERNAL_KERNEL
+#ifdef SYS_MINGW
+#include <windows.h>
+#endif
+
+#if defined(__APPLE__)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#if defined(_MSC_VER)
+#define strcasecmp strcmpi
+#endif
+
+#define MAX_KERNEL_STRING_LEN 64
+#define MAX_CLFILE_NUM 50
+#define MAX_CLKERNEL_NUM 200
+#define MAX_CLFILE_PATH 255
+#define MAX_KERNEL_NUM 50
+#define MAX_KERNEL_NAME_LEN 64
+
+#ifndef INVALID_HANDLE_VALUE
+#define INVALID_HANDLE_VALUE NULL
+#endif
+
+//#define THREAD_PRIORITY_TIME_CRITICAL 15
+
+enum VENDOR
+{
+ AMD = 0,
+ Intel,
+ NVIDIA,
+ others
+};
+typedef struct _GPUEnv
+{
+ //share vb in all modules in hb library
+ cl_platform_id platform;
+ cl_device_type dType;
+ cl_context context;
+ cl_device_id * devices;
+ cl_device_id dev;
+ cl_command_queue command_queue;
+ cl_kernel kernels[MAX_CLFILE_NUM];
+ cl_program programs[MAX_CLFILE_NUM]; //one program object maps one kernel source file
+ char kernelSrcFile[MAX_CLFILE_NUM][256]; //the max len of kernel file name is 256
+ int file_count; // only one kernel file
+
+ char kernel_names[MAX_CLKERNEL_NUM][MAX_KERNEL_STRING_LEN+1];
+ cl_kernel_function kernel_functions[MAX_CLKERNEL_NUM];
+ int kernel_count;
+ int isUserCreated; // 1: created , 0:no create and needed to create by opencl wrapper
+ enum VENDOR vendor;
+}GPUEnv;
+
+typedef struct
+{
+ char kernelName[MAX_KERNEL_NAME_LEN+1];
+ char * kernelStr;
+}hb_kernel_node;
+ +static GPUEnv gpu_env; +static int isInited = 0; +static int useBuffers = 0; +static hb_kernel_node gKernels[MAX_KERNEL_NUM]; + +#define ADD_KERNEL_CFG( idx, s, p ){\ + strcpy( gKernels[idx].kernelName, s );\
+ gKernels[idx].kernelStr = p;\
+ strcpy( gpu_env.kernel_names[idx], s );\
+ gpu_env.kernel_count++; }
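+
+/*
+ * ADD_KERNEL_CFG registers a kernel by name: it fills the local gKernels[]
+ * slot (with an optional source string) and mirrors the name into
+ * gpu_env.kernel_names so the wrapper can look kernels up by name later.
+ */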
+
+
+/**
+ * hb_confirm_gpu_type
+ * Returns 0 when a supported GPU OpenCL device (AMD, Intel or NVIDIA) is
+ * found, -1 otherwise.
+ */
+int hb_confirm_gpu_type()
+{
+ int status = 1;
+ unsigned int i, j;
+ cl_uint numPlatforms = 0;
+ status = clGetPlatformIDs(0,NULL,&numPlatforms);
+ if(status != 0)
+ {
+ goto end;
+ }
+ if(numPlatforms > 0)
+ {
+ cl_platform_id* platforms = (cl_platform_id* )malloc (numPlatforms * sizeof(cl_platform_id));
+ status = clGetPlatformIDs (numPlatforms, platforms, NULL);
+ if (status != 0)
+ {
+ goto end;
+ }
+ for (i=0; i < numPlatforms; i++)
+ {
+ char pbuff[100];
+ cl_uint numDevices;
+ status = clGetPlatformInfo( platforms[i],
+ CL_PLATFORM_VENDOR,
+ sizeof (pbuff),
+ pbuff,
+ NULL);
+ if (status)
+ continue;
+ status = clGetDeviceIDs( platforms[i],
+ CL_DEVICE_TYPE_GPU ,
+ 0 ,
+ NULL ,
+ &numDevices);
+
+ cl_device_id *devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
+ status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
+ for (j = 0; j < numDevices; j++)
+ {
+ char dbuff[100];
+ status = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(dbuff), dbuff, NULL);
+ if (!strcmp(dbuff, "Advanced Micro Devices, Inc.") ||
+ !strcmp(dbuff, "Intel(R) Corporation") ||
+#ifdef __APPLE__
+ !strcmp(dbuff, "AMD") ||
+ /* MacBook Pro, AMD ATI Radeon HD 6750M, OS X 10.8.3 */
+ !strcmp(dbuff, "NVIDIA") ||
+ /* MacBook Pro, NVIDIA GeForce GT 330M, OS X 10.7.4 */
+#endif
+ !strcmp(dbuff, "NVIDIA Corporation"))
+ {
+ return 0;
+ }
+ }
+
+ if ( status != CL_SUCCESS )
+ continue;
+ if( numDevices )
+ break;
+ }
+ free( platforms );
+ }
+ end:
+ return -1;
+}
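+
+/*
+ * Hypothetical usage sketch (not part of this patch): a caller could gate the
+ * OpenCL path on this probe before touching any cl_* API, e.g.
+ *
+ *     if (hb_confirm_gpu_type() == 0)
+ *     {
+ *         // a supported AMD / Intel / NVIDIA GPU device is present,
+ *         // so it is safe to try the OpenCL scaling path
+ *     }
+ */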
+
+/**
+ * hb_regist_opencl_kernel
+ */
+int hb_regist_opencl_kernel()
+{
+    //if( !gpu_env.isUserCreated )
+    //    memset( &gpu_env, 0, sizeof(gpu_env) );
+    //Comment for posterity: When in doubt just zero out a structure full of pointers to allocated resources.
+
+    gpu_env.file_count = 0; //argc;
+    gpu_env.kernel_count = 0UL;
+
+    ADD_KERNEL_CFG( 0, "frame_scale", NULL )
+    ADD_KERNEL_CFG( 1, "yadif_filter", NULL )
+
+    return 0;
+}
+
+/**
+ * hb_convert_to_string
+ * Reads a kernel source file into a newly allocated, null-terminated string.
+ * @param filename - path of the source file
+ * @param source   - receives the file contents (caller frees)
+ * @param gpu_info -
+ * @param idx      -
+ */
+int hb_convert_to_string( const char *filename, char **source, GPUEnv *gpu_info, int idx )
+{
+ int file_size;
+ size_t result;
+ FILE * file = NULL;
+ file_size = 0;
+ result = 0;
+ file = fopen( filename, "rb+" );
+
+ if( file!=NULL )
+ {
+ fseek( file, 0, SEEK_END );
+
+ file_size = ftell( file );
+ rewind( file );
+ *source = (char*)malloc( sizeof(char) * file_size + 1 );
+ if( *source == (char*)NULL )
+ {
+ return(0);
+ }
+ result = fread( *source, 1, file_size, file );
+ if( result != file_size )
+ {
+ free( *source );
+ return(0);
+ }
+ (*source)[file_size] = '\0';
+ fclose( file );
+
+ return(1);
+ }
+ return(0);
+}
+
+/**
+ * hb_binary_generated
+ * @param context -
+ * @param cl_file_name -
+ * @param fhandle -
+ */
+int hb_binary_generated( cl_context context, const char * cl_file_name, FILE ** fhandle )
+{
+ int i = 0;
+ cl_int status;
+ cl_uint numDevices;
+ cl_device_id *devices;
+ char * str = NULL;
+ FILE * fd = NULL;
+
+ status = clGetContextInfo( context,
+ CL_CONTEXT_NUM_DEVICES,
+ sizeof(numDevices),
+ &numDevices,
+ NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: Get context info failed" );
+ return 0;
+ }
+
+ devices = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );
+ if( devices == NULL )
+ {
+ hb_log( "OpenCL: No device found" );
+ return 0;
+ }
+
+ /* grab the handles to all of the devices in the context. */
+ status = clGetContextInfo( context,
+ CL_CONTEXT_DEVICES,
+ sizeof(cl_device_id) * numDevices,
+ devices,
+ NULL );
+
+ status = 0;
+ /* dump out each binary into its own separate file. */
+ for (i = 0; i < numDevices; i++)
+ {
+ char fileName[256] = { 0 };
+ char cl_name[128] = { 0 };
+ if (devices[i])
+ {
+ char deviceName[1024];
+ status = clGetDeviceInfo(devices[i],
+ CL_DEVICE_NAME,
+ sizeof(deviceName),
+ deviceName,
+ NULL);
+
+ str = (char*)strstr(cl_file_name, ".cl");
+ memcpy(cl_name, cl_file_name, str - cl_file_name);
+ cl_name[str - cl_file_name] = '\0';
+ sprintf(fileName, "./%s - %s.bin", cl_name, deviceName);
+ fd = fopen(fileName, "rb");
+ status = fd != NULL;
+ }
+ }
+
+ if( devices != NULL )
+ {
+ free( devices );
+ devices = NULL;
+ }
+
+ if( fd != NULL )
+ *fhandle = fd;
+
+ return status;
+}
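+
+/*
+ * hb_binary_generated probes for a previously cached "<kernel> - <device>.bin"
+ * file for the devices in the context; when one is found it returns non-zero
+ * and hands back the open FILE handle through fhandle.
+ */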
+
+/**
+ * hb_write_binary_to_file
+ * @param fileName -
+ * @param binary -
+ * @param numBytes -
+ */
+int hb_write_binary_to_file( const char* fileName, const char* binary, size_t numBytes )
+{
+ FILE *output = NULL;
+ output = fopen( fileName, "wb" );
+ if( output == NULL )
+ return 0;
+
+    fwrite( binary, sizeof(char), numBytes, output );
+ fclose( output );
+
+ return 1;
+}
+
+/**
+ * hb_generat_bin_from_kernel_source
+ * @param program -
+ * @param cl_file_name -
+ */
+int hb_generat_bin_from_kernel_source( cl_program program, const char * cl_file_name )
+{
+ int i = 0;
+ cl_int status;
+ cl_uint numDevices;
+ size_t *binarySizes;
+ cl_device_id *devices;
+ char **binaries;
+ char *str = NULL;
+
+ status = clGetProgramInfo( program,
+ CL_PROGRAM_NUM_DEVICES,
+ sizeof(numDevices),
+ &numDevices,
+ NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_NUM_DEVICES failed");
+ return 0;
+ }
+
+ devices = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );
+ if( devices == NULL )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: no device found");
+ return 0;
+ }
+
+ /* grab the handles to all of the devices in the program. */
+ status = clGetProgramInfo( program,
+ CL_PROGRAM_DEVICES,
+ sizeof(cl_device_id) * numDevices,
+ devices,
+ NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_DEVICES failed");
+ return 0;
+ }
+
+ /* figure out the sizes of each of the binaries. */
+ binarySizes = (size_t*)malloc( sizeof(size_t) * numDevices );
+
+ status = clGetProgramInfo( program,
+ CL_PROGRAM_BINARY_SIZES,
+ sizeof(size_t) * numDevices,
+ binarySizes, NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_BINARY_SIZES failed");
+ return 0;
+ }
+
+ /* copy over all of the generated binaries. */
+ binaries = (char**)malloc( sizeof(char *) * numDevices );
+ if( binaries == NULL )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: malloc for binaries failed");
+ return 0;
+ }
+
+ for( i = 0; i < numDevices; i++ )
+ {
+ if( binarySizes[i] != 0 )
+ {
+ binaries[i] = (char*)malloc( sizeof(char) * binarySizes[i] );
+ if( binaries[i] == NULL )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: malloc for binaries[%d] failed", i);
+ return 0;
+ }
+ }
+ else
+ {
+ binaries[i] = NULL;
+ }
+ }
+
+ status = clGetProgramInfo( program,
+ CL_PROGRAM_BINARIES,
+ sizeof(char *) * numDevices,
+ binaries,
+ NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_BINARIES failed");
+ return 0;
+ }
+
+ /* dump out each binary into its own separate file. */
+ for (i = 0; i < numDevices; i++)
+ {
+ char fileName[256] = {0};
+ char cl_name[128] = {0};
+ if (binarySizes[i])
+ {
+ char deviceName[1024];
+ status = clGetDeviceInfo(devices[i],
+ CL_DEVICE_NAME,
+ sizeof(deviceName),
+ deviceName,
+ NULL);
+
+ str = (char*)strstr( cl_file_name, (char*)".cl" );
+ memcpy(cl_name, cl_file_name, str - cl_file_name);
+ cl_name[str - cl_file_name] = '\0';
+ sprintf(fileName, "./%s - %s.bin", cl_name, deviceName);
+
+ if (!hb_write_binary_to_file(fileName, binaries[i], binarySizes[i]))
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: unable to write kernel, writing to temporary directory instead.");
+ return 0;
+ }
+ }
+ }
+
+ // Release all resources and memory
+ for( i = 0; i < numDevices; i++ )
+ {
+ if( binaries[i] != NULL )
+ {
+ free( binaries[i] );
+ binaries[i] = NULL;
+ }
+ }
+
+ if( binaries != NULL )
+ {
+ free( binaries );
+ binaries = NULL;
+ }
+
+ if( binarySizes != NULL )
+ {
+ free( binarySizes );
+ binarySizes = NULL;
+ }
+
+ if( devices != NULL )
+ {
+ free( devices );
+ devices = NULL;
+ }
+ return 1;
+}
+
+
+/**
+ * hb_init_opencl_attr
+ * @param env -
+ */
+int hb_init_opencl_attr( OpenCLEnv * env )
+{
+ if( gpu_env.isUserCreated )
+ return 1;
+
+ gpu_env.context = env->context;
+ gpu_env.platform = env->platform;
+ gpu_env.dev = env->devices;
+ gpu_env.command_queue = env->command_queue;
+
+ gpu_env.isUserCreated = 1;
+
+ return 0;
+}
+
+/**
+ * hb_create_kernel
+ * @param kernelname -
+ * @param env -
+ */
+int hb_create_kernel( char * kernelname, KernelEnv * env )
+{
+ int status;
+ env->kernel = clCreateKernel( gpu_env.programs[0], kernelname, &status );
+ env->context = gpu_env.context;
+ env->command_queue = gpu_env.command_queue;
+ return status != CL_SUCCESS ? 1 : 0;
+}
+
+/**
+ * hb_release_kernel
+ * @param env -
+ */
+int hb_release_kernel( KernelEnv * env )
+{
+ int status = clReleaseKernel( env->kernel );
+ return status != CL_SUCCESS ? 1 : 0;
+}
+
+/**
+ * hb_init_opencl_env
+ * @param gpu_info -
+ */
+
+static int init_once = 0;
+int hb_init_opencl_env( GPUEnv *gpu_info )
+{
+ size_t length;
+ cl_int status;
+ cl_uint numPlatforms, numDevices;
+ cl_platform_id *platforms;
+ cl_context_properties cps[3];
+ char platformName[100];
+ unsigned int i;
+ void *handle = INVALID_HANDLE_VALUE;
+
+ if (init_once != 0)
+ return 0;
+ else
+ init_once = 1;
+ /*
+ * Have a look at the available platforms.
+ */
+ if( !gpu_info->isUserCreated )
+ {
+ status = clGetPlatformIDs( 0, NULL, &numPlatforms );
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: OpenCL device platform not found." );
+ return(1);
+ }
+
+ gpu_info->platform = NULL;
+ if( 0 < numPlatforms )
+ {
+ platforms = (cl_platform_id*)malloc(
+ numPlatforms * sizeof(cl_platform_id));
+ if( platforms == (cl_platform_id*)NULL )
+ {
+ return(1);
+ }
+ status = clGetPlatformIDs( numPlatforms, platforms, NULL );
+
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: Specific opencl platform not found." );
+ return(1);
+ }
+
+ for( i = 0; i < numPlatforms; i++ )
+ {
+ status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
+ sizeof(platformName), platformName,
+ NULL );
+
+ if( status != CL_SUCCESS )
+ {
+ continue;
+ }
+ gpu_info->platform = platforms[i];
+
+ if (!strcmp(platformName, "Advanced Micro Devices, Inc.") ||
+ !strcmp(platformName, "AMD"))
+ gpu_info->vendor = AMD;
+ else
+ gpu_info->vendor = others;
+
+ gpu_info->platform = platforms[i];
+
+ status = clGetDeviceIDs( gpu_info->platform /* platform */,
+ CL_DEVICE_TYPE_GPU /* device_type */,
+ 0 /* num_entries */,
+ NULL /* devices */,
+ &numDevices );
+
+ if( status != CL_SUCCESS )
+ {
+ continue;
+ }
+
+ if( numDevices )
+ break;
+
+ }
+ free( platforms );
+ }
+
+ if( NULL == gpu_info->platform )
+ {
+ hb_log( "OpenCL: No OpenCL-compatible GPU found." );
+ return(1);
+ }
+
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: No OpenCL-compatible GPU found." );
+ return(1);
+ }
+
+ /*
+ * Use available platform.
+ */
+ cps[0] = CL_CONTEXT_PLATFORM;
+ cps[1] = (cl_context_properties)gpu_info->platform;
+ cps[2] = 0;
+ /* Check for GPU. */
+ gpu_info->dType = CL_DEVICE_TYPE_GPU;
+ gpu_info->context = clCreateContextFromType(
+ cps, gpu_info->dType, NULL, NULL, &status );
+
+ if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )
+ {
+ gpu_info->dType = CL_DEVICE_TYPE_CPU;
+ gpu_info->context = clCreateContextFromType(
+ cps, gpu_info->dType, NULL, NULL, &status );
+ }
+
+ if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )
+ {
+ gpu_info->dType = CL_DEVICE_TYPE_DEFAULT;
+ gpu_info->context = clCreateContextFromType(
+ cps, gpu_info->dType, NULL, NULL, &status );
+ }
+
+ if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )
+ {
+ hb_log( "OpenCL: Unable to create opencl context." );
+ return(1);
+ }
+
+ /* Detect OpenCL devices. */
+ /* First, get the size of device list data */
+ status = clGetContextInfo( gpu_info->context, CL_CONTEXT_DEVICES,
+ 0, NULL, &length );
+ if((status != CL_SUCCESS) || (length == 0))
+ {
+ hb_log( "OpenCL: Unable to get the list of devices in context." );
+ return(1);
+ }
+
+ /* Now allocate memory for device list based on the size we got earlier */
+ gpu_info->devices = (cl_device_id*)malloc( length );
+ if( gpu_info->devices == (cl_device_id*)NULL )
+ {
+ return(1);
+ }
+
+ /* Now, get the device list data */
+ status = clGetContextInfo( gpu_info->context, CL_CONTEXT_DEVICES, length,
+ gpu_info->devices, NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: Unable to get the device list data in context." );
+ return(1);
+ }
+
+ /* Create OpenCL command queue. */
+ gpu_info->command_queue = clCreateCommandQueue( gpu_info->context,
+ gpu_info->devices[0],
+ 0, &status );
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: Unable to create opencl command queue." );
+ return(1);
+ }
+ }
+
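+ /*
+ * CL_QUEUE_THREAD_HANDLE_AMD is an AMD-specific extension: it exposes the
+ * native thread handle behind the command queue so that, on MinGW/Windows
+ * builds, its priority can be raised below.
+ */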
+ if( clGetCommandQueueInfo( gpu_info->command_queue,
+ CL_QUEUE_THREAD_HANDLE_AMD, sizeof(handle),
+ &handle, NULL ) == CL_SUCCESS && handle != INVALID_HANDLE_VALUE )
+ {
+#ifdef SYS_MINGW
+ SetThreadPriority( handle, THREAD_PRIORITY_TIME_CRITICAL );
+#endif
+ }
+
+ return 0;
+}
+
+
+/**
+ * hb_release_opencl_env
+ * @param gpu_info -
+ */
+int hb_release_opencl_env( GPUEnv *gpu_info )
+{
+ if( !isInited )
+ return 1;
+ int i;
+
+ for( i = 0; i<gpu_env.file_count; i++ )
+ {
+ if( gpu_env.programs[i] )
+ {
+ clReleaseProgram( gpu_env.programs[i] );
+ gpu_env.programs[i] = NULL;
+ }
+ }
+
+ if( gpu_env.command_queue )
+ {
+ clReleaseCommandQueue( gpu_env.command_queue );
+ gpu_env.command_queue = NULL;
+ }
+
+ if( gpu_env.context )
+ {
+ clReleaseContext( gpu_env.context );
+ gpu_env.context = NULL;
+ }
+
+ isInited = 0;
+ gpu_info->isUserCreated = 0;
+ return 1;
+}
+
+
+/**
+ * hb_register_kernel_wrapper
+ * @param kernel_name -
+ * @param function -
+ */
+int hb_register_kernel_wrapper( const char *kernel_name, cl_kernel_function function )
+{
+ int i;
+ for( i = 0; i < gpu_env.kernel_count; i++ )
+ {
+ if( strcasecmp( kernel_name, gpu_env.kernel_names[i] ) == 0 )
+ {
+ gpu_env.kernel_functions[i] = function;
+ return(1);
+ }
+ }
+ return(0);
+}
+
+/**
+ * hb_cached_of_kerner_prg
+ * @param gpu_env -
+ * @param cl_file_name -
+ */
+int hb_cached_of_kerner_prg( const GPUEnv *gpu_env, const char * cl_file_name )
+{
+ int i;
+ for( i = 0; i < gpu_env->file_count; i++ )
+ {
+ if( strcasecmp( gpu_env->kernelSrcFile[i], cl_file_name ) == 0 )
+ {
+ if( gpu_env->programs[i] != NULL )
+ return(1);
+ }
+ }
+
+ return(0);
+}
+
+/**
+ * hb_compile_kernel_file
+ * @param filename -
+ * @param gpu_info -
+ * @param indx -
+ * @param build_option -
+ */
+int hb_compile_kernel_file( const char *filename, GPUEnv *gpu_info,
+ int indx, const char *build_option )
+{
+ cl_int status;
+ size_t length;
+ char *source_str;
+ const char *source;
+ size_t source_size[1];
+ char *buildLog = NULL;
+ int b_error, binary_status, binaryExisted;
+ char * binary;
+ cl_uint numDevices;
+ cl_device_id *devices;
+ FILE * fd;
+ FILE * fd1;
+ int idx;
+
+ if( hb_cached_of_kerner_prg( gpu_info, filename ) == 1 )
+ return (1);
+
+ idx = gpu_info->file_count;
+
+#ifdef USE_EXTERNAL_KERNEL
+ status = hb_convert_to_string( filename, &source_str, gpu_info, idx );
+ if( status == 0 )
+ return(0);
+#else
+ int kernel_src_size = strlen(kernel_src_scale) + strlen(kernel_src_yadif_filter);
+
+// char *scale_src;
+// status = hb_convert_to_string("./scale_kernels.cl", &scale_src, gpu_info, idx);
+// if (status != 0)
+// kernel_src_size += strlen(scale_src);
+
+ source_str = (char*)malloc( kernel_src_size + 2 );
+ strcpy( source_str, kernel_src_scale );
+// strcat( source_str, scale_src ); //
+ strcat( source_str, kernel_src_yadif_filter );
+#endif
+
+ source = source_str;
+ source_size[0] = strlen( source );
+
+ if ((binaryExisted = hb_binary_generated(gpu_info->context, filename, &fd)) == 1)
+ {
+ status = clGetContextInfo(gpu_info->context,
+ CL_CONTEXT_NUM_DEVICES,
+ sizeof(numDevices),
+ &numDevices,
+ NULL);
+ if (status != CL_SUCCESS)
+ {
+ hb_log("OpenCL: Unable to get the number of devices in context.");
+ return 0;
+ }
+
+ devices = (cl_device_id*)malloc(sizeof(cl_device_id) * numDevices);
+ if (devices == NULL)
+ return 0;
+
+ length = 0;
+ b_error = 0;
+ b_error |= fseek(fd, 0, SEEK_END) < 0;
+ b_error |= (length = ftell(fd)) <= 0;
+ b_error |= fseek(fd, 0, SEEK_SET) < 0;
+ if (b_error)
+ return 0;
+
+ binary = (char*)calloc(length + 2, sizeof(char));
+ if (binary == NULL)
+ return 0;
+
+ b_error |= fread(binary, 1, length, fd) != length;
+#if 0 // this doesn't work under OS X and/or with some non-AMD GPUs
+ if (binary[length-1] != '\n')
+ binary[length++] = '\n';
+#endif
+
+ if (b_error)
+ return 0;
+
+ /* grab the handles to all of the devices in the context. */
+ status = clGetContextInfo(gpu_info->context,
+ CL_CONTEXT_DEVICES,
+ sizeof(cl_device_id) * numDevices,
+ devices,
+ NULL);
+
+ gpu_info->programs[idx] = clCreateProgramWithBinary(gpu_info->context,
+ numDevices,
+ devices,
+ &length,
+ (const unsigned char**)&binary,
+ &binary_status,
+ &status);
+
+ fclose(fd);
+ free(devices);
+ fd = NULL;
+ devices = NULL;
+ }
+ else
+ {
+ /* create a CL program using the kernel source */
+ gpu_info->programs[idx] = clCreateProgramWithSource(
+ gpu_info->context, 1, &source, source_size, &status );
+ }
+
+ if((gpu_info->programs[idx] == (cl_program)NULL) || (status != CL_SUCCESS)){
+ hb_log( "OpenCL: Unable to get list of devices in context." );
+ return(0);
+ }
+
+ /* create a cl program executable for all the devices specified */
+ if( !gpu_info->isUserCreated )
+ {
+ status = clBuildProgram( gpu_info->programs[idx], 1, gpu_info->devices,
+ build_option, NULL, NULL );
+ }
+ else
+ {
+ status = clBuildProgram( gpu_info->programs[idx], 1, &(gpu_info->dev),
+ build_option, NULL, NULL );
+ }
+
+ if( status != CL_SUCCESS )
+ {
+ if( !gpu_info->isUserCreated )
+ {
+ status = clGetProgramBuildInfo( gpu_info->programs[idx],
+ gpu_info->devices[0],
+ CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
+ }
+ else
+ {
+ status = clGetProgramBuildInfo( gpu_info->programs[idx],
+ gpu_info->dev,
+ CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
+ }
+
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: Unable to get GPU build information." );
+ return(0);
+ }
+
+ buildLog = (char*)malloc( length );
+ if( buildLog == (char*)NULL )
+ {
+ return(0);
+ }
+
+ if( !gpu_info->isUserCreated )
+ {
+ status = clGetProgramBuildInfo( gpu_info->programs[idx], gpu_info->devices[0],
+ CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
+ }
+ else
+ {
+ status = clGetProgramBuildInfo( gpu_info->programs[idx], gpu_info->dev,
+ CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
+ }
+
+ fd1 = fopen( "kernel-build.log", "w+" );
+ if( fd1 != NULL ) {
+ fwrite( buildLog, sizeof(char), length, fd1 );
+ fclose( fd1 );
+ }
+
+ free( buildLog );
+ return(0);
+ }
+
+ strcpy( gpu_env.kernelSrcFile[idx], filename );
+
+ if (binaryExisted != 1)
+ {
+ //hb_generat_bin_from_kernel_source(gpu_env.programs[idx], filename);
+ }
+
+ gpu_info->file_count += 1;
+
+ return(1);
+}
+
+
+/**
+ * hb_get_kernel_env_and_func
+ * @param kernel_name -
+ * @param env -
+ * @param function -
+ */
+int hb_get_kernel_env_and_func( const char *kernel_name,
+ KernelEnv *env,
+ cl_kernel_function *function )
+{
+ int i;
+ for( i = 0; i < gpu_env.kernel_count; i++ )
+ {
+ if( strcasecmp( kernel_name, gpu_env.kernel_names[i] ) == 0 )
+ {
+ env->context = gpu_env.context;
+ env->command_queue = gpu_env.command_queue;
+ env->program = gpu_env.programs[0];
+ env->kernel = gpu_env.kernels[i];
+ env->isAMD = ( gpu_env.vendor == AMD ) ? 1 : 0;
+ *function = gpu_env.kernel_functions[i];
+ return(1);
+ }
+ }
+ return(0);
+}
+
+/**
+ * hb_run_kernel
+ * @param kernel_name -
+ * @param userdata -
+ */
+int hb_run_kernel( const char *kernel_name, void **userdata )
+{
+ KernelEnv env;
+ cl_kernel_function function;
+ int status;
+ memset( &env, 0, sizeof(KernelEnv));
+ status = hb_get_kernel_env_and_func( kernel_name, &env, &function );
+ strcpy( env.kernel_name, kernel_name );
+ if( status == 1 )
+ {
+ return(function( userdata, &env ));
+ }
+
+ return(0);
+}
+
+/**
+ * hb_init_opencl_run_env
+ * @param argc -
+ * @param argv -
+ * @param build_option -
+ */
+int hb_init_opencl_run_env( int argc, char **argv, const char *build_option )
+{
+ int status = 0;
+ if( MAX_CLKERNEL_NUM <= 0 )
+ {
+ return 1;
+ }
+
+ if((argc > MAX_CLFILE_NUM) || (argc<0))
+ {
+ return 1;
+ }
+
+ if( !isInited )
+ {
+ hb_regist_opencl_kernel();
+
+ /*initialize devices, context, command_queue*/
+ status = hb_init_opencl_env( &gpu_env );
+ if( status )
+ return(1);
+
+ /*initialize program, kernel_name, kernel_count*/
+ status = hb_compile_kernel_file("hb-opencl-kernels.cl",
+ &gpu_env, 0, build_option);
+
+ if( status == 0 || gpu_env.kernel_count == 0 )
+ {
+ return(1);
+ }
+
+ useBuffers = 1;
+ isInited = 1;
+ }
+
+ return(0);
+}
+
+/**
+ * hb_release_opencl_run_env
+ */
+int hb_release_opencl_run_env()
+{
+ return hb_release_opencl_env( &gpu_env );
+}
+
+/**
+ * hb_opencl_stats
+ */
+int hb_opencl_stats()
+{
+ return isInited;
+}
+
+/**
+ * hb_get_opencl_env
+ */
+int hb_get_opencl_env()
+{
+ int i = 0;
+ cl_int status;
+ cl_uint numDevices;
+ cl_device_id *devices;
+
+ /*initialize devices, context, command_queue*/
+ status = hb_init_opencl_env( &gpu_env );
+ if( status )
+ return(1);
+ status = clGetContextInfo( gpu_env.context,
+ CL_CONTEXT_NUM_DEVICES,
+ sizeof(numDevices),
+ &numDevices,
+ NULL );
+ if( status != CL_SUCCESS )
+ return 0;
+
+ devices = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );
+ if( devices == NULL )
+ return 0;
+
+ /* grab the handles to all of the devices in the context. */
+ status = clGetContextInfo( gpu_env.context,
+ CL_CONTEXT_DEVICES,
+ sizeof(cl_device_id) * numDevices,
+ devices,
+ NULL );
+
+ for (i = 0; i < numDevices; i++)
+ {
+ if (devices[i] != NULL)
+ {
+ char deviceVendor[100], deviceName[1024], driverVersion[1024];
+ clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(deviceVendor),
+ deviceVendor, NULL);
+ clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(deviceName),
+ deviceName, NULL);
+ clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(driverVersion),
+ driverVersion, NULL);
+ hb_log("hb_get_opencl_env: GPU #%d, Device Vendor: %s", i + 1, deviceVendor);
+ hb_log("hb_get_opencl_env: GPU #%d, Device Name: %s", i + 1, deviceName);
+ hb_log("hb_get_opencl_env: GPU #%d, Driver Version: %s", i + 1, driverVersion);
+ }
+ }
+
+ if( devices != NULL )
+ {
+ free( devices );
+ devices = NULL;
+ }
+
+ return status;
+}
+
+/**
+ * hb_create_buffer
+ * @param cl_inBuf -
+ * @param flags -
+ * @param size -
+ */
+int hb_create_buffer( cl_mem *cl_Buf, int flags, int size )
+{
+ int status;
+ *cl_Buf = clCreateBuffer( gpu_env.context, (flags), (size), NULL, &status );
+
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: clCreateBuffer error '%d'", status );
+ return 0;
+ }
+
+ return 1;
+}
+
+
+/**
+ * hb_read_opencl_buffer
+ * @param cl_inBuf -
+ * @param outbuf -
+ * @param size -
+ */
+int hb_read_opencl_buffer( cl_mem cl_inBuf, unsigned char *outbuf, int size )
+{
+ int status;
+
+ status = clEnqueueReadBuffer( gpu_env.command_queue, cl_inBuf, CL_TRUE, 0, size, outbuf, 0, 0, 0 );
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: av_read_opencl_buffer error '%d'", status );
+ return 0;
+ }
+
+ return 1;
+}
+
+int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size)
+{
+ int status;
+ int flags = CL_MEM_ALLOC_HOST_PTR;
+ //cl_event event;
+ *mem = clCreateBuffer(gpu_env.context, flags, size, NULL, &status);
+ *addr = clEnqueueMapBuffer(gpu_env.command_queue, *mem, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL/*&event*/, &status);
+
+ //hb_log("\t **** context: %.8x cmdqueue: %.8x cl_mem: %.8x mapaddr: %.8x size: %d status: %d", gpu_env.context, gpu_env.command_queue, mem, addr, size, status);
+
+ return (status == CL_SUCCESS) ? 1 : 0;
+}
+
+int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr)
+{
+ cl_event event;
+ int status = clEnqueueUnmapMemObject(gpu_env.command_queue, mem, addr, 0, NULL, &event);
+ if (status == CL_SUCCESS)
+ clWaitForEvents(1, &event);
+ else
+ hb_log("hb_free_mapped_buffer: error %d", status);
+ return (status == CL_SUCCESS) ? 1 : 0;
+}
+
+void hb_opencl_init()
+{
+ hb_get_opencl_env();
+}
+
+int hb_use_buffers()
+{
+ return useBuffers;
+}
+
+int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb)
+{
+ int status = clEnqueueCopyBuffer(gpu_env.command_queue,
+ src_buffer,
+ dst_buffer,
+ src_offset, dst_offset, cb,
+ 0, 0, 0);
+ if( status != CL_SUCCESS )
+ {
+ av_log(NULL,AV_LOG_ERROR, "hb_copy_buffer error '%d'\n", status );
+ return 0;
+ }
+ return 1;
+}
+
+int hb_read_opencl_frame_buffer(cl_mem cl_inBuf,unsigned char *Ybuf,unsigned char *Ubuf,unsigned char *Vbuf,int linesize0,int linesize1,int linesize2,int height)
+{
+
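+ /* -(-height >> 1) rounds up, i.e. chrH = ceil(height / 2): the chroma plane
+ * height for half-height (4:2:0 style) chroma even when height is odd. */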
+ int chrH = -(-height >> 1);
+ unsigned char *temp = (unsigned char *)av_malloc(sizeof(uint8_t) * (linesize0 * height + linesize1 * chrH * 2));
+ if(hb_read_opencl_buffer(cl_inBuf,temp,sizeof(uint8_t)*(linesize0 + linesize1)*height))
+ {
+ memcpy(Ybuf,temp,linesize0 * height);
+ memcpy(Ubuf,temp + linesize0 * height,linesize1 *chrH);
+ memcpy(Vbuf,temp + linesize0 * height + linesize1 * chrH,linesize2 * chrH);
+
+ }
+ av_free(temp);
+
+ return 1;
+}
+
+int hb_write_opencl_frame_buffer(cl_mem cl_inBuf,unsigned char *Ybuf,unsigned char *Ubuf,unsigned char *Vbuf,int linesize0,int linesize1,int linesize2,int height,int offset)
+{
+ int status;
+ void *mapped = clEnqueueMapBuffer( gpu_env.command_queue, cl_inBuf, CL_TRUE,CL_MAP_WRITE, 0, sizeof(uint8_t) * (linesize0 + linesize1)*height + offset, 0, NULL, NULL, NULL );
+ uint8_t *temp = (uint8_t *)mapped;
+ temp += offset;
+ memcpy(temp,Ybuf,sizeof(uint8_t) * linesize0 * height);
+ memcpy(temp + sizeof(uint8_t) * linesize0 * height,Ubuf,sizeof(uint8_t) * linesize1 * height/2);
+ memcpy(temp + sizeof(uint8_t) * (linesize0 * height + linesize1 * height/2),Vbuf,sizeof(uint8_t) * linesize2 * height/2);
+ clEnqueueUnmapMemObject(gpu_env.command_queue, cl_inBuf, mapped, 0, NULL, NULL );
+ return 1;
+}
+
+cl_command_queue hb_get_command_queue()
+{
+ return gpu_env.command_queue;
+}
+
+cl_context hb_get_context()
+{
+ return gpu_env.context;
+}
+#endif
diff --git a/libhb/openclwrapper.h b/libhb/openclwrapper.h new file mode 100644 index 000000000..8436c3a9c --- /dev/null +++ b/libhb/openclwrapper.h @@ -0,0 +1,89 @@ +/* openclwrapper.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+
+ */
+#ifndef __OPENCL_WRAPPER_H
+#define __OPENCL_WRAPPER_H
+#ifdef USE_OPENCL
+#include "common.h"
+
+// AMD OpenCL extensions
+#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
+#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
+
+typedef struct _KernelEnv
+{
+ cl_context context;
+ cl_command_queue command_queue;
+ cl_program program;
+ cl_kernel kernel;
+ char kernel_name[150];
+ int isAMD;
+}KernelEnv;
+
+typedef struct _OpenCLEnv
+{
+ cl_platform_id platform;
+ cl_context context;
+ cl_device_id devices;
+ cl_command_queue command_queue;
+}OpenCLEnv;
+
+
+// User-defined function wrapper: it sets the kernel's input parameters,
+// launches the kernel, and copies data between the GPU and the CPU.
+typedef int (*cl_kernel_function)( void **userdata, KernelEnv *kenv );
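+//
+// Illustrative sketch only (not part of this patch): a wrapper of this type
+// typically sets the kernel arguments from userdata and enqueues the kernel.
+// The wrapper name, argument layout, and work size below are hypothetical.
+//
+//     static int example_wrapper( void **userdata, KernelEnv *kenv )
+//     {
+//         cl_mem in  = (cl_mem)userdata[0];
+//         cl_mem out = (cl_mem)userdata[1];
+//         size_t global[2] = { 1920, 1080 };
+//         clSetKernelArg( kenv->kernel, 0, sizeof(cl_mem), &in );
+//         clSetKernelArg( kenv->kernel, 1, sizeof(cl_mem), &out );
+//         return clEnqueueNDRangeKernel( kenv->command_queue, kenv->kernel, 2,
+//                                        NULL, global, NULL, 0, NULL, NULL ) == CL_SUCCESS;
+//     }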
+
+// register a wrapper for running the kernel specified by the kernel name
+int hb_register_kernel_wrapper( const char *kernel_name, cl_kernel_function function );
+
+// run a kernel; the caller uses this function to launch a kernel.
+// kernel_name: used to look up the kernel in the OpenCL runtime environment
+// userdata: all parameters needed to run the kernel specified by kernel_name
+int hb_run_kernel( const char *kernel_name, void **userdata );
+
+// initialize the runtime environment; this must be called before any other OpenCL-related function.
+// argc must be 0 and argv must be NULL; build_option holds the options used to build the kernels.
+int hb_init_opencl_run_env( int argc, char **argv, const char *build_option );
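+//
+// Possible call order (illustrative sketch, not part of this patch; the kernel
+// name and wrapper are hypothetical and must match what the runtime registers):
+//
+//     hb_register_kernel_wrapper( "frame_scale", example_wrapper );
+//     if( hb_init_opencl_run_env( 0, NULL, "-I." ) == 0 )
+//     {
+//         void *userdata[2] = { in_buf, out_buf };
+//         hb_run_kernel( "frame_scale", userdata );
+//         hb_release_opencl_run_env();
+//     }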
+
+// release all OpenCL resources; this must be called after all other OpenCL-related functions.
+int hb_release_opencl_run_env();
+
+// get the OpenCL status (0: not initialized, 1: initialized); used to check whether the OpenCL runtime has been created
+int hb_opencl_stats();
+
+// update the OpenCL runtime environment: command queue, platform, context, and program
+int hb_init_opencl_attr( OpenCLEnv * env );
+
+// create a kernel object by name on the OpenCL runtime indicated by the env parameter
+int hb_create_kernel( char * kernelname, KernelEnv * env );
+
+// release a kernel object created by hb_create_kernel
+int hb_release_kernel( KernelEnv * env );
+
+void hb_opencl_init();
+
+int hb_get_opencl_env();
+
+int hb_create_buffer(cl_mem *cl_Buf,int flags,int size);
+
+int hb_read_opencl_buffer(cl_mem cl_inBuf,unsigned char *outbuf,int size);
+
+int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size);
+
+int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr);
+
+int hb_use_buffers();
+
+int hb_confirm_gpu_type();
+#endif
+#endif
diff --git a/libhb/scale.c b/libhb/scale.c new file mode 100644 index 000000000..9dcff28ac --- /dev/null +++ b/libhb/scale.c @@ -0,0 +1,1020 @@ +/* scale.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+
+ */
+
+#ifdef USE_OPENCL
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "hb.h"
+#include "scale.h"
+#include "scale_kernel.h"
+#include "libavutil/pixdesc.h"
+
+#define isScaleRGBinInt(x) \
+ ( \
+ (x)==AV_PIX_FMT_RGB48BE || \
+ (x)==AV_PIX_FMT_RGB48LE || \
+ (x)==AV_PIX_FMT_RGB32 || \
+ (x)==AV_PIX_FMT_RGB32_1 || \
+ (x)==AV_PIX_FMT_RGB24 || \
+ (x)==AV_PIX_FMT_RGB565BE || \
+ (x)==AV_PIX_FMT_RGB565LE || \
+ (x)==AV_PIX_FMT_RGB555BE || \
+ (x)==AV_PIX_FMT_RGB555LE || \
+ (x)==AV_PIX_FMT_RGB444BE || \
+ (x)==AV_PIX_FMT_RGB444LE || \
+ (x)==AV_PIX_FMT_RGB8 || \
+ (x)==AV_PIX_FMT_RGB4 || \
+ (x)==AV_PIX_FMT_RGB4_BYTE || \
+ (x)==AV_PIX_FMT_MONOBLACK || \
+ (x)==AV_PIX_FMT_MONOWHITE \
+ )
+#define isScaleBGRinInt(x) \
+ ( \
+ (x)==AV_PIX_FMT_BGR48BE || \
+ (x)==AV_PIX_FMT_BGR48LE || \
+ (x)==AV_PIX_FMT_BGR32 || \
+ (x)==AV_PIX_FMT_BGR32_1 || \
+ (x)==AV_PIX_FMT_BGR24 || \
+ (x)==AV_PIX_FMT_BGR565BE || \
+ (x)==AV_PIX_FMT_BGR565LE || \
+ (x)==AV_PIX_FMT_BGR555BE || \
+ (x)==AV_PIX_FMT_BGR555LE || \
+ (x)==AV_PIX_FMT_BGR444BE || \
+ (x)==AV_PIX_FMT_BGR444LE || \
+ (x)==AV_PIX_FMT_BGR8 || \
+ (x)==AV_PIX_FMT_BGR4 || \
+ (x)==AV_PIX_FMT_BGR4_BYTE|| \
+ (x)==AV_PIX_FMT_MONOBLACK|| \
+ (x)==AV_PIX_FMT_MONOWHITE \
+ )
+
+#define isScaleAnyRGB(x) \
+ ( \
+ isScaleRGBinInt(x) || \
+ isScaleBGRinInt(x) \
+ )
+
+#define isScaleGray(x) \
+ ((x) == AV_PIX_FMT_GRAY8 || \
+ (x) == AV_PIX_FMT_Y400A || \
+ (x) == AV_PIX_FMT_GRAY16BE || \
+ (x) == AV_PIX_FMT_GRAY16LE)
+
+static ScaleContext *g_scale;
+
+static double getScaleSplineCoeff(double a, double b, double c, double d, double dist)
+{
+ if (dist <= 1.0)
+ return ((d * dist + c) * dist + b) * dist + a;
+ else
+ return getScaleSplineCoeff(0.0,
+ b + 2.0 * c + 3.0 * d,
+ c + 3.0 * d,
+ -b - 3.0 * c - 6.0 * d,
+ dist - 1.0);
+}
+
+static int initScaleFilter(int16_t **outFilter, int32_t **filterPos,
+ int *outFilterSize, int xInc, int srcW, int dstW,
+ int filterAlign, int one, int flags, int cpu_flags,
+ ScaleVector *srcFilter, ScaleVector *dstFilter,
+ double param[2])
+{
+ int i;
+ int filterSize;
+ int filter2Size;
+ int minFilterSize;
+ int64_t *filter = NULL;
+ int64_t *filter2 = NULL;
+ const int64_t fone = 1LL << 54;
+ int ret = -1;
+
+ *filterPos = (int32_t *)av_malloc((dstW + 3) * sizeof(**filterPos));
+ if (*filterPos == NULL && ((dstW + 3) * sizeof(**filterPos)) != 0)
+ {
+ hb_log("Cannot allocate memory.");
+ goto fail;
+ }
+
+ if (FFABS(xInc - 0x10000) < 10)
+ { // unscaled
+ int i;
+ filterSize = 1;
+ filter = (int64_t *)av_mallocz(dstW * sizeof(*filter) * filterSize);
+ if (filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)
+ {
+ hb_log("Cannot allocate memory.");
+ goto fail;
+ }
+
+ for (i = 0; i < dstW; i++)
+ {
+ filter[i * filterSize] = fone;
+ (*filterPos)[i] = i;
+ }
+ }
+ else if (flags & SWS_POINT)
+ { // lame looking point sampling mode
+ int i;
+ int64_t xDstInSrc;
+ filterSize = 1;
+ filter = (int64_t *)av_malloc(dstW * sizeof(*filter) * filterSize);
+ if(filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)
+ {
+ hb_log("Cannot allocate memory.");
+ goto fail;
+ }
+
+ xDstInSrc = xInc / 2 - 0x8000;
+ for (i = 0; i < dstW; i++)
+ {
+ int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;
+
+ (*filterPos)[i] = xx;
+ filter[i] = fone;
+ xDstInSrc += xInc;
+ }
+ }
+ else if ((xInc <= (1 << 16) && (flags & SWS_AREA)) ||
+ (flags & SWS_FAST_BILINEAR))
+ { // bilinear upscale
+ int i;
+ int64_t xDstInSrc;
+ filterSize = 2;
+ filter = (int64_t *)av_malloc(dstW * sizeof(*filter) * filterSize);
+ if(filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)
+ {
+ hb_log("Cannot allocate memory.");
+ goto fail;
+ }
+
+ xDstInSrc = xInc / 2 - 0x8000;
+ for (i = 0; i < dstW; i++)
+ {
+ int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;
+ int j;
+
+ (*filterPos)[i] = xx;
+ // bilinear upscale / linear interpolate / area averaging
+ for (j = 0; j < filterSize; j++)
+ {
+ int64_t coeff= fone - FFABS(((int64_t)xx<<16) - xDstInSrc)*(fone>>16);
+ if (coeff < 0)
+ coeff = 0;
+ filter[i * filterSize + j] = coeff;
+ xx++;
+ }
+ xDstInSrc += xInc;
+ }
+ }
+ else
+ {
+ int64_t xDstInSrc;
+ int sizeFactor;
+
+ if (flags & SWS_BICUBIC)
+ sizeFactor = 4;
+ else if (flags & SWS_X)
+ sizeFactor = 8;
+ else if (flags & SWS_AREA)
+ sizeFactor = 1; // downscale only, for upscale it is bilinear
+ else if (flags & SWS_GAUSS)
+ sizeFactor = 8; // infinite ;)
+ else if (flags & SWS_LANCZOS)
+ sizeFactor = param[0] != SWS_PARAM_DEFAULT ? ceil(2 * param[0]) : 6;
+ else if (flags & SWS_SINC)
+ sizeFactor = 20; // infinite ;)
+ else if (flags & SWS_SPLINE)
+ sizeFactor = 20; // infinite ;)
+ else if (flags & SWS_BILINEAR)
+ sizeFactor = 2;
+ else
+ {
+ sizeFactor = 0; // GCC warning killer
+ assert(0);
+ }
+
+ if (xInc <= 1 << 16)
+ filterSize = 1 + sizeFactor; // upscale
+ else
+ filterSize = 1 + (sizeFactor * srcW + dstW - 1) / dstW;
+
+
+ filterSize = FFMIN(filterSize, srcW - 2);
+ filterSize = FFMAX(filterSize, 1);
+
+ filter = (int64_t *)av_malloc(dstW * sizeof(*filter) * filterSize);
+ if(filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)
+ {
+ hb_log("Cannot allocate memory.");
+ goto fail;
+ }
+
+ xDstInSrc = xInc - 0x10000;
+ for (i = 0; i < dstW; i++)
+ {
+ int xx = (xDstInSrc - ((filterSize - 2) << 16)) / (1 << 17);
+ int j;
+ (*filterPos)[i] = xx;
+ for (j = 0; j < filterSize; j++)
+ {
+ int64_t d = (FFABS(((int64_t)xx << 17) - xDstInSrc)) << 13;
+ double floatd;
+ int64_t coeff;
+
+ if (xInc > 1 << 16)
+ d = d * dstW / srcW;
+ floatd = d * (1.0 / (1 << 30));
+
+ if (flags & SWS_BICUBIC)
+ {
+ int64_t B = (param[0] != SWS_PARAM_DEFAULT ? param[0] : 0) * (1 << 24);
+ int64_t C = (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1 << 24);
+
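+ /* Standard two-parameter (B, C) cubic (Mitchell-Netravali family),
+ * evaluated in fixed point; the usual 1/6 factor is folded into the
+ * later normalization of the filter:
+ * |x| < 1:      (12-9B-6C)|x|^3 + (-18+12B+6C)|x|^2 + (6-2B)
+ * 1 <= |x| < 2: (-B-6C)|x|^3 + (6B+30C)|x|^2 + (-12B-48C)|x| + (8B+24C) */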
+ if (d >= 1LL << 31)
+ {
+ coeff = 0.0;
+ }
+ else
+ {
+ int64_t dd = (d * d) >> 30;
+ int64_t ddd = (dd * d) >> 30;
+
+ if (d < 1LL << 30)
+ coeff = (12 * (1 << 24) - 9 * B - 6 * C) * ddd +
+ (-18 * (1 << 24) + 12 * B + 6 * C) * dd +
+ (6 * (1 << 24) - 2 * B) * (1 << 30);
+ else
+ coeff = (-B - 6 * C) * ddd +
+ (6 * B + 30 * C) * dd +
+ (-12 * B - 48 * C) * d +
+ (8 * B + 24 * C) * (1 << 30);
+ }
+ coeff *= fone >> (30 + 24);
+ }
+#if 0
+ else if (flags & SWS_X)
+ {
+ double p = param ? param * 0.01 : 0.3;
+ coeff = d ? sin(d * M_PI) / (d * M_PI) : 1.0;
+ coeff *= pow (2.0, -p * d * d);
+ }
+#endif
+ else if (flags & SWS_X)
+ {
+ double A = param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;
+ double c;
+
+ if (floatd < 1.0)
+ c = cos(floatd * M_PI);
+ else
+ c = -1.0;
+ if (c < 0.0)
+ c = -pow(-c, A);
+ else
+ c = pow(c, A);
+ coeff = (c * 0.5 + 0.5) * fone;
+ }
+ else if (flags & SWS_AREA)
+ {
+ int64_t d2 = d - (1 << 29);
+ if (d2 * xInc < -(1LL << (29 + 16)))
+ coeff = 1.0 * (1LL << (30 + 16));
+ else if (d2 * xInc < (1LL << (29 + 16)))
+ coeff = -d2 * xInc + (1LL << (29 + 16));
+ else
+ coeff = 0.0;
+ coeff *= fone >> (30 + 16);
+ }
+ else if (flags & SWS_GAUSS)
+ {
+ double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
+ coeff = (pow(2.0, -p * floatd * floatd)) * fone;
+ }
+ else if (flags & SWS_SINC)
+ {
+ coeff = (d ? sin(floatd * M_PI) / (floatd * M_PI) : 1.0) * fone;
+ }
+ else if (flags & SWS_LANCZOS)
+ {
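+ /* Lanczos window of support p (default 3):
+ * sinc(x) * sinc(x/p) for |x| < p, zero outside,
+ * with sinc(x) = sin(pi*x) / (pi*x). */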
+ double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
+ coeff = (d ? sin(floatd * M_PI) * sin(floatd * M_PI / p) /
+ (floatd * floatd * M_PI * M_PI / p) : 1.0) * fone;
+ if (floatd > p)
+ coeff = 0;
+ }
+ else if (flags & SWS_BILINEAR)
+ {
+ coeff = (1 << 30) - d;
+ if (coeff < 0)
+ coeff = 0;
+ coeff *= fone >> 30;
+ }
+ else if (flags & SWS_SPLINE)
+ {
+ double p = -2.196152422706632;
+ coeff = getScaleSplineCoeff(1.0, 0.0, p, -p - 1.0, floatd) * fone;
+ }
+ else
+ {
+ coeff = 0.0; // GCC warning killer
+ assert(0);
+ }
+
+ filter[i * filterSize + j] = coeff;
+ xx++;
+ }
+ xDstInSrc += 2 * xInc;
+ }
+ }
+
+ assert(filterSize > 0);
+ filter2Size = filterSize;
+ if (srcFilter)
+ filter2Size += srcFilter->length - 1;
+ if (dstFilter)
+ filter2Size += dstFilter->length - 1;
+ assert(filter2Size > 0);
+ filter2 = (int64_t *)av_mallocz(filter2Size * dstW * sizeof(*filter2));
+ if (filter2 == NULL && (filter2Size * dstW * sizeof(*filter2)) != 0)
+ {
+ hb_log("Can't alloc memory.");
+ goto fail;
+ }
+
+ for (i = 0; i < dstW; i++)
+ {
+ int j, k;
+
+ if (srcFilter)
+ {
+ for (k = 0; k < srcFilter->length; k++)
+ {
+ for (j = 0; j < filterSize; j++)
+ filter2[i * filter2Size + k + j] +=
+ srcFilter->coeff[k] * filter[i * filterSize + j];
+ }
+ }
+ else
+ {
+ for (j = 0; j < filterSize; j++)
+ filter2[i * filter2Size + j] = filter[i * filterSize + j];
+ }
+ // FIXME dstFilter
+
+ (*filterPos)[i] += (filterSize - 1) / 2 - (filter2Size - 1) / 2;
+ }
+ av_freep(&filter);
+
+ // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
+ minFilterSize = 0;
+ for (i = dstW - 1; i >= 0; i--)
+ {
+ int min = filter2Size;
+ int j;
+ int64_t cutOff = 0.0;
+
+ for (j = 0; j < filter2Size; j++)
+ {
+ int k;
+ cutOff += FFABS(filter2[i * filter2Size]);
+
+ if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)
+ break;
+
+ if (i < dstW - 1 && (*filterPos)[i] >= (*filterPos)[i + 1])
+ break;
+
+ // move filter coefficients left
+ for (k = 1; k < filter2Size; k++)
+ filter2[i * filter2Size + k - 1] = filter2[i * filter2Size + k];
+ filter2[i * filter2Size + k - 1] = 0;
+ (*filterPos)[i]++;
+ }
+
+ cutOff = 0;
+ for (j = filter2Size - 1; j > 0; j--)
+ {
+ cutOff += FFABS(filter2[i * filter2Size + j]);
+
+ if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)
+ break;
+ min--;
+ }
+
+ if (min > minFilterSize)
+ minFilterSize = min;
+ }
+
+
+ assert(minFilterSize > 0);
+ filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));
+ assert(filterSize > 0);
+ filter = (int64_t *)av_malloc(filterSize * dstW * sizeof(*filter));
+ if (filterSize >= MAX_FILTER_SIZE * 16 /
+ ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter)
+ goto fail;
+ *outFilterSize = filterSize;
+
+ if (flags & SWS_PRINT_INFO)
+ hb_log("SwScaler: reducing / aligning filtersize %d -> %d",filter2Size,filterSize);
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+
+ for (j = 0; j < filterSize; j++)
+ {
+ if (j >= filter2Size)
+ filter[i * filterSize + j] = 0;
+ else
+ filter[i * filterSize + j] = filter2[i * filter2Size + j];
+ if ((flags & SWS_BITEXACT) && j >= minFilterSize)
+ filter[i * filterSize + j] = 0;
+ }
+ }
+
+ // FIXME try to align filterPos if possible
+
+ // fix borders
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ if ((*filterPos)[i] < 0)
+ {
+ // move filter coefficients left to compensate for filterPos
+ for (j = 1; j < filterSize; j++)
+ {
+ int left = FFMAX(j + (*filterPos)[i], 0);
+ filter[i * filterSize + left] += filter[i * filterSize + j];
+ filter[i * filterSize + j] = 0;
+ }
+ (*filterPos)[i]= 0;
+ }
+
+ if ((*filterPos)[i] + filterSize > srcW)
+ {
+ int shift = (*filterPos)[i] + filterSize - srcW;
+ // move filter coefficients right to compensate for filterPos
+ for (j = filterSize - 2; j >= 0; j--)
+ {
+ int right = FFMIN(j + shift, filterSize - 1);
+ filter[i * filterSize + right] += filter[i * filterSize + j];
+ filter[i * filterSize + j] = 0;
+ }
+ (*filterPos)[i]= srcW - filterSize;
+ }
+ }
+
+ // Note the +1 is for the MMX scaler which reads over the end
+ // FF_ALLOCZ_OR_GOTO(NULL, *outFilter,
+ // *outFilterSize * (dstW + 3) * sizeof(int16_t), fail);
+ *outFilter = (int16_t *)av_mallocz(*outFilterSize * (dstW + 3) * sizeof(int16_t));
+ if( *outFilter == NULL && (*outFilterSize * (dstW + 3) * sizeof(int16_t)) != 0)
+ {
+ hb_log("Can't alloc memory");
+ goto fail;
+ }
+
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ int64_t error = 0;
+ int64_t sum = 0;
+
+ for (j = 0; j < filterSize; j++)
+ {
+ sum += filter[i * filterSize + j];
+ }
+ sum = (sum + one / 2) / one;
+ for (j = 0; j < *outFilterSize; j++)
+ {
+ int64_t v = filter[i * filterSize + j] + error;
+ int intV = ROUNDED_DIV(v, sum);
+ (*outFilter)[i * (*outFilterSize) + j] = intV;
+ error = v - intV * sum;
+ }
+ }
+
+ (*filterPos)[dstW + 0] =
+ (*filterPos)[dstW + 1] =
+ (*filterPos)[dstW + 2] = (*filterPos)[dstW - 1];
+ for (i = 0; i < *outFilterSize; i++)
+ {
+ int k = (dstW - 1) * (*outFilterSize) + i;
+ (*outFilter)[k + 1 * (*outFilterSize)] =
+ (*outFilter)[k + 2 * (*outFilterSize)] =
+ (*outFilter)[k + 3 * (*outFilterSize)] = (*outFilter)[k];
+ }
+
+ ret = 0;
+
+fail:
+ av_free(filter);
+ av_free(filter2);
+ return ret;
+}
+
+static int handle_scale_jpeg(enum PixelFormat *format)
+{
+ switch (*format)
+ {
+ case AV_PIX_FMT_YUVJ420P:
+ *format = AV_PIX_FMT_YUV420P;
+ return 1;
+ case AV_PIX_FMT_YUVJ422P:
+ *format = AV_PIX_FMT_YUV422P;
+ return 1;
+ case AV_PIX_FMT_YUVJ444P:
+ *format = AV_PIX_FMT_YUV444P;
+ return 1;
+ case AV_PIX_FMT_YUVJ440P:
+ *format = AV_PIX_FMT_YUV440P;
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static void scaleGetSubSampleFactors(int *h, int *v, enum PixelFormat format)
+{
+ *h = av_pix_fmt_descriptors[format].log2_chroma_w;
+ *v = av_pix_fmt_descriptors[format].log2_chroma_h;
+}
+
+typedef struct FormatEntry {
+ int is_supported_in, is_supported_out;
+} FormatEntry;
+
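+/* Indexed by enum PixelFormat value; each entry records whether the format is
+ * accepted as scaler input and/or output (see scale_isSupportedInput/Output). */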
+static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 0 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 },
+ { 0, 1 }, { 1, 1 }, { 1, 1 }, { 0, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 0 }, { 1, 0 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 0 }, { 1, 1 }, { 1, 1 }, { 0, 0 },
+ { 0, 0 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 0 },
+ { 1, 0 }, { 1, 0 },
+};
+
+int scale_isSupportedInput( enum PixelFormat pix_fmt )
+{
+ return (unsigned)pix_fmt < AV_PIX_FMT_NB ?
+ format_entries[pix_fmt].is_supported_in : 0;
+}
+
+int scale_isSupportedOutput( enum PixelFormat pix_fmt )
+{
+ return (unsigned)pix_fmt < AV_PIX_FMT_NB ?
+ format_entries[pix_fmt].is_supported_out : 0;
+}
+
+static void hcscale_fast_c( ScaleContext *c, int16_t *dst1, int16_t *dst2,
+ int dstWidth, const uint8_t *src1,
+ const uint8_t *src2, int srcW, int xInc )
+{
+ int i;
+ unsigned int xpos = 0;
+ for (i = 0; i < dstWidth; i++)
+ {
+ register unsigned int xx = xpos >> 16;
+ register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
+ dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha);
+ dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);
+ xpos += xInc;
+ }
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+ {
+ dst1[i] = src1[srcW-1]*128;
+ dst2[i] = src2[srcW-1]*128;
+ }
+}
+
+static void hyscale_fast_c(ScaleContext *c, int16_t *dst, int dstWidth,
+ const uint8_t *src, int srcW, int xInc)
+{
+ int i;
+ unsigned int xpos = 0;
+ for (i = 0; i < dstWidth; i++)
+ {
+ register unsigned int xx = xpos >> 16;
+ register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
+ dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
+ xpos += xInc;
+ }
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+ dst[i] = src[srcW-1]*128;
+}
+
+static void hScale16To19_c(ScaleContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ const uint16_t *src = (const uint16_t *) _src;
+ int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
+ int sh = bits - 4;
+
+ if((isScaleAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8)
+ && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
+ sh= 9;
+
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+
+ for (j = 0; j < filterSize; j++)
+ {
+ val += src[srcPos + j] * filter[filterSize * i + j];
+ }
+ dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
+ }
+}
+
+static void hScale16To15_c(ScaleContext *c, int16_t *dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ const uint16_t *src = (const uint16_t *) _src;
+ int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
+
+ if(sh<15)
+ sh= isScaleAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8
+ ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
+
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+
+ for (j = 0; j < filterSize; j++)
+ {
+ val += src[srcPos + j] * filter[filterSize * i + j];
+ }
+ // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
+ dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
+ }
+}
+
+static void hScale8To15_c(ScaleContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+ for (j = 0; j < filterSize; j++)
+ {
+ val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
+ }
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // the cubic equation does overflow ...
+ }
+}
+
+static void hScale8To19_c(ScaleContext *c, int16_t *_dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+ for (j = 0; j < filterSize; j++)
+ {
+ val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
+ }
+ dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ...
+ }
+}
+
+static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ {
+ dstU[i] = (FFMIN(dstU[i], 30775) * 4663 - 9289992) >> 12; // -264
+ dstV[i] = (FFMIN(dstV[i], 30775) * 4663 - 9289992) >> 12; // -264
+ }
+}
+
+static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ {
+ dstU[i] = (dstU[i] * 1799 + 4081085) >> 11; // 1469
+ dstV[i] = (dstV[i] * 1799 + 4081085) >> 11; // 1469
+ }
+}
+
+static void lumRangeToJpeg_c(int16_t *dst, int width)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ dst[i] = (FFMIN(dst[i], 30189) * 19077 - 39057361) >> 14;
+}
+
+static void lumRangeFromJpeg_c(int16_t *dst, int width)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ dst[i] = (dst[i] * 14071 + 33561947) >> 14;
+}
+
+static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
+{
+ int i;
+ int32_t *dstU = (int32_t *) _dstU;
+ int32_t *dstV = (int32_t *) _dstV;
+ for (i = 0; i < width; i++)
+ {
+ dstU[i] = (FFMIN(dstU[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264
+ dstV[i] = (FFMIN(dstV[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264
+ }
+}
+
+static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
+{
+ int i;
+ int32_t *dstU = (int32_t *) _dstU;
+ int32_t *dstV = (int32_t *) _dstV;
+ for (i = 0; i < width; i++)
+ {
+ dstU[i] = (dstU[i] * 1799 + (4081085 << 4)) >> 11; // 1469
+ dstV[i] = (dstV[i] * 1799 + (4081085 << 4)) >> 11; // 1469
+ }
+}
+
+static void lumRangeToJpeg16_c(int16_t *_dst, int width)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ for (i = 0; i < width; i++)
+ dst[i] = (FFMIN(dst[i], 30189 << 4) * 4769 - (39057361 << 2)) >> 12;
+}
+
+static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ for (i = 0; i < width; i++)
+ dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
+}
+
+static av_cold void sws_init_swScale_c(ScaleContext *c)
+{
+ enum PixelFormat srcFormat = c->srcFormat;
+
+ ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
+ &c->yuv2nv12cX, &c->yuv2packed1,
+ &c->yuv2packed2, &c->yuv2packedX);
+
+ ff_sws_init_input_funcs(c);
+
+ if (c->srcBpc == 8)
+ {
+ if (c->dstBpc <= 10)
+ {
+ c->hyScale = c->hcScale = hScale8To15_c;
+ if (c->flags & SWS_FAST_BILINEAR)
+ {
+ c->hyscale_fast = hyscale_fast_c;
+ c->hcscale_fast = hcscale_fast_c;
+ }
+ }
+ else
+ {
+ c->hyScale = c->hcScale = hScale8To19_c;
+ }
+ }
+ else
+ {
+ c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c
+ : hScale16To15_c;
+ }
+
+ if (c->srcRange != c->dstRange && !isScaleAnyRGB(c->dstFormat))
+ {
+ if (c->dstBpc <= 10)
+ {
+ if (c->srcRange)
+ {
+ c->lumConvertRange = lumRangeFromJpeg_c;
+ c->chrConvertRange = chrRangeFromJpeg_c;
+ }
+ else
+ {
+ c->lumConvertRange = lumRangeToJpeg_c;
+ c->chrConvertRange = chrRangeToJpeg_c;
+ }
+ }
+ else
+ {
+ if (c->srcRange)
+ {
+ c->lumConvertRange = lumRangeFromJpeg16_c;
+ c->chrConvertRange = chrRangeFromJpeg16_c;
+ }
+ else
+ {
+ c->lumConvertRange = lumRangeToJpeg16_c;
+ c->chrConvertRange = chrRangeToJpeg16_c;
+ }
+ }
+ }
+
+ if (!(isScaleGray(srcFormat) || isScaleGray(c->dstFormat) ||
+ srcFormat == AV_PIX_FMT_MONOBLACK || srcFormat == AV_PIX_FMT_MONOWHITE))
+ c->needs_hcscale = 1;
+}
+
+int scale_init_context(ScaleContext *c, ScaleFilter *srcFilter, ScaleFilter *dstFilter)
+{
+ ScaleFilter dummyFilter = { NULL, NULL, NULL, NULL };
+ int srcW = c->srcW;
+ int srcH = c->srcH;
+ int dstW = c->dstW;
+ int dstH = c->dstH;
+ int flags, cpu_flags;
+ enum PixelFormat srcFormat = c->srcFormat;
+ enum PixelFormat dstFormat = c->dstFormat;
+
+ cpu_flags = 0;
+ flags = c->flags;
+
+ if(srcFormat != c->srcFormat || dstFormat != c->dstFormat)
+ {
+ hb_log("deprecated pixel format used, make sure you did set range correctly.");
+ c->srcFormat = srcFormat;
+ c->dstFormat = dstFormat;
+ }
+
+ if (srcW < 4 || srcH < 1 || dstW < 8 || dstH < 1)
+ {
+ hb_log("%dx%d -> %dx%d is invalid scaling dimension.",srcW,srcH,dstW,dstH);
+ return -1;
+ }
+
+ if (!dstFilter)
+ dstFilter = &dummyFilter;
+ if (!srcFilter)
+ srcFilter = &dummyFilter;
+
+ c->lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW;
+ c->lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH;
+ c->dstFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[dstFormat]);
+ c->srcFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[srcFormat]);
+ c->vRounder = 4 * 0x0001000100010001ULL;
+
+ scaleGetSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
+ scaleGetSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
+
+ // drop some chroma lines if the user wants it
+ c->vChrDrop = (flags & SWS_SRC_V_CHR_DROP_MASK) >> SWS_SRC_V_CHR_DROP_SHIFT;
+ c->chrSrcVSubSample += c->vChrDrop;
+ c->chrSrcW = -((-srcW) >> c->chrSrcHSubSample);
+ c->chrSrcH = -((-srcH) >> c->chrSrcVSubSample);
+ c->chrDstW = -((-dstW) >> c->chrDstHSubSample);
+ c->chrDstH = -((-dstH) >> c->chrDstVSubSample);
+ c->chrXInc = (((int64_t)c->chrSrcW << 16) + (c->chrDstW >> 1)) / c->chrDstW;
+ c->chrYInc = (((int64_t)c->chrSrcH << 16) + (c->chrDstH >> 1)) / c->chrDstH;
+
+ const int filterAlign = 1;
+
+ if (initScaleFilter(&c->hLumFilter, &c->hLumFilterPos,
+ &c->hLumFilterSize, c->lumXInc,
+ srcW, dstW, filterAlign, 1 << 14,
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
+ cpu_flags, srcFilter->lumH, dstFilter->lumH,
+ c->param) < 0)
+ goto fail;
+
+ if (initScaleFilter(&c->hChrFilter, &c->hChrFilterPos,
+ &c->hChrFilterSize, c->chrXInc,
+ c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
+ cpu_flags, srcFilter->chrH, dstFilter->chrH,
+ c->param) < 0)
+ goto fail;
+
+ if (initScaleFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize,
+ c->lumYInc, srcH, dstH, filterAlign, (1 << 12),
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
+ cpu_flags, srcFilter->lumV, dstFilter->lumV,
+ c->param) < 0)
+ goto fail;
+
+ if (initScaleFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize,
+ c->chrYInc, c->chrSrcH, c->chrDstH,
+ filterAlign, (1 << 12),
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
+ cpu_flags, srcFilter->chrV, dstFilter->chrV,
+ c->param) < 0)
+ goto fail;
+ return 0;
+fail:
+ return -1;
+}
+
+ScaleContext *scale_getContext(int srcW, int srcH, enum PixelFormat srcFormat,
+ int dstW, int dstH, enum PixelFormat dstFormat,
+ int flags, ScaleFilter *srcFilter,
+ ScaleFilter *dstFilter, const double *param)
+{
+ ScaleContext *sc = (ScaleContext*)malloc(sizeof(ScaleContext));
+ sc->flags = flags;
+ sc->srcW = srcW;
+ sc->srcH = srcH;
+ sc->dstW = dstW;
+ sc->dstH = dstH;
+ sc->srcRange = handle_scale_jpeg(&srcFormat);
+ sc->dstRange = handle_scale_jpeg(&dstFormat);
+ sc->srcFormat = srcFormat;
+ sc->dstFormat = dstFormat;
+ sc->hyscale_fast = 0;
+ sc->hcscale_fast = 0;
+
+ if (param)
+ {
+ sc->param[0] = param[0];
+ sc->param[1] = param[1];
+ }
+
+ if (scale_init_context(sc, srcFilter, dstFilter) < 0)
+ {
+ sws_freeContext(sc);
+ return NULL;
+ }
+ return sc;
+}
+
+int scale_opencl(ScaleContext *c, void *cl_inbuf, void *cl_outbuf, int *srcStride, int *dstStride)
+{
+ int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat);
+ av_scale_frame(c,cl_outbuf,cl_inbuf,srcStride,dstStride,&should_dither);
+ return 1;
+}
+
+void scale_init( int width, int height, int dstwidth, int dstheight )
+{
+ int srcW = width;
+ int srcH = height;
+ int dstW = dstwidth;
+ int dstH = dstheight;
+ enum PixelFormat inputfmt = AV_PIX_FMT_YUV420P;
+ enum PixelFormat outputfmt = AV_PIX_FMT_YUV420P;
+ int flags = SWS_BILINEAR;
+ g_scale = scale_getContext(srcW,srcH,inputfmt,dstW,dstH,outputfmt,flags,NULL,NULL,NULL);
+}
+
+void scale_release()
+{
+ sws_freeContext( g_scale );
+}
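+
+/* Expected call order for this scaler (a sketch of intent, not enforced by the
+ * code): scale_init() once with source/destination sizes, scale_run() per
+ * frame, then scale_release(). */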
+#ifdef USE_OPENCL
+int scale_run( cl_mem inbuf, cl_mem outbuf, int linesizey, int linesizeuv, int height )
+{
+ g_scale->cl_src = inbuf;
+ g_scale->cl_dst = outbuf;
+
+ int src_stride[4] = { linesizey, linesizeuv, linesizeuv, 0 };
+ int dst_stride[4] = { g_scale->dstW, g_scale->chrDstW, g_scale->chrDstW, 0 };
+ int ret = -1;
+
+ ret = scale_opencl( g_scale, inbuf, outbuf, src_stride, dst_stride );
+
+ return ret;
+}
+#endif
+#endif
diff --git a/libhb/scale.h b/libhb/scale.h new file mode 100644 index 000000000..46c177efc --- /dev/null +++ b/libhb/scale.h @@ -0,0 +1,324 @@ +/* scale.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+
+ */
+
+#ifndef SCALE_H
+#define SCALE_H
+#ifdef USE_OPENCL
+#include <stdint.h>
+#include "vadxva2.h"
+#include "libavutil/pixfmt.h"
+#include "hbffmpeg.h"
+
+#define YUVRGB_TABLE_HEADROOM 128
+#define MAX_FILTER_SIZE 256
+#define is16BPS(x) \
+ (av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 15)
+
+#define is9_OR_10BPS(x) \
+ (av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 8 || \
+ av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 9)
+
+#if ARCH_X86_64
+# define APCK_PTR2 8
+# define APCK_COEF 16
+# define APCK_SIZE 24
+#else
+# define APCK_PTR2 4
+# define APCK_COEF 8
+# define APCK_SIZE 16
+#endif
+
+typedef void (*yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+typedef void (*yuv2planarX_fn)(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+typedef void (*yuv2interleavedX_fn)(struct ScaleContext *c,
+ const int16_t *chrFilter,
+ int chrFilterSize,
+ const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ uint8_t *dest, int dstW);
+
+typedef void (*yuv2packed1_fn)(struct ScaleContext *c, const int16_t *lumSrc,
+ const int16_t *chrUSrc[2],
+ const int16_t *chrVSrc[2],
+ const int16_t *alpSrc, uint8_t *dest,
+ int dstW, int uvalpha, int y);
+
+typedef void (*yuv2packed2_fn)(struct ScaleContext *c, const int16_t *lumSrc[2],
+ const int16_t *chrUSrc[2],
+ const int16_t *chrVSrc[2],
+ const int16_t *alpSrc[2],
+ uint8_t *dest,
+ int dstW, int yalpha, int uvalpha, int y);
+
+typedef void (*yuv2packedX_fn)(struct ScaleContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter,
+ const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest,
+ int dstW, int y);
+
+typedef int (*SwsFunc)(struct ScaleContext *context, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[]);
+
+typedef struct {
+ double *coeff; ///< pointer to the list of coefficients
+ int length; ///< number of coefficients in the vector
+} ScaleVector;
+
+typedef struct {
+ ScaleVector *lumH;
+ ScaleVector *lumV;
+ ScaleVector *chrH;
+ ScaleVector *chrV;
+} ScaleFilter;
+
+typedef struct ScaleContext {
+ SwsFunc swScale;
+ int srcW; ///< Width of source luma/alpha planes.
+ int srcH; ///< Height of source luma/alpha planes.
+ int dstH; ///< Height of destination luma/alpha planes.
+ int chrSrcW; ///< Width of source chroma planes.
+ int chrSrcH; ///< Height of source chroma planes.
+ int chrDstW; ///< Width of destination chroma planes.
+ int chrDstH; ///< Height of destination chroma planes.
+ int lumXInc, chrXInc;
+ int lumYInc, chrYInc;
+ enum PixelFormat dstFormat; ///< Destination pixel format.
+ enum PixelFormat srcFormat; ///< Source pixel format.
+ int dstFormatBpp; ///< Number of bits per pixel of the destination pixel format.
+ int srcFormatBpp; ///< Number of bits per pixel of the source pixel format.
+ int dstBpc, srcBpc;
+ int chrSrcHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in source image.
+ int chrSrcVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in source image.
+ int chrDstHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in destination image.
+ int chrDstVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in destination image.
+ int vChrDrop; ///< Binary logarithm of extra vertical subsampling factor in source image chroma planes specified by user.
+ int sliceDir; ///< Direction that slices are fed to the scaler (1 = top-to-bottom, -1 = bottom-to-top).
+ double param[2]; ///< Input parameters for scaling algorithms that need them.
+
+ uint32_t pal_yuv[256];
+ uint32_t pal_rgb[256];
+
+ int16_t **lumPixBuf; ///< Ring buffer for scaled horizontal luma plane lines to be fed to the vertical scaler.
+ int16_t **chrUPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
+ int16_t **chrVPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
+ int16_t **alpPixBuf; ///< Ring buffer for scaled horizontal alpha plane lines to be fed to the vertical scaler.
+ int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer.
+ int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer.
+ int lastInLumBuf; ///< Last scaled horizontal luma/alpha line from source in the ring buffer.
+ int lastInChrBuf; ///< Last scaled horizontal chroma line from source in the ring buffer.
+ int lumBufIndex; ///< Index in ring buffer of the last scaled horizontal luma/alpha line from source.
+ int chrBufIndex; ///< Index in ring buffer of the last scaled horizontal chroma line from source.
+
+ uint8_t *formatConvBuffer;
+ int16_t *hLumFilter; ///< Array of horizontal filter coefficients for luma/alpha planes.
+ int16_t *hChrFilter; ///< Array of horizontal filter coefficients for chroma planes.
+ int16_t *vLumFilter; ///< Array of vertical filter coefficients for luma/alpha planes.
+ int16_t *vChrFilter; ///< Array of vertical filter coefficients for chroma planes.
+ int32_t *hLumFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for luma/alpha planes.
+ int32_t *hChrFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for chroma planes.
+ int32_t *vLumFilterPos; ///< Array of vertical filter starting positions for each dst[i] for luma/alpha planes.
+ int32_t *vChrFilterPos; ///< Array of vertical filter starting positions for each dst[i] for chroma planes.
+ int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels.
+ int hChrFilterSize; ///< Horizontal filter size for chroma pixels.
+ int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels.
+ int vChrFilterSize; ///< Vertical filter size for chroma pixels.
+
+ int lumMmx2FilterCodeSize; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code size for luma/alpha planes.
+ int chrMmx2FilterCodeSize; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code size for chroma planes.
+ uint8_t *lumMmx2FilterCode; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code for luma/alpha planes.
+ uint8_t *chrMmx2FilterCode; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code for chroma planes.
+
+ int canMMX2BeUsed;
+
+ unsigned char *dest;
+ unsigned char *source;
+
+ int dstY; ///< Last destination vertical line output from last slice.
+ int flags; ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
+ void *yuvTable; ///< Pointer to the yuv->rgb table start so it can be freed().
+ uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
+ uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
+ int table_gV[256 + 2*YUVRGB_TABLE_HEADROOM];
+ uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
+
+ // Colorspace parameters, as set/queried via sws_getColorspaceDetails()
+ int contrast, brightness, saturation;
+ int srcColorspaceTable[4];
+ int dstColorspaceTable[4];
+ int srcRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (source image).
+ int dstRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image).
+ int src0Alpha;
+ int dst0Alpha;
+ int yuv2rgb_y_offset;
+ int yuv2rgb_y_coeff;
+ int yuv2rgb_v2r_coeff;
+ int yuv2rgb_v2g_coeff;
+ int yuv2rgb_u2g_coeff;
+ int yuv2rgb_u2b_coeff;
+
+#define RED_DITHER "0*8"
+#define GREEN_DITHER "1*8"
+#define BLUE_DITHER "2*8"
+#define Y_COEFF "3*8"
+#define VR_COEFF "4*8"
+#define UB_COEFF "5*8"
+#define VG_COEFF "6*8"
+#define UG_COEFF "7*8"
+#define Y_OFFSET "8*8"
+#define U_OFFSET "9*8"
+#define V_OFFSET "10*8"
+#define LUM_MMX_FILTER_OFFSET "11*8"
+#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
+#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
+#define ESP_OFFSET "11*8+4*4*256*2+8"
+#define VROUNDER_OFFSET "11*8+4*4*256*2+16"
+#define U_TEMP "11*8+4*4*256*2+24"
+#define V_TEMP "11*8+4*4*256*2+32"
+#define Y_TEMP "11*8+4*4*256*2+40"
+#define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48"
+#define UV_OFF_PX "11*8+4*4*256*3+48"
+#define UV_OFF_BYTE "11*8+4*4*256*3+56"
+#define DITHER16 "11*8+4*4*256*3+64"
+#define DITHER32 "11*8+4*4*256*3+80"
+
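+ /* The string macros above are byte offsets into this context, spliced into
+    the x86 inline assembly of the scaler; they must stay in sync with the
+    order of the DECLARE_ALIGNED fields below (DSTW_OFFSET in particular is
+    hard-coded in the ASM, as noted above). */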
+ DECLARE_ALIGNED(8, uint64_t, redDither);
+ DECLARE_ALIGNED(8, uint64_t, greenDither);
+ DECLARE_ALIGNED(8, uint64_t, blueDither);
+
+ DECLARE_ALIGNED(8, uint64_t, yCoeff);
+ DECLARE_ALIGNED(8, uint64_t, vrCoeff);
+ DECLARE_ALIGNED(8, uint64_t, ubCoeff);
+ DECLARE_ALIGNED(8, uint64_t, vgCoeff);
+ DECLARE_ALIGNED(8, uint64_t, ugCoeff);
+ DECLARE_ALIGNED(8, uint64_t, yOffset);
+ DECLARE_ALIGNED(8, uint64_t, uOffset);
+ DECLARE_ALIGNED(8, uint64_t, vOffset);
+ int32_t lumMmxFilter[4 * MAX_FILTER_SIZE];
+ int32_t chrMmxFilter[4 * MAX_FILTER_SIZE];
+ int dstW; ///< Width of destination luma/alpha planes.
+ DECLARE_ALIGNED(8, uint64_t, esp);
+ DECLARE_ALIGNED(8, uint64_t, vRounder);
+ DECLARE_ALIGNED(8, uint64_t, u_temp);
+ DECLARE_ALIGNED(8, uint64_t, v_temp);
+ DECLARE_ALIGNED(8, uint64_t, y_temp);
+ int32_t alpMmxFilter[4 * MAX_FILTER_SIZE];
+
+ DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
+ DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes
+ DECLARE_ALIGNED(8, uint16_t, dither16)[8];
+ DECLARE_ALIGNED(8, uint32_t, dither32)[8];
+
+ const uint8_t *chrDither8, *lumDither8;
+
+#if HAVE_ALTIVEC
+ vector signed short CY;
+ vector signed short CRV;
+ vector signed short CBU;
+ vector signed short CGU;
+ vector signed short CGV;
+ vector signed short OY;
+ vector unsigned short CSHIFT;
+ vector signed short *vYCoeffsBank, *vCCoeffsBank;
+#endif
+
+#if ARCH_BFIN
+ DECLARE_ALIGNED(4, uint32_t, oy);
+ DECLARE_ALIGNED(4, uint32_t, oc);
+ DECLARE_ALIGNED(4, uint32_t, zero);
+ DECLARE_ALIGNED(4, uint32_t, cy);
+ DECLARE_ALIGNED(4, uint32_t, crv);
+ DECLARE_ALIGNED(4, uint32_t, rmask);
+ DECLARE_ALIGNED(4, uint32_t, cbu);
+ DECLARE_ALIGNED(4, uint32_t, bmask);
+ DECLARE_ALIGNED(4, uint32_t, cgu);
+ DECLARE_ALIGNED(4, uint32_t, cgv);
+ DECLARE_ALIGNED(4, uint32_t, gmask);
+#endif
+
+#if HAVE_VIS
+ DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10];
+#endif
+ int use_mmx_vfilter;
+
+ /* function pointers for swScale() */
+ yuv2planar1_fn yuv2plane1;
+ yuv2planarX_fn yuv2planeX;
+ yuv2interleavedX_fn yuv2nv12cX;
+ yuv2packed1_fn yuv2packed1;
+ yuv2packed2_fn yuv2packed2;
+ yuv2packedX_fn yuv2packedX;
+
+ /// Unscaled conversion of luma plane to YV12 for horizontal scaler.
+ void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
+ int width, uint32_t *pal);
+ /// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
+ void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
+ int width, uint32_t *pal);
+ /// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
+ void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
+ int width, uint32_t *pal);
+
+ void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width);
+ void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4],
+ int width);
+
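+ /* Fast bilinear horizontal scalers, used when SWS_FAST_BILINEAR (flags & 1)
+    is selected; the general hyScale/hcScale paths below apply the full FIR
+    filter described by the hLumFilter/hChrFilter tables and their position
+    arrays. */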
+ void (*hyscale_fast)(struct SwsContext *c,
+ int16_t *dst, int dstWidth,
+ const uint8_t *src, int srcW, int xInc);
+ void (*hcscale_fast)(struct SwsContext *c,
+ int16_t *dst1, int16_t *dst2, int dstWidth,
+ const uint8_t *src1, const uint8_t *src2,
+ int srcW, int xInc);
+
+ void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+ void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+ void (*lumConvertRange)(int16_t *dst, int width);
+ void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width);
+
+ int needs_hcscale; ///< Set if there are chroma planes to be converted.
+
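+ /* OpenCL buffer handles: device-side copies of the horizontal/vertical
+    filter coefficient and position tables above, an intermediate buffer for
+    the horizontally scaled planes, and the source/destination frame buffers. */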
+ cl_mem cl_hLumFilter;
+ cl_mem cl_hLumFilterPos;
+ cl_mem cl_hChrFilter;
+ cl_mem cl_hChrFilterPos;
+ cl_mem cl_vLumFilter;
+ cl_mem cl_vLumFilterPos;
+ cl_mem cl_vChrFilter;
+ cl_mem cl_vChrFilterPos;
+
+ cl_mem cl_intermediaBuf;
+
+ cl_mem cl_src;
+ cl_mem cl_dst;
+} ScaleContext;
+
+void scale_init(int, int, int, int);
+void scale_release();
+int scale_run(cl_mem inbuf, cl_mem outbuf, int linesizey, int linesizeuv, int height);
+#endif
+#endif
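The block above ends the new ScaleContext/scale.h interface. As a point of reference, the following is a minimal, hedged sketch of how a caller might drive the exported functions; it is modeled on hb_init_filter() in vadxva2.c later in this patch, which creates the output cl_mem with hb_create_buffer(), runs scale_run() with the source luma/chroma line sizes, and reads the result back with hb_read_opencl_buffer(). The argument order of scale_init() is not documented in this hunk, so the (srcW, srcH, dstW, dstH) order below is an assumption, as is calling init/release around a single frame.

    #ifdef USE_OPENCL
    /* Hedged usage sketch; scale_init() argument order and the per-frame
     * init/release calls are assumptions, the rest follows hb_init_filter(). */
    static int example_ocl_scale(cl_mem cl_src, uint8_t *dst_host,
                                 int srcW, int srcH, int dstW, int dstH)
    {
        int out_size = dstW * dstH * 3 / 2;      /* planar 4:2:0 output frame */
        cl_mem cl_out;

        scale_init(srcW, srcH, dstW, dstH);      /* assumed argument order */
        if (!hb_create_buffer(&cl_out, CL_MEM_WRITE_ONLY, out_size))
            return -1;

        /* luma line size = srcW, chroma line size = srcW / 2 for 4:2:0 */
        scale_run(cl_src, cl_out, srcW, srcW / 2, srcH);

        hb_read_opencl_buffer(cl_out, dst_host, out_size);
        CL_FREE(cl_out);
        scale_release();
        return 0;
    }
    #endif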
diff --git a/libhb/scale_kernel.c b/libhb/scale_kernel.c new file mode 100644 index 000000000..ddcfed914 --- /dev/null +++ b/libhb/scale_kernel.c @@ -0,0 +1,223 @@ +/* scale_kernel.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+
+ */
+
+#ifdef USE_OPENCL
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include "scale.h"
+#include "openclwrapper.h"
+
+#define OCLCHECK( method, ...) \
+ status = method(__VA_ARGS__); if(status != CL_SUCCESS) { \
+ hb_log(" error %s %d",# method, status); assert(0); return status; }
+
+#define CREATEBUF( out, flags, size, ptr)\
+ out = clCreateBuffer( kenv->context, (flags), (size), ptr, &status );\
+ if( status != CL_SUCCESS ) { hb_log( "clCreateBuffer failed %d", status ); return -1; }
+
+#define CL_PARAM_NUM 20
+
+/* Combine the hscale (horizontal) and yuv2plane (vertical output) stages into a single OpenCL scaling pass */
+static int CreateCLBuffer( ScaleContext *c, KernelEnv *kenv )
+{
+ cl_int status;
+
+ if(!c->hyscale_fast || !c->hcscale_fast)
+ {
+ CREATEBUF(c->cl_hLumFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstW*c->hLumFilterSize*sizeof(cl_short), c->hLumFilter);
+ CREATEBUF(c->cl_hLumFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstW*sizeof(cl_int), c->hLumFilterPos);
+ CREATEBUF(c->cl_hChrFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstW*c->hChrFilterSize*sizeof(cl_short), c->hChrFilter);
+ CREATEBUF(c->cl_hChrFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstW*sizeof(cl_int), c->hChrFilterPos);
+ }
+ if( c->vLumFilterSize > 1 && c->vChrFilterSize > 1 )
+ {
+ CREATEBUF(c->cl_vLumFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstH*c->vLumFilterSize*sizeof(cl_short), c->vLumFilter);
+ CREATEBUF(c->cl_vChrFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstH*c->vChrFilterSize*sizeof(cl_short), c->vChrFilter);
+ }
+ CREATEBUF(c->cl_vLumFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstH*sizeof(cl_int), c->vLumFilterPos);
+ CREATEBUF(c->cl_vChrFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstH*sizeof(cl_int), c->vChrFilterPos);
+
+ return 1;
+}
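CreateCLBuffer() uploads the precomputed libswscale filter tables once per context: CREATEBUF expands to clCreateBuffer() with CL_MEM_COPY_HOST_PTR, so each host-side coefficient/position array is copied into a read-only device buffer at creation time and no separate clEnqueueWriteBuffer() is needed. For one table, the macro expansion is roughly:

    cl_int status;
    c->cl_hLumFilter = clCreateBuffer(kenv->context,
                                      CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                      c->dstW * c->hLumFilterSize * sizeof(cl_short),
                                      c->hLumFilter, &status);
    if (status != CL_SUCCESS)
    {
        hb_log("clCreateBuffer failed %d", status);
        return -1;
    }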
+
+int av_scale_frame_func( void **userdata, KernelEnv *kenv )
+{
+ ScaleContext *c = (ScaleContext *)userdata[0];
+
+ c->cl_src = (cl_mem)userdata[2];
+ c->cl_dst = (cl_mem)userdata[1];
+
+ /*frame size*/
+ int *tmp = (int *)userdata[3];
+ int srcStride = tmp[0];
+ int srcChrStride = tmp[1];
+ int srcW = c->srcW;
+ int srcH = c->srcH;
+
+ tmp = (int *)userdata[4];
+ int dstStride = tmp[0];
+ int dstChrStride = tmp[1];
+ int dstW = c->dstW;
+ int dstH = c->dstH;
+
+ /* local variables */
+ cl_int status;
+ size_t global_work_size[2];
+
+ int intermediaSize;
+
+ int st = CreateCLBuffer(c,kenv);
+ if( st <= 0 ) // CREATEBUF returns -1 on failure, so test for <= 0 rather than !st
+ {
+ hb_log( "CreateCLBuffer[%s] failed %d", "scale_opencl", st );
+ return -1;
+ }
+
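+ /* The intermediate buffer holds the horizontally scaled planes at the
+    destination width but the source height: dstStride * srcH for luma plus,
+    for a 4:2:0 source, dstChrStride * (srcH / 2) for each of the two chroma
+    planes, i.e. dstChrStride * srcH in total. */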
+ intermediaSize = dstStride * srcH + dstChrStride * srcH;
+
+ CREATEBUF(c->cl_intermediaBuf, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, intermediaSize*sizeof(cl_short), NULL);
+
+ static int init_chr_status = 0;
+ static cl_kernel chr_kernel;
+
+ if(init_chr_status == 0){
+
+ if(!(c->flags & 1))
+ {
+ chr_kernel = clCreateKernel( kenv->program, "hscale_all_opencl", NULL );
+ // Set the fixed kernel arguments (filter tables and their sizes)
+ OCLCHECK(clSetKernelArg,chr_kernel, 2, sizeof(cl_mem), (void*)&c->cl_hLumFilter);
+ OCLCHECK(clSetKernelArg,chr_kernel, 3, sizeof(cl_mem), (void*)&c->cl_hLumFilterPos);
+ OCLCHECK(clSetKernelArg,chr_kernel, 4, sizeof(int), (void*)&c->hLumFilterSize);
+ OCLCHECK(clSetKernelArg,chr_kernel, 5, sizeof(cl_mem), (void*)&c->cl_hChrFilter);
+ OCLCHECK(clSetKernelArg,chr_kernel, 6, sizeof(cl_mem), (void*)&c->cl_hChrFilterPos);
+ OCLCHECK(clSetKernelArg,chr_kernel, 7, sizeof(int), (void*)&c->hChrFilterSize);
+ }
+
+ /*Set the arguments*/
+ OCLCHECK(clSetKernelArg, chr_kernel, 8, sizeof(dstW), (void*)&dstW);
+ OCLCHECK(clSetKernelArg, chr_kernel, 9, sizeof(srcH), (void*)&srcH);
+ OCLCHECK(clSetKernelArg, chr_kernel, 10, sizeof(srcW), (void*)&srcW);
+ OCLCHECK(clSetKernelArg, chr_kernel, 11, sizeof(srcH), (void*)&srcH);
+ OCLCHECK(clSetKernelArg, chr_kernel, 12, sizeof(dstStride), (void*)&dstStride);
+ OCLCHECK(clSetKernelArg, chr_kernel, 13, sizeof(dstChrStride), (void*)&dstChrStride);
+ OCLCHECK(clSetKernelArg, chr_kernel, 14, sizeof(srcStride), (void*)&srcStride);
+ OCLCHECK(clSetKernelArg, chr_kernel, 15, sizeof(srcChrStride), (void*)&srcChrStride);
+ init_chr_status = 1;
+ }
+
+ kenv->kernel = chr_kernel;
+ OCLCHECK(clSetKernelArg, chr_kernel, 0, sizeof(cl_mem), (void*)&c->cl_intermediaBuf);
+ OCLCHECK(clSetKernelArg, chr_kernel, 1, sizeof(cl_mem), (void*)&c->cl_src);
+ /*Run the Kernel*/
+ global_work_size[0] = c->chrDstW; // dstW >> 1; must be a multiple of 256
+ global_work_size[1] = c->chrSrcH;
+
+ OCLCHECK(clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
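+ /* The horizontal pass above writes the horizontally scaled
+    (destination-width, source-height) planes into cl_intermediaBuf;
+    the vertical-scale kernel enqueued below reads them back from that
+    buffer on the same command queue. */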
+
+ static int init_lum_status = 0;
+ static cl_kernel lum_kernel;
+
+ if( init_lum_status == 0 ){
+ //Vertical:
+ /*Create Kernel*/
+ if( c->vLumFilterSize > 1 && c->vChrFilterSize > 1 )
+ lum_kernel = clCreateKernel( kenv->program, "vscale_all_nodither_opencl", NULL );
+ else
+ lum_kernel = clCreateKernel( kenv->program, "vscale_fast_opencl", NULL );
+
+ if( c->vLumFilterSize > 1 && c->vChrFilterSize > 1 )
+ {
+ OCLCHECK(clSetKernelArg, lum_kernel, 2, sizeof(cl_mem), (void*)&c->cl_vLumFilter);
+ OCLCHECK(clSetKernelArg, lum_kernel, 3, sizeof(int), (void*)&c->vLumFilterSize);
+ OCLCHECK(clSetKernelArg, lum_kernel, 4, sizeof(cl_mem), (void*)&c->cl_vChrFilter);
+ OCLCHECK(clSetKernelArg, lum_kernel, 5, sizeof(int), (void*)&c->vChrFilterSize);
+ OCLCHECK(clSetKernelArg, lum_kernel, 6, sizeof(cl_mem), (void*)&c->cl_vLumFilterPos);
+ OCLCHECK(clSetKernelArg, lum_kernel, 7, sizeof(cl_mem), (void*)&c->cl_vChrFilterPos);
+ OCLCHECK(clSetKernelArg, lum_kernel, 8, sizeof(dstW), (void*)&dstW);
+ OCLCHECK(clSetKernelArg, lum_kernel, 9, sizeof(dstH), (void*)&dstH);
+ OCLCHECK(clSetKernelArg, lum_kernel, 10, sizeof(srcW), (void*)&srcW);
+ OCLCHECK(clSetKernelArg, lum_kernel, 11, sizeof(srcH), (void*)&srcH);
+ OCLCHECK(clSetKernelArg, lum_kernel, 12, sizeof(dstStride), (void*)&dstStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 13, sizeof(dstChrStride), (void*)&dstChrStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 14, sizeof(dstStride), (void*)&dstStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 15, sizeof(dstChrStride), (void*)&dstChrStride);
+ }
+ else
+ {
+ OCLCHECK(clSetKernelArg, lum_kernel, 2, sizeof(cl_mem), (void*)&c->cl_vLumFilterPos);
+ OCLCHECK(clSetKernelArg, lum_kernel, 3, sizeof(cl_mem), (void*)&c->cl_vChrFilterPos);
+ OCLCHECK(clSetKernelArg, lum_kernel, 4, sizeof(dstW), (void*)&dstW);
+ OCLCHECK(clSetKernelArg, lum_kernel, 5, sizeof(dstH), (void*)&dstH);
+ OCLCHECK(clSetKernelArg, lum_kernel, 6, sizeof(srcW), (void*)&srcW);
+ OCLCHECK(clSetKernelArg, lum_kernel, 7, sizeof(srcH), (void*)&srcH);
+ OCLCHECK(clSetKernelArg, lum_kernel, 8, sizeof(dstStride), (void*)&dstStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 9, sizeof(dstChrStride), (void*)&dstChrStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 10, sizeof(dstStride), (void*)&dstStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 11, sizeof(dstChrStride), (void*)&dstChrStride);
+ }
+ init_lum_status = 1;
+ }
+
+ kenv->kernel = lum_kernel;
+ OCLCHECK(clSetKernelArg, kenv->kernel, 0, sizeof(cl_mem), (void*)&c->cl_dst);
+ OCLCHECK(clSetKernelArg, kenv->kernel, 1, sizeof(cl_mem), (void*)&c->cl_intermediaBuf);
+
+ /*Run the Kernel*/
+ global_work_size[0] = c->chrDstW;
+ global_work_size[1] = c->chrDstH;
+
+ OCLCHECK(clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL,global_work_size, NULL, 0, NULL, NULL);
+
+ clReleaseMemObject( c->cl_intermediaBuf );
+
+ return 1;
+}
+
+void av_scale_frame(ScaleContext *c, void *dst, void *src, int *srcStride, int *dstStride, int *should_dither)
+{
+
+ static int regflg = 0;
+ void *userdata[CL_PARAM_NUM];
+ userdata[0] = (void *)c;
+ userdata[1] = (void *)dst;
+ userdata[2] = (void *)src;
+ userdata[3] = (void *)srcStride;
+ userdata[4] = (void *)dstStride;
+ userdata[5] = (void *)should_dither;
+
+ if( regflg==0 )
+ {
+ int st = hb_register_kernel_wrapper( "scale_opencl", av_scale_frame_func);
+ if( !st )
+ {
+ hb_log( "register kernel[%s] faild %d", "scale_opencl",st );
+ return;
+ }
+ regflg++;
+ }
+
+ if( !hb_run_kernel( "scale_opencl", userdata ))
+ {
+ hb_log("run kernel function[%s] faild", "scale_opencl_func" );
+ return;
+ }
+}
+
+#endif
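av_scale_frame() is the entry point for the rest of the scaler: it registers av_scale_frame_func once under the name "scale_opencl" and then dispatches it through hb_run_kernel(), passing the ScaleContext, the cl_mem frame handles and the per-plane stride arrays via the userdata array (userdata[5], should_dither, is stored but not consumed by av_scale_frame_func above). A hedged caller-side sketch, with hypothetical buffer names and 1080p-to-720p strides:

    /* Hedged sketch; scale_ctx, cl_dst_frame and cl_src_frame are hypothetical
     * placeholders for a configured ScaleContext and OpenCL frame buffers. */
    int src_stride[2] = { 1920, 960 };   /* assumed 4:2:0: luma stride, chroma stride */
    int dst_stride[2] = { 1280, 640 };
    int should_dither = 0;

    av_scale_frame(scale_ctx, (void *)cl_dst_frame, (void *)cl_src_frame,
                   src_stride, dst_stride, &should_dither);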
diff --git a/libhb/scale_kernel.h b/libhb/scale_kernel.h new file mode 100644 index 000000000..ce413382f --- /dev/null +++ b/libhb/scale_kernel.h @@ -0,0 +1,20 @@ +/* scale_kernel.h + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + + */ + +#ifndef _H_SCALE_KERNEL_H +#define _H_SCALE_KERNEL_H +#ifdef USE_OPENCL +void av_scale_frame(ScaleContext *c, void *dst, void *src, int *srcStride, int *dstStride, int *should_dither); +#endif +#endif diff --git a/libhb/stream.c b/libhb/stream.c index 994cb70de..3512b2d93 100644 --- a/libhb/stream.c +++ b/libhb/stream.c @@ -16,6 +16,7 @@ #include "lang.h" #include "a52dec/a52.h" #include "libbluray/bluray.h" +#include "vadxva2.h" #define min(a, b) a < b ? a : b #define HB_MAX_PROBE_SIZE (1*1024*1024) @@ -49,7 +50,7 @@ typedef struct { static const stream2codec_t st2codec[256] = { st(0x00, U, 0, 0, NULL), st(0x01, V, WORK_DECMPEG2, 0, "MPEG1"), - st(0x02, V, WORK_DECMPEG2, 0, "MPEG2"), + st(0x02, V, WORK_DECMPEG2, AV_CODEC_ID_MPEG2VIDEO, "MPEG2"), st(0x03, A, HB_ACODEC_FFMPEG, AV_CODEC_ID_MP2, "MPEG1"), st(0x04, A, HB_ACODEC_FFMPEG, AV_CODEC_ID_MP2, "MPEG2"), st(0x05, N, 0, 0, "ISO 13818-1 private section"), @@ -609,6 +610,10 @@ static int hb_stream_get_type(hb_stream_t *stream) if ( fread(buf, 1, sizeof(buf), stream->file_handle) == sizeof(buf) ) { +#ifdef USE_HWD + if ( hb_gui_use_hwd_flag == 1 ) + return 0; +#endif int psize; if ( ( psize = hb_stream_check_for_ts(buf) ) != 0 ) { @@ -1096,7 +1101,28 @@ hb_title_t * hb_stream_title_scan(hb_stream_t *stream, hb_title_t * title) { hb_log( "transport stream missing PCRs - using video DTS instead" ); } - +#ifdef USE_HWD + hb_va_dxva2_t * dxva2 = NULL; + dxva2 = hb_va_create_dxva2( dxva2, title->video_codec_param ); + if ( dxva2 ) + { + title->hwd_support = 1; + hb_va_close(dxva2); + dxva2 = NULL; + } + else + title->hwd_support = 0; +#else + title->hwd_support = 0; +#endif +#ifdef USE_OPENCL + if ( hb_confirm_gpu_type() == 0 ) + title->opencl_support = 1; + else + title->opencl_support = 0; +#else + title->opencl_support = 0; +#endif // Height, width, rate and aspect ratio information is filled in // when the previews are built return title; @@ -5513,6 +5539,7 @@ static hb_title_t *ffmpeg_title_scan( hb_stream_t *stream, hb_title_t *title ) title->demuxer = HB_NULL_DEMUXER; title->video_codec = 0; int i; + int pix_fmt = -1; for (i = 0; i < ic->nb_streams; ++i ) { if ( ic->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO && @@ -5520,6 +5547,7 @@ static hb_title_t *ffmpeg_title_scan( hb_stream_t *stream, hb_title_t *title ) title->video_codec == 0 ) { AVCodecContext *context = ic->streams[i]->codec; + pix_fmt = context->pix_fmt; if ( context->pix_fmt != AV_PIX_FMT_YUV420P && !sws_isSupportedInput( context->pix_fmt ) ) { @@ -5626,6 +5654,30 @@ static hb_title_t *ffmpeg_title_scan( hb_stream_t *stream, hb_title_t *title ) chapter->seconds = title->seconds; hb_list_add( title->list_chapter, chapter ); } +#ifdef USE_HWD + hb_va_dxva2_t * dxva2 = NULL; + dxva2 = hb_va_create_dxva2( dxva2, title->video_codec_param ); + if (dxva2) + { + title->hwd_support = 1; + hb_va_close(dxva2); + dxva2 = NULL; + } + else + title->hwd_support = 0; 
+ if ( hb_check_hwd_fmt(pix_fmt) == 0) + title->hwd_support = 0; +#else + title->hwd_support = 0; +#endif +#ifdef USE_OPENCL + if (hb_confirm_gpu_type() == 0) + title->opencl_support = 1; + else + title->opencl_support = 0; +#else + title->opencl_support = 0; +#endif return title; } diff --git a/libhb/vadxva2.c b/libhb/vadxva2.c new file mode 100644 index 000000000..4137b63c6 --- /dev/null +++ b/libhb/vadxva2.c @@ -0,0 +1,812 @@ +/* vadxva2.c + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + */ +#include "vadxva2.h" + +#ifdef USE_OPENCL +#if defined(__APPLE__) +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +#include "oclnv12toyuv.h" +#include "scale.h" +#endif + +#ifdef USE_HWD + +static int hb_va_setup( hb_va_dxva2_t *dxva2, void **hw, int width, int height ); +static int hb_va_get( hb_va_dxva2_t *dxva2, AVFrame *frame ); +static int hb_d3d_create_device( hb_va_dxva2_t *dxva2 ); +static void hb_d3d_destroy_device( hb_va_dxva2_t *dxvva2 ); +static int hb_d3d_create_device_manager( hb_va_dxva2_t *dxva2 ); +static void hb_d3d_destroy_device_manager( hb_va_dxva2_t *dxva2 ); +static int hb_dx_create_video_service( hb_va_dxva2_t *dxva2 ); +static void hb_dx_destroy_video_service( hb_va_dxva2_t *dxva2 ); +static int hb_dx_find_video_service_conversion( hb_va_dxva2_t *dxva2, GUID *input, D3DFORMAT *output ); +static int hb_dx_create_video_decoder( hb_va_dxva2_t *dxva2, int codec_id, const hb_title_t* fmt ); +static void hb_dx_create_video_conversion( hb_va_dxva2_t *dxva2 ); +static const hb_d3d_format_t *hb_d3d_find_format( D3DFORMAT format ); +static const hb_dx_mode_t *hb_dx_find_mode( const GUID *guid ); +static void hb_dx_destroy_video_decoder( hb_va_dxva2_t *dxva2 ); +/** + * It destroys a Direct3D device manager + */ +static void hb_d3d_destroy_device_manager( hb_va_dxva2_t *dxva2 ) +{ + if( dxva2->devmng ) + IDirect3DDeviceManager9_Release( dxva2->devmng ); +} +/** + * It releases a Direct3D device and its resources. 
+ */ +static void hb_d3d_destroy_device( hb_va_dxva2_t *dxva2 ) +{ + if( dxva2->d3ddev ) + IDirect3DDevice9_Release( dxva2->d3ddev ); + if( dxva2->d3dobj ) + IDirect3D9_Release( dxva2->d3dobj ); +} +/** + * It destroys a DirectX video service + */ +static void hb_dx_destroy_video_service( hb_va_dxva2_t *dxva2 ) +{ + if( dxva2->device ) + IDirect3DDeviceManager9_CloseDeviceHandle( dxva2->devmng, dxva2->device ); + + if( dxva2->vs ) + IDirectXVideoDecoderService_Release( dxva2->vs ); +} + +static const hb_d3d_format_t *hb_d3d_find_format( D3DFORMAT format ) +{ + unsigned i; + for( i = 0; d3d_formats[i].name; i++ ) + { + if( d3d_formats[i].format == format ) + return &d3d_formats[i]; + } + return NULL; +} + +static void hb_dx_create_video_conversion( hb_va_dxva2_t *dxva2 ) +{ + switch( dxva2->render ) + { + case MAKEFOURCC( 'N', 'V', '1', '2' ): + dxva2->output = MAKEFOURCC( 'Y', 'V', '1', '2' ); + break; + default: + dxva2->output = dxva2->render; + break; + } +} + +void hb_va_release( hb_va_dxva2_t *dxva2, AVFrame *frame ) +{ + LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)frame->data[3]; + unsigned i; + for( i = 0; i < dxva2->surface_count; i++ ) + { + hb_va_surface_t *surface = &dxva2->surface[i]; + if( surface->d3d == d3d ) + surface->refcount--; + } +} + + +void hb_va_close( hb_va_dxva2_t *dxva2 ) +{ + hb_dx_destroy_video_decoder( dxva2 ); + hb_dx_destroy_video_service( dxva2 ); + hb_d3d_destroy_device_manager( dxva2 ); + hb_d3d_destroy_device( dxva2 ); + + if( dxva2->hdxva2_dll ) + FreeLibrary( dxva2->hdxva2_dll ); + if( dxva2->hd3d9_dll ) + FreeLibrary( dxva2->hd3d9_dll ); + +#ifdef USE_OPENCL + if ( dxva2->nv12toyuv_tmp_in ) + free( dxva2->nv12toyuv_tmp_in ); + if ( dxva2->nv12toyuv_tmp_out ) + free( dxva2->nv12toyuv_tmp_out ); +#endif + dxva2->description = NULL; + free( dxva2 ); +} + +/** + * It creates a DXVA2 decoder using the given video format + */ +static int hb_dx_create_video_decoder( hb_va_dxva2_t *dxva2, int codec_id, const hb_title_t* fmt ) +{ + dxva2->width = fmt->width; + dxva2->height = fmt->height; + dxva2->surface_width = (fmt->width + 15) & ~15; + dxva2->surface_height = (fmt->height + 15) & ~15; + switch( codec_id ) + { + case AV_CODEC_ID_H264: + dxva2->surface_count = 16 + 1; + break; + default: + dxva2->surface_count = 2 + 1; + break; + } + LPDIRECT3DSURFACE9 surface_list[VA_DXVA2_MAX_SURFACE_COUNT]; + if( FAILED( IDirectXVideoDecoderService_CreateSurface( dxva2->vs, + dxva2->surface_width, + dxva2->surface_height, + dxva2->surface_count - 1, + dxva2->render, + D3DPOOL_DEFAULT, + 0, + DXVA2_VideoDecoderRenderTarget, + surface_list, NULL ))) + { + hb_log( "dxva2:IDirectXVideoAccelerationService_CreateSurface failed" ); + dxva2->surface_count = 0; + return HB_WORK_ERROR; + } + + unsigned i; + for( i = 0; i<dxva2->surface_count; i++ ) + { + hb_va_surface_t *surface = &dxva2->surface[i]; + surface->d3d = surface_list[i]; + surface->refcount = 0; + surface->order = 0; + } + hb_log( "dxva2:CreateSurface succeed with %d, fmt (%dx%d) surfaces (%dx%d)", dxva2->surface_count, + fmt->width, fmt->height, dxva2->surface_width, dxva2->surface_height ); + DXVA2_VideoDesc dsc; + memset( &dsc, 0, sizeof(dsc)); + dsc.SampleWidth = fmt->width; + dsc.SampleHeight = fmt->height; + dsc.Format = dxva2->render; + + if( fmt->rate> 0 && fmt->rate_base> 0 ) + { + dsc.InputSampleFreq.Numerator = fmt->rate; + dsc.InputSampleFreq.Denominator = fmt->rate_base; + } + else + { + dsc.InputSampleFreq.Numerator = 0; + dsc.InputSampleFreq.Denominator = 0; + } + + dsc.OutputFrameFreq = 
dsc.InputSampleFreq; + dsc.UABProtectionLevel = FALSE; + dsc.Reserved = 0; + + /* FIXME I am unsure we can let unknown everywhere */ + DXVA2_ExtendedFormat *ext = &dsc.SampleFormat; + ext->SampleFormat = 0; //DXVA2_SampleUnknown; + ext->VideoChromaSubsampling = 0; //DXVA2_VideoChromaSubsampling_Unknown; + ext->NominalRange = 0; //DXVA2_NominalRange_Unknown; + ext->VideoTransferMatrix = 0; //DXVA2_VideoTransferMatrix_Unknown; + ext->VideoLighting = 0; //DXVA2_VideoLighting_Unknown; + ext->VideoPrimaries = 0; //DXVA2_VideoPrimaries_Unknown; + ext->VideoTransferFunction = 0; //DXVA2_VideoTransFunc_Unknown; + + /* List all configurations available for the decoder */ + UINT cfg_count = 0; + DXVA2_ConfigPictureDecode *cfg_list = NULL; + if( FAILED( IDirectXVideoDecoderService_GetDecoderConfigurations( dxva2->vs, &dxva2->input, &dsc, NULL, &cfg_count, &cfg_list ))) + { + hb_log( "dxva2:IDirectXVideoDecoderService_GetDecoderConfigurations failed" ); + return HB_WORK_ERROR; + } + hb_log( "dxva2:we got %d decoder configurations", cfg_count ); + + /* Select the best decoder configuration */ + int cfg_score = 0; + for( i = 0; i < cfg_count; i++ ) + { + const DXVA2_ConfigPictureDecode *cfg = &cfg_list[i]; + hb_log( "dxva2:configuration[%d] ConfigBitstreamRaw %d", i, cfg->ConfigBitstreamRaw ); + int score; + if( cfg->ConfigBitstreamRaw == 1 ) + score = 1; + else if( codec_id == AV_CODEC_ID_H264 && cfg->ConfigBitstreamRaw == 2 ) + score = 2; + else + continue; + if( IsEqualGUID( &cfg->guidConfigBitstreamEncryption, &DXVA_NoEncrypt )) + score += 16; + if( cfg_score < score ) + { + dxva2->cfg = *cfg; + cfg_score = score; + } + } + //my_release(cfg_list); + if( cfg_score <= 0 ) + { + hb_log( "dxva2:Failed to find a supported decoder configuration" ); + return HB_WORK_ERROR; + } + + /* Create the decoder */ + IDirectXVideoDecoder *decoder; + if( FAILED( IDirectXVideoDecoderService_CreateVideoDecoder( dxva2->vs, &dxva2->input, &dsc, &dxva2->cfg, surface_list, dxva2->surface_count, &decoder ))) + { + hb_log( "dxva2:IDirectXVideoDecoderService_CreateVideoDecoder failed" ); + return HB_WORK_ERROR; + } + dxva2->decoder = decoder; + hb_log( "dxva2:IDirectXVideoDecoderService_CreateVideoDecoder succeed" ); + return HB_WORK_OK; +} + +typedef HWND (WINAPI *PROCGETSHELLWND)(); +/** + * It creates a DirectX video service + */ +static int hb_d3d_create_device( hb_va_dxva2_t *dxva2 ) +{ + LPDIRECT3D9 (WINAPI *Create9)( UINT SDKVersion ); + Create9 = (void*)GetProcAddress( dxva2->hd3d9_dll, TEXT( "Direct3DCreate9" )); + if( !Create9 ) + { + hb_log( "dxva2:Cannot locate reference to Direct3DCreate9 ABI in DLL" ); + return HB_WORK_ERROR; + } + LPDIRECT3D9 d3dobj; + d3dobj = Create9( D3D_SDK_VERSION ); + if( !d3dobj ) + { + hb_log( "dxva2:Direct3DCreate9 failed" ); + return HB_WORK_ERROR; + } + dxva2->d3dobj = d3dobj; + D3DADAPTER_IDENTIFIER9 *d3dai = &dxva2->d3dai; + if( FAILED( IDirect3D9_GetAdapterIdentifier( dxva2->d3dobj, D3DADAPTER_DEFAULT, 0, d3dai ))) + { + hb_log( "dxva2:IDirect3D9_GetAdapterIdentifier failed" ); + memset( d3dai, 0, sizeof(*d3dai)); + } + + PROCGETSHELLWND GetShellWindow; + HMODULE hUser32 = GetModuleHandle( "user32" ); + GetShellWindow = (PROCGETSHELLWND) + GetProcAddress( hUser32, "GetShellWindow" ); + + D3DPRESENT_PARAMETERS *d3dpp = &dxva2->d3dpp; + memset( d3dpp, 0, sizeof(*d3dpp)); + d3dpp->Flags = D3DPRESENTFLAG_VIDEO; + d3dpp->Windowed = TRUE; + d3dpp->hDeviceWindow = NULL; + d3dpp->SwapEffect = D3DSWAPEFFECT_DISCARD; + d3dpp->MultiSampleType = D3DMULTISAMPLE_NONE; + 
d3dpp->PresentationInterval = D3DPRESENT_INTERVAL_DEFAULT; + d3dpp->BackBufferCount = 0; /* FIXME what to put here */ + d3dpp->BackBufferFormat = D3DFMT_X8R8G8B8; /* FIXME what to put here */ + d3dpp->BackBufferWidth = 0; + d3dpp->BackBufferHeight = 0; + d3dpp->EnableAutoDepthStencil = FALSE; + + LPDIRECT3DDEVICE9 d3ddev; + //if (FAILED(IDirect3D9_CreateDevice(d3dobj, D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, GetShellWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING|D3DCREATE_MULTITHREADED, d3dpp, &d3ddev))) + if( FAILED( IDirect3D9_CreateDevice( d3dobj, + D3DADAPTER_DEFAULT, + D3DDEVTYPE_HAL, + GetShellWindow(), + D3DCREATE_HARDWARE_VERTEXPROCESSING|D3DCREATE_MULTITHREADED, + d3dpp, + &d3ddev ))) + { + hb_log( "dxva2:IDirect3D9_CreateDevice failed" ); + return HB_WORK_ERROR; + } + dxva2->d3ddev = d3ddev; + + return HB_WORK_OK; +} +/** + * It creates a Direct3D device manager + */ +static int hb_d3d_create_device_manager( hb_va_dxva2_t *dxva2 ) +{ + HRESULT(WINAPI *CreateDeviceManager9)( UINT *pResetToken, IDirect3DDeviceManager9 ** ); + CreateDeviceManager9 = (void*)GetProcAddress( dxva2->hdxva2_dll, TEXT( "DXVA2CreateDirect3DDeviceManager9" )); + + if( !CreateDeviceManager9 ) + { + hb_log( "dxva2:cannot load function" ); + return HB_WORK_ERROR; + } + + UINT token; + IDirect3DDeviceManager9 *devmng; + if( FAILED( CreateDeviceManager9( &token, &devmng ))) + { + hb_log( "dxva2:OurDirect3DCreateDeviceManager9 failed" ); + return HB_WORK_ERROR; + } + dxva2->token = token; + dxva2->devmng = devmng; + + long hr = IDirect3DDeviceManager9_ResetDevice( devmng, dxva2->d3ddev, token ); + if( FAILED( hr )) + { + hb_log( "dxva2:IDirect3DDeviceManager9_ResetDevice failed: %08x", (unsigned)hr ); + return HB_WORK_ERROR; + } + return HB_WORK_OK; +} +/** + * It creates a DirectX video service + */ +static int hb_dx_create_video_service( hb_va_dxva2_t *dxva2 ) +{ + HRESULT (WINAPI *CreateVideoService)( IDirect3DDevice9 *, REFIID riid, void **ppService ); + CreateVideoService = (void*)GetProcAddress( dxva2->hdxva2_dll, TEXT( "DXVA2CreateVideoService" )); + + if( !CreateVideoService ) + { + hb_log( "dxva2:cannot load function" ); + return HB_WORK_ERROR; + } + + HRESULT hr; + + HANDLE device; + hr = IDirect3DDeviceManager9_OpenDeviceHandle( dxva2->devmng, &device ); + if( FAILED( hr )) + { + hb_log( "dxva2:OpenDeviceHandle failed" ); + return HB_WORK_ERROR; + } + dxva2->device = device; + + IDirectXVideoDecoderService *vs; + hr = IDirect3DDeviceManager9_GetVideoService( dxva2->devmng, device, &IID_IDirectXVideoDecoderService, (void*)&vs ); + if( FAILED( hr )) + { + hb_log( "dxva2:GetVideoService failed" ); + return HB_WORK_ERROR; + } + dxva2->vs = vs; + + return HB_WORK_OK; +} +/** + * Find the best suited decoder mode GUID and render format. 
+ */ +static int hb_dx_find_video_service_conversion( hb_va_dxva2_t *dxva2, GUID *input, D3DFORMAT *output ) +{ + unsigned int input_count = 0; + GUID *input_list = NULL; + if( FAILED( IDirectXVideoDecoderService_GetDecoderDeviceGuids( dxva2->vs, &input_count, &input_list ))) + { + hb_log( "dxva2:IDirectXVideoDecoderService_GetDecoderDeviceGuids failed" ); + return HB_WORK_ERROR; + } + unsigned i, j; + for( i = 0; i < input_count; i++ ) + { + const GUID *g = &input_list[i]; + const hb_dx_mode_t *mode = hb_dx_find_mode( g ); + } + + for( i = 0; dxva2_modes[i].name; i++ ) + { + const hb_dx_mode_t *mode = &dxva2_modes[i]; + if( !mode->codec || mode->codec != dxva2->codec_id ) + continue; + + int is_suported = 0; + const GUID *g; + for( g = &input_list[0]; !is_suported && g < &input_list[input_count]; g++ ) + { + is_suported = IsEqualGUID( mode->guid, g ); + } + if( !is_suported ) + continue; + + unsigned int output_count = 0; + D3DFORMAT *output_list = NULL; + if( FAILED( IDirectXVideoDecoderService_GetDecoderRenderTargets( dxva2->vs, mode->guid, &output_count, &output_list ))) + { + hb_log( "dxva2:IDirectXVideoDecoderService_GetDecoderRenderTargets failed" ); + continue; + } + for( j = 0; j < output_count; j++ ) + { + const D3DFORMAT f = output_list[j]; + const hb_d3d_format_t *format = hb_d3d_find_format( f ); + if( format ) + { + //hb_log( "dxva2:%s is supported for output", format->name ); + } + else + { + hb_log( "dxvar2:%d is supported for output (%4.4s)", f, (const char*)&f ); + } + } + + for( j = 0; d3d_formats[j].name; j++ ) + { + const hb_d3d_format_t *format = &d3d_formats[j]; + int is_suported = 0; + unsigned k; + for( k = 0; !is_suported && k < output_count; k++ ) + { + is_suported = format->format == output_list[k]; + } + if( !is_suported ) + continue; + *input = *mode->guid; + *output = format->format; + return HB_WORK_OK; + } + } + return HB_WORK_ERROR; +} +static const hb_dx_mode_t *hb_dx_find_mode( const GUID *guid ) +{ + unsigned i; + for( i = 0; dxva2_modes[i].name; i++ ) + { + if( IsEqualGUID( dxva2_modes[i].guid, guid )) + return &dxva2_modes[i]; + } + return NULL; +} + + +static void hb_dx_destroy_video_decoder( hb_va_dxva2_t *dxva2 ) +{ + if( dxva2->decoder ) + IDirectXVideoDecoder_Release( dxva2->decoder ); + dxva2->decoder = NULL; + + unsigned i; + for( i = 0; i<dxva2->surface_count; i++ ) + IDirect3DSurface9_Release( dxva2->surface[i].d3d ); + dxva2->surface_count = 0; +} +/** + * setup dxva2 +*/ +static int hb_va_setup( hb_va_dxva2_t *dxva2, void **hw, int width, int height ) +{ + if( dxva2->width == width && dxva2->height == height && dxva2->decoder ) + goto ok; + + hb_dx_destroy_video_decoder( dxva2 ); + *hw = NULL; + dxva2->i_chroma = 0; + + if( width <= 0 || height <= 0 ) + return HB_WORK_ERROR; + + hb_title_t fmt; + memset( &fmt, 0, sizeof(fmt)); + fmt.width = width; + fmt.height = height; + + if( hb_dx_create_video_decoder( dxva2, dxva2->codec_id, &fmt ) == HB_WORK_ERROR ) + return HB_WORK_ERROR; + dxva2->hw.decoder = dxva2->decoder; + dxva2->hw.cfg = &dxva2->cfg; + dxva2->hw.surface_count = dxva2->surface_count; + dxva2->hw.surface = dxva2->hw_surface; + + unsigned i; + for( i = 0; i < dxva2->surface_count; i++ ) + dxva2->hw.surface[i] = dxva2->surface[i].d3d; + + hb_dx_create_video_conversion( dxva2 ); + +ok: + *hw = &dxva2->hw; + const hb_d3d_format_t *output = hb_d3d_find_format( dxva2->output ); + dxva2->i_chroma = output->codec; + return HB_WORK_OK; + +} + +static int hb_va_get( hb_va_dxva2_t *dxva2, AVFrame *frame ) +{ + unsigned i, old; + for( i = 0, 
old = 0; i < dxva2->surface_count; i++ ) + { + hb_va_surface_t *surface = &dxva2->surface[i]; + if( !surface->refcount ) + break; + if( surface->order < dxva2->surface[old].order ) + old = i; + } + if( i >= dxva2->surface_count ) + i = old; + + hb_va_surface_t *surface = &dxva2->surface[i]; + + surface->refcount = 1; + surface->order = dxva2->surface_order++; + + for( i = 0; i < 4; i++ ) + { + frame->data[i] = NULL; + frame->linesize[i] = 0; + if( i == 0 || i == 3 ) + frame->data[i] = (void*)surface->d3d; + } + return HB_WORK_OK; +} +/** + * nv12 to yuv of c reference + */ +static void hb_copy_from_nv12( uint8_t *dst, uint8_t *src[2], size_t src_pitch[2], unsigned width, unsigned height ) +{ + unsigned int i, j; + uint8_t *dstU, *dstV; + dstU = dst + width*height; + dstV = dstU + width*height/4; + unsigned int heithtUV, widthUV; + heithtUV = height/2; + widthUV = width/2; + + for( i = 0; i < height; i++ ) //Y + { + memcpy( dst + i * width, src[0] + i * src_pitch[0], width ); + } + for( i = 0; i < heithtUV; i++ ) + { + for( j = 0; j < widthUV; j++ ) + { + dstU[i * widthUV + j] = *(src[1] + i * src_pitch[1] + 2 * j); + dstV[i *widthUV + j] = *(src[1] + i * src_pitch[1] + 2 * j + 1); + } + } +} + +#ifdef USE_OPENCL +void hb_init_filter( cl_mem src, int srcwidth, int srcheight, uint8_t* dst, int dstwidth, int dstheight, int *crop ) +{ + T_FilterLink fl = {0}; + int STEP = srcwidth * srcheight * 3 / 2; + int OUTSTEP = dstwidth * dstheight * 3 / 2; + int HEIGHT = srcheight; + int LINESIZEY = srcwidth; + int LINESIZEUV = srcwidth / 2; + + cl_mem cl_outbuf; + + if( !hb_create_buffer( &(cl_outbuf), CL_MEM_WRITE_ONLY, OUTSTEP ) ) + { + hb_log("av_create_buffer cl_outbuf Error"); + return; + } + + fl.cl_outbuf = cl_outbuf; + + scale_run( src, fl.cl_outbuf, LINESIZEY, LINESIZEUV, HEIGHT ); + + hb_read_opencl_buffer( fl.cl_outbuf, dst, OUTSTEP ); + CL_FREE( cl_outbuf ); + + return; +} +#endif +/** + * lock frame data form surface. 
+ * nv12 to yuv with opencl and with C reference + * scale with opencl + */ +int hb_va_extract( hb_va_dxva2_t *dxva2, uint8_t *dst, AVFrame *frame, int job_w, int job_h, int *crop, hb_oclscale_t *os, int use_opencl, int use_decomb, int use_detelecine ) + +{ + LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)frame->data[3]; + D3DLOCKED_RECT lock; + if( FAILED( IDirect3DSurface9_LockRect( d3d, &lock, NULL, D3DLOCK_READONLY ))) + { + hb_log( "dxva2:Failed to lock surface" ); + return HB_WORK_ERROR; + } + + if( dxva2->render == MAKEFOURCC( 'N', 'V', '1', '2' )) + { + uint8_t *plane[2] = + { + lock.pBits, + (uint8_t*)lock.pBits + lock.Pitch * dxva2->surface_height + }; + size_t pitch[2] = + { + lock.Pitch, + lock.Pitch, + }; + + hb_copy_from_nv12( dst, plane, pitch, dxva2->width, dxva2->height ); + } + IDirect3DSurface9_UnlockRect( d3d ); + return HB_WORK_OK; +} + +/** + * create dxva2 service + * load library D3D9.dll + */ +hb_va_dxva2_t * hb_va_create_dxva2( hb_va_dxva2_t *dxva2, int codec_id ) +{ + if( dxva2 ) + { + hb_va_close( dxva2 ); + dxva2 = NULL; + } + + hb_va_dxva2_t *dxva = calloc( 1, sizeof(*dxva) ); + if( !dxva ) return NULL; + dxva->codec_id = codec_id; + + dxva->hd3d9_dll = LoadLibrary( TEXT( "D3D9.DLL" ) ); + if( !dxva->hd3d9_dll ) + { + hb_log( "dxva2:cannot load d3d9.dll" ); + goto error; + } + dxva->hdxva2_dll = LoadLibrary( TEXT( "DXVA2.DLL" ) ); + if( !dxva->hdxva2_dll ) + { + hb_log( "dxva2:cannot load DXVA2.dll" ); + goto error; + } + + if( hb_d3d_create_device( dxva ) == HB_WORK_ERROR ) + { + hb_log( "dxva2:Failed to create Direct3D device" ); + goto error; + } + + if( hb_d3d_create_device_manager( dxva ) == HB_WORK_ERROR ) + { + hb_log( "dxva2:D3dCreateDeviceManager failed" ); + goto error; + } + + + if( hb_dx_create_video_service( dxva ) == HB_WORK_ERROR ) + { + hb_log( "dxva2:DxCreateVideoService failed" ); + goto error; + } + + if( hb_dx_find_video_service_conversion( dxva, &dxva->input, &dxva->render ) == HB_WORK_ERROR ) + { + hb_log( "dxva2:DxFindVideoServiceConversion failed" ); + goto error; + } + + dxva->do_job = HB_WORK_OK; + dxva->description = "DXVA2"; + + return dxva; + +error: + hb_va_close( dxva ); + return NULL; +} + +void hb_va_new_dxva2( hb_va_dxva2_t *dxva2, AVCodecContext *p_context ) +{ + if( p_context->width > 0 && p_context->height > 0 ) + { + if( hb_va_setup( dxva2, &p_context->hwaccel_context, p_context->width, p_context->height ) == HB_WORK_ERROR ) + { + hb_log( "dxva2:hb_va_Setup failed" ); + hb_va_close( dxva2 ); + dxva2 = NULL; + } + } + if( dxva2 ) + { + dxva2->input_pts[0] = 0; + dxva2->input_pts[1] = 0; + if( dxva2->description ) + hb_log( "dxva2:Using %s for hardware decoding", dxva2->description ); + p_context->draw_horiz_band = NULL; + } + +} + +char* hb_get_pix_fmt_name( int pix_fmt ) +{ + static const char *ppsz_name[AV_PIX_FMT_NB] = + { + [AV_PIX_FMT_VDPAU_H264] = "AV_PIX_FMT_VDPAU_H264", + [AV_PIX_FMT_VAAPI_IDCT] = "AV_PIX_FMT_VAAPI_IDCT", + [AV_PIX_FMT_VAAPI_VLD] = "AV_PIX_FMT_VAAPI_VLD", + [AV_PIX_FMT_VAAPI_MOCO] = "AV_PIX_FMT_VAAPI_MOCO", + [AV_PIX_FMT_DXVA2_VLD] = "AV_PIX_FMT_DXVA2_VLD", + [AV_PIX_FMT_YUYV422] = "AV_PIX_FMT_YUYV422", + [AV_PIX_FMT_YUV420P] = "AV_PIX_FMT_YUV420P", + }; + + return ppsz_name[pix_fmt]; +} + +enum PixelFormat hb_ffmpeg_get_format( AVCodecContext *p_context, const enum PixelFormat *pi_fmt ) +{ + int i; + for( i = 0; pi_fmt[i] != AV_PIX_FMT_NONE; i++ ) + { + hb_log( "dxva2:Available decoder output format %d (%s)", pi_fmt[i], hb_get_pix_fmt_name(pi_fmt[i]) ? 
: "Unknown" ); + if( pi_fmt[i] == AV_PIX_FMT_DXVA2_VLD ) + { + return pi_fmt[i]; + } + } + return avcodec_default_get_format( p_context, pi_fmt ); +} + +int hb_va_get_frame_buf( hb_va_dxva2_t *dxva2, AVCodecContext *p_context, AVFrame *frame ) +{ + frame->type = FF_BUFFER_TYPE_USER; + if( hb_va_get( dxva2, frame ) == HB_WORK_ERROR ) + { + hb_log( "VaGrabSurface failed" ); + return HB_WORK_ERROR; + } + return HB_WORK_OK; + +} + +int hb_check_hwd_fmt( int fmt ) +{ + int result = 1; + switch ( fmt ) + { + case AV_PIX_FMT_YUV420P16LE: + case AV_PIX_FMT_YUV420P16BE: + case AV_PIX_FMT_YUV422P16LE: + case AV_PIX_FMT_YUV422P16BE: + case AV_PIX_FMT_YUV444P16LE: + case AV_PIX_FMT_YUV444P16BE: + case AV_PIX_FMT_YUV420P9BE: + case AV_PIX_FMT_YUV420P9LE: + case AV_PIX_FMT_YUV420P10BE: + case AV_PIX_FMT_YUV420P10LE: + case AV_PIX_FMT_YUV422P10BE: + case AV_PIX_FMT_YUV422P10LE: + case AV_PIX_FMT_YUV444P9BE: + case AV_PIX_FMT_YUV444P9LE: + case AV_PIX_FMT_YUV444P10BE: + case AV_PIX_FMT_YUV444P10LE: + case AV_PIX_FMT_YUV422P9BE: + case AV_PIX_FMT_YUV422P9LE: + case AV_PIX_FMT_GBRP9BE: + case AV_PIX_FMT_GBRP9LE: + case AV_PIX_FMT_GBRP10BE: + case AV_PIX_FMT_GBRP10LE: + case AV_PIX_FMT_GBRP16BE: + case AV_PIX_FMT_GBRP16LE: + case AV_PIX_FMT_YUVA420P9BE: + case AV_PIX_FMT_YUVA420P9LE: + case AV_PIX_FMT_YUVA422P9BE: + case AV_PIX_FMT_YUVA422P9LE: + case AV_PIX_FMT_YUVA444P9BE: + case AV_PIX_FMT_YUVA444P9LE: + case AV_PIX_FMT_YUVA420P10BE: + case AV_PIX_FMT_YUVA420P10LE: + result = 0; + } + return result; +} +#endif diff --git a/libhb/vadxva2.h b/libhb/vadxva2.h new file mode 100644 index 000000000..cd879b974 --- /dev/null +++ b/libhb/vadxva2.h @@ -0,0 +1,213 @@ +/* vadxva2.h + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. 
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + */ + +#ifndef VA_DXVA2_H +#define VA_DXVA2_H + +#ifdef USE_HWD +#include "hbffmpeg.h" +#include "d3d9.h" +#include "libavcodec/dxva2.h" +#include "dxva2api.h" +#include "common.h" +#include "openclwrapper.h" + +#define HB_FOURCC( a, b, c, d ) ( ((uint32_t)a) | ( ((uint32_t)b) << 8 ) | ( ((uint32_t)c) << 16 ) | ( ((uint32_t)d) << 24 ) ) +#define MAKEFOURCC( a, b, c, d ) ((DWORD)(BYTE)(a) | ((DWORD)(BYTE)(b) << 8) | ((DWORD)(BYTE)(c) << 16) | ((DWORD)(BYTE)(d) << 24 )) +#define HB_CODEC_YV12 HB_FOURCC( 'Y', 'V', '1', '2' ) +#define HB_CODEC_NV12 HB_FOURCC( 'N', 'V', '1', '2' ) +#define DXVA2_E_NOT_INITIALIZED MAKE_HRESULT( 1, 4, 4096 ) +#define DXVA2_E_NEW_VIDEO_DEVICE MAKE_HRESULT( 1, 4, 4097 ) +#define DXVA2_E_VIDEO_DEVICE_LOCKED MAKE_HRESULT( 1, 4, 4098 ) +#define DXVA2_E_NOT_AVAILABLE MAKE_HRESULT( 1, 4, 4099 ) +#define VA_DXVA2_MAX_SURFACE_COUNT (64) + +static const GUID DXVA_NoEncrypt = { 0x1b81bed0, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID IID_IDirectXVideoDecoderService = {0xfc51a551, 0xd5e7, 0x11d9, {0xaf, 0x55, 0x00, 0x05, 0x4e, 0x43, 0xff, 0x02}}; +static const GUID DXVA2_ModeMPEG2_MoComp = { 0xe6a9f44b, 0x61b0, 0x4563, {0x9e, 0xa4, 0x63, 0xd2, 0xa3, 0xc6, 0xfe, 0x66} }; +static const GUID DXVA2_ModeMPEG2_IDCT = { 0xbf22ad00, 0x03ea, 0x4690, {0x80, 0x77, 0x47, 0x33, 0x46, 0x20, 0x9b, 0x7e} }; +static const GUID DXVA2_ModeMPEG2_VLD = { 0xee27417f, 0x5e28, 0x4e65, {0xbe, 0xea, 0x1d, 0x26, 0xb5, 0x08, 0xad, 0xc9} }; +static const GUID DXVA2_ModeH264_A = { 0x1b81be64, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeH264_B = { 0x1b81be65, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeH264_C = { 0x1b81be66, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeH264_D = { 0x1b81be67, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeH264_E = { 0x1b81be68, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeH264_F = { 0x1b81be69, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVADDI_Intel_ModeH264_A = { 0x604F8E64, 0x4951, 0x4c54, {0x88, 0xFE, 0xAB, 0xD2, 0x5C, 0x15, 0xB3, 0xD6} }; +static const GUID DXVADDI_Intel_ModeH264_C = { 0x604F8E66, 0x4951, 0x4c54, {0x88, 0xFE, 0xAB, 0xD2, 0x5C, 0x15, 0xB3, 0xD6} }; +static const GUID DXVADDI_Intel_ModeH264_E = { 0x604F8E68, 0x4951, 0x4c54, {0x88, 0xFE, 0xAB, 0xD2, 0x5C, 0x15, 0xB3, 0xD6} }; +static const GUID DXVA2_ModeWMV8_A = { 0x1b81be80, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeWMV8_B = { 0x1b81be81, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeWMV9_A = { 0x1b81be90, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeWMV9_B = { 0x1b81be91, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeWMV9_C = { 0x1b81be94, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeVC1_A = { 0x1b81beA0, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; 
+static const GUID DXVA2_ModeVC1_B = { 0x1b81beA1, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeVC1_C = { 0x1b81beA2, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeVC1_D = { 0x1b81beA3, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; + +typedef struct +{ + int width; + int height; + int rate; + int rate_base; + +}hb_dx_format; + +typedef struct +{ + LPDIRECT3DSURFACE9 d3d; + int refcount; + unsigned int order; + +} hb_va_surface_t; + +typedef struct +{ + uint8_t *base; + uint8_t *buffer; + size_t size; + +} hb_copy_cache_t; + +typedef struct +{ + const char *name; + D3DFORMAT format; + uint32_t codec; + +} hb_d3d_format_t; + +typedef struct +{ + const char *name; + const GUID *guid; + int codec; +} hb_dx_mode_t; + +typedef struct +{ + char *description; + int codec_id; + uint32_t i_chroma; + int width; + int height; + HINSTANCE hd3d9_dll; + HINSTANCE hdxva2_dll; + D3DPRESENT_PARAMETERS d3dpp; + LPDIRECT3D9 d3dobj; + D3DADAPTER_IDENTIFIER9 d3dai; + LPDIRECT3DDEVICE9 d3ddev; + UINT token; + IDirect3DDeviceManager9 *devmng; + HANDLE device; + IDirectXVideoDecoderService *vs; + GUID input; + D3DFORMAT render; + DXVA2_ConfigPictureDecode cfg; + IDirectXVideoDecoder *decoder; + D3DFORMAT output; + struct dxva_context hw; + unsigned surface_count; + unsigned surface_order; + int surface_width; + int surface_height; + uint32_t surface_chroma; + hb_va_surface_t surface[VA_DXVA2_MAX_SURFACE_COUNT]; + LPDIRECT3DSURFACE9 hw_surface[VA_DXVA2_MAX_SURFACE_COUNT]; + IDirectXVideoProcessorService *ps; + IDirectXVideoProcessor *vp; + int64_t input_pts[2]; + int64_t input_dts; + int do_job; + + // running nv12toyuv kernel. +#ifdef USE_OPENCL + cl_kernel nv12toyuv; + cl_mem cl_mem_nv12; + cl_mem cl_mem_yuv; + uint8_t * nv12toyuv_tmp_in; + uint8_t * nv12toyuv_tmp_out; +#endif +} hb_va_dxva2_t; + +typedef struct FilterLink_T +{ +#ifdef USE_OPENCL + cl_mem cl_inbuf; + cl_mem cl_outbuf; +#endif + uint8_t *mem_inbuf; + uint8_t *mem_outbuf; + int width; + int height; + int linesizeY; + int linesizeUV; + int inmemdataflag; + int outmemdataflag; + int incldataflag; + int outcldataflag; + int framenum; + int outputSize; +} T_FilterLink; + +static const hb_d3d_format_t d3d_formats[] = +{ + { "YV12", MAKEFOURCC( 'Y', 'V', '1', '2' ), HB_CODEC_YV12 }, + { "NV12", MAKEFOURCC( 'N', 'V', '1', '2' ), HB_CODEC_NV12 }, + { NULL, 0, 0 } +}; + +static const hb_dx_mode_t dxva2_modes[] = +{ + { "DXVA2_ModeMPEG2_VLD", &DXVA2_ModeMPEG2_VLD, AV_CODEC_ID_MPEG2VIDEO }, + { "DXVA2_ModeMPEG2_MoComp", &DXVA2_ModeMPEG2_MoComp, 0 }, + { "DXVA2_ModeMPEG2_IDCT", &DXVA2_ModeMPEG2_IDCT, 0 }, + + { "H.264 variable-length decoder (VLD), FGT", &DXVA2_ModeH264_F, AV_CODEC_ID_H264 }, + { "H.264 VLD, no FGT", &DXVA2_ModeH264_E, AV_CODEC_ID_H264 }, + { "H.264 VLD, no FGT (Intel)", &DXVADDI_Intel_ModeH264_E, AV_CODEC_ID_H264 }, + { "H.264 IDCT, FGT", &DXVA2_ModeH264_D, 0 }, + { "H.264 inverse discrete cosine transform (IDCT), no FGT", &DXVA2_ModeH264_C, 0 }, + { "H.264 inverse discrete cosine transform (IDCT), no FGT (Intel)", &DXVADDI_Intel_ModeH264_C, 0 }, + { "H.264 MoComp, FGT", &DXVA2_ModeH264_B, 0 }, + { "H.264 motion compensation (MoComp), no FGT", &DXVA2_ModeH264_A, 0 }, + { "H.264 motion compensation (MoComp), no FGT (Intel)", &DXVADDI_Intel_ModeH264_A, 0 }, + + { "Windows Media Video 8 MoComp", &DXVA2_ModeWMV8_B, 0 }, + { "Windows Media Video 8 post processing", &DXVA2_ModeWMV8_A, 0 }, + + { "Windows Media Video 9 IDCT", 
&DXVA2_ModeWMV9_C, 0 }, + { "Windows Media Video 9 MoComp", &DXVA2_ModeWMV9_B, 0 }, + { "Windows Media Video 9 post processing", &DXVA2_ModeWMV9_A, 0 }, + + { "VC-1 VLD", &DXVA2_ModeVC1_D, AV_CODEC_ID_VC1 }, + { "VC-1 VLD", &DXVA2_ModeVC1_D, AV_CODEC_ID_WMV3 }, + { "VC-1 IDCT", &DXVA2_ModeVC1_C, 0 }, + { "VC-1 MoComp", &DXVA2_ModeVC1_B, 0 }, + { "VC-1 post processing", &DXVA2_ModeVC1_A, 0 }, + + { NULL, NULL, 0 } +}; + +int hb_va_get_frame_buf( hb_va_dxva2_t *dxva2, AVCodecContext *p_context, AVFrame *frame ); +int hb_va_extract( hb_va_dxva2_t *dxva2, uint8_t *dst, AVFrame *frame, int job_w, int job_h, int *crop, hb_oclscale_t *os, int use_opencl, int use_decomb, int use_detelecine ); +enum PixelFormat hb_ffmpeg_get_format( AVCodecContext *, const enum PixelFormat * ); +hb_va_dxva2_t *hb_va_create_dxva2( hb_va_dxva2_t *dxva2, int codec_id ); +void hb_va_new_dxva2( hb_va_dxva2_t *dxva2, AVCodecContext *p_context ); +void hb_va_release( hb_va_dxva2_t *dxva2, AVFrame *frame ); +void hb_va_close( hb_va_dxva2_t *dxva2 ); +int hb_check_hwd_fmt( int fmt ); +#endif +#endif diff --git a/libhb/work.c b/libhb/work.c index 7f00ddab3..d44f763be 100644 --- a/libhb/work.c +++ b/libhb/work.c @@ -10,6 +10,7 @@ #include "hb.h" #include "a52dec/a52.h" #include "libavformat/avformat.h" +#include "openclwrapper.h" #ifdef USE_QSV #include "qsv_common.h" @@ -532,8 +533,24 @@ static void do_job(hb_job_t *job) job->list_work = hb_list_init(); +#ifdef USE_OPENCL + /* init opencl environment */ + if (job->use_opencl) + job->use_opencl = !hb_init_opencl_run_env(0, NULL, "-I."); +#else + job->use_opencl = 0; +#endif + hb_log( "starting job" ); + if (job->use_opencl || job->use_hwd) + { + hb_log("Using GPU: Yes."); + } + else + { + hb_log("Using GPU: No."); + } /* Look for the scanned subtitle in the existing subtitle list * select_subtitle implies that we did a scan. */ if( !job->indepth_scan && interjob->select_subtitle ) @@ -792,6 +809,9 @@ static void do_job(hb_job_t *job) init.pix_fmt = AV_PIX_FMT_YUV420P; init.width = title->width; init.height = title->height; +#ifdef USE_OPENCL + init.use_dxva = hb_use_dxva( title ); +#endif init.par_width = job->anamorphic.par_width; init.par_height = job->anamorphic.par_height; memcpy(init.crop, title->crop, sizeof(int[4])); @@ -1093,6 +1113,7 @@ static void do_job(hb_job_t *job) title->video_codec_param = AV_CODEC_ID_MPEG2VIDEO; } #endif + hb_list_add( job->list_work, ( w = hb_get_work( vcodec ) ) ); w->codec_param = title->video_codec_param; w->fifo_in = job->fifo_mpeg2; @@ -1554,7 +1575,6 @@ static inline void copy_chapter( hb_buffer_t * dst, hb_buffer_t * src ) if( src && dst && src->s.start == dst->s.start) { // restore log below to debug chapter mark propagation problems - //hb_log("work %s: Copying Chapter Break @ %"PRId64, w->name, src->s.start); dst->s.new_chap = src->s.new_chap; } } |