diff options
Diffstat (limited to 'libhb')
-rw-r--r-- | libhb/common.c | 13 | ||||
-rw-r--r-- | libhb/common.h | 70 | ||||
-rw-r--r-- | libhb/cropscale.c | 81 | ||||
-rw-r--r-- | libhb/deca52.c | 6 | ||||
-rw-r--r-- | libhb/decavcodec.c | 235 | ||||
-rw-r--r-- | libhb/detelecine.c | 2 | ||||
-rw-r--r-- | libhb/dxva2api.c | 38 | ||||
-rw-r--r-- | libhb/dxva2api.h | 822 | ||||
-rw-r--r-- | libhb/fifo.c | 83 | ||||
-rw-r--r-- | libhb/hb.c | 5 | ||||
-rw-r--r-- | libhb/hbffmpeg.h | 1 | ||||
-rw-r--r-- | libhb/internal.h | 9 | ||||
-rw-r--r-- | libhb/module.defs | 13 | ||||
-rw-r--r-- | libhb/oclnv12toyuv.c | 285 | ||||
-rw-r--r-- | libhb/oclnv12toyuv.h | 35 | ||||
-rw-r--r-- | libhb/oclscale.c | 271 | ||||
-rw-r--r-- | libhb/openclkernels.h | 771 | ||||
-rw-r--r-- | libhb/openclwrapper.c | 1261 | ||||
-rw-r--r-- | libhb/openclwrapper.h | 89 | ||||
-rw-r--r-- | libhb/scale.c | 1020 | ||||
-rw-r--r-- | libhb/scale.h | 324 | ||||
-rw-r--r-- | libhb/scale_kernel.c | 223 | ||||
-rw-r--r-- | libhb/scale_kernel.h | 20 | ||||
-rw-r--r-- | libhb/stream.c | 56 | ||||
-rw-r--r-- | libhb/vadxva2.c | 812 | ||||
-rw-r--r-- | libhb/vadxva2.h | 213 | ||||
-rw-r--r-- | libhb/work.c | 22 |
27 files changed, 6720 insertions, 60 deletions
diff --git a/libhb/common.c b/libhb/common.c index 18dd9fc53..3d5cf36fb 100644 --- a/libhb/common.c +++ b/libhb/common.c @@ -2829,6 +2829,7 @@ static void job_reset_for_mac_ui( hb_job_t * job, hb_title_t * title ) job->metadata = hb_metadata_copy( title->metadata ); } + static void job_setup( hb_job_t * job, hb_title_t * title ) { if ( job == NULL || title == NULL ) @@ -4129,3 +4130,15 @@ void hb_hexdump( hb_debug_level_t level, const char * label, const uint8_t * dat hb_deep_log( level, " %-50s%20s", line, ascii ); } } + +int hb_gui_use_hwd_flag = 0; +int hb_use_dxva( hb_title_t * title ) +{ + return ( (title->video_codec_param == AV_CODEC_ID_MPEG2VIDEO + || title->video_codec_param == AV_CODEC_ID_H264 + || title->video_codec_param == AV_CODEC_ID_VC1 + || title->video_codec_param == AV_CODEC_ID_WMV3 + || title->video_codec_param == AV_CODEC_ID_MPEG4 ) + && title->opaque_priv ); +} + diff --git a/libhb/common.h b/libhb/common.h index 19c1049cd..abc799ef1 100644 --- a/libhb/common.h +++ b/libhb/common.h @@ -20,7 +20,13 @@ #include <sys/types.h> #include <sys/stat.h> #include <dirent.h> - +#ifdef USE_OPENCL +#if defined(__APPLE__) +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif +#endif /* * It seems WinXP doesn't align the stack of new threads to 16 bytes. * To prevent crashes in SSE functions, we need to force stack alignement @@ -154,6 +160,7 @@ int hb_subtitle_can_burn( int source ); int hb_subtitle_can_pass( int source, int mux ); hb_attachment_t *hb_attachment_copy(const hb_attachment_t *src); + hb_list_t *hb_attachment_list_copy(const hb_list_t *src); void hb_attachment_close(hb_attachment_t **attachment); @@ -353,6 +360,8 @@ struct hb_title_set_s int feature; // Detected DVD feature title }; +extern int hb_gui_use_hwd_flag; + /****************************************************************************** * hb_job_t: settings to be filled by the UI * Update win/CS/HandBrake.Interop/HandBrakeInterop/HbLib/hb_job_s.cs when changing this struct @@ -370,7 +379,7 @@ struct hb_job_s int chapter_start; int chapter_end; - /* Include chapter marker track in mp4? */ + /* Include chapter marker track in mp4? */ int chapter_markers; /* Picture settings: @@ -518,6 +527,10 @@ struct hb_job_s uint32_t frames_to_skip; // decode but discard this many frames // initially (for frame accurate positioning // to non-I frames). + int use_opencl;/* 0 is disable use of opencl. 1 is enable use of opencl */ + int use_hwd; + int use_decomb; + int use_detelecine; #ifdef USE_QSV // QSV-specific settings struct @@ -895,6 +908,8 @@ struct hb_title_s uint32_t flags; // set if video stream doesn't have IDR frames + int opencl_support; + int hwd_support; #define HBTF_NO_IDR (1 << 0) #define HBTF_SCAN_COMPLETE (1 << 0) }; @@ -1070,6 +1085,52 @@ extern hb_work_object_t hb_reader; #define HB_FILTER_DROP 3 #define HB_FILTER_DONE 4 +typedef struct hb_oclscale_s +{ +#ifdef USE_OPENCL + int initialized; + // bicubic scale weights + cl_mem bicubic_x_weights; + cl_mem bicubic_y_weights; + cl_float xscale; + cl_float yscale; + int width; + int height; + // horizontal scaling and vertical scaling kernel handle + cl_kernel m_kernel; + int use_ocl_mem; // 0 use host memory. 1 use gpu oclmem +#endif +} hb_oclscale_t; + +#ifdef USE_OPENCL +int hb_ocl_scale( hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os ); +#endif + +#ifdef USE_OPENCL +int hb_use_dxva( hb_title_t * title ); +// create opencl buffer +#define CREATEBUF( out, flags, size )\ + {\ + out = clCreateBuffer( kenv->context, (flags), (size), NULL, &status );\ + if( CL_SUCCESS != status ) return -1;\ + } + +#define OCLCHECK( method, ... )\ + status = method( __VA_ARGS__ ); if( status != CL_SUCCESS ) {\ + printf( # method " error '%d'\n", status ); return status; } + +#define CL_FREE( buf )\ +{\ + if( buf )\ + {\ + { clReleaseMemObject( buf ); }\ + buf = NULL;\ + }\ +} + + +#endif + typedef struct hb_filter_init_s { hb_job_t * job; @@ -1082,6 +1143,10 @@ typedef struct hb_filter_init_s int vrate_base; int vrate; int cfr; +#ifdef USE_OPENCL + int use_dxva; +#endif + } hb_filter_init_t; typedef struct hb_filter_info_s @@ -1140,6 +1205,7 @@ enum HB_FILTER_DENOISE, HB_FILTER_RENDER_SUB, HB_FILTER_CROP_SCALE, + // Finally filters that don't care what order they are in, // except that they must be after the above filters HB_FILTER_ROTATE, diff --git a/libhb/cropscale.c b/libhb/cropscale.c index d217393d1..c7d7d9948 100644 --- a/libhb/cropscale.c +++ b/libhb/cropscale.c @@ -9,9 +9,12 @@ #include "hb.h" #include "hbffmpeg.h" +#include "common.h" + struct hb_filter_private_s { + hb_job_t *job; int width_in; int height_in; int pix_fmt; @@ -19,6 +22,13 @@ struct hb_filter_private_s int width_out; int height_out; int crop[4]; + +#ifdef USE_OPENCL + int use_dxva; + int use_decomb; + int use_detelecine; + hb_oclscale_t *os; //ocl scaler handler +#endif struct SwsContext * context; }; @@ -53,11 +63,23 @@ static int hb_crop_scale_init( hb_filter_object_t * filter, hb_filter_private_t * pv = filter->private_data; // TODO: add pix format option to settings + pv->job = init->job; pv->pix_fmt_out = init->pix_fmt; pv->width_in = init->width; pv->height_in = init->height; pv->width_out = init->width - (init->crop[2] + init->crop[3]); pv->height_out = init->height - (init->crop[0] + init->crop[1]); +#ifdef USE_OPENCL + pv->use_dxva = init->use_dxva; + pv->use_decomb = init->job->use_decomb; + pv->use_detelecine = init->job->use_detelecine; + + if( pv->job->use_opencl ) + { + pv->os = ( hb_oclscale_t * )malloc( sizeof( hb_oclscale_t ) ); + memset( pv->os, 0, sizeof( hb_oclscale_t ) ); + } +#endif memcpy( pv->crop, init->crop, sizeof( int[4] ) ); if( filter->settings ) { @@ -71,6 +93,9 @@ static int hb_crop_scale_init( hb_filter_object_t * filter, init->width = pv->width_out; init->height = pv->height_out; memcpy( init->crop, pv->crop, sizeof( int[4] ) ); +#ifdef USE_OPENCL + pv->use_dxva = init->use_dxva; +#endif return 0; } @@ -111,8 +136,16 @@ static void hb_crop_scale_close( hb_filter_object_t * filter ) { return; } +#ifdef USE_OPENCL - if ( pv->context ) + if( pv->job->use_opencl && pv->os ) + { + CL_FREE( pv->os->bicubic_x_weights ); + CL_FREE( pv->os->bicubic_y_weights ); + free( pv->os ); + } +#endif + if( pv->context ) { sws_freeContext( pv->context ); } @@ -121,6 +154,25 @@ static void hb_crop_scale_close( hb_filter_object_t * filter ) filter->private_data = NULL; } +#ifdef USE_OPENCL +static uint8_t *copy_plane( uint8_t *dst, uint8_t* src, int dstride, int sstride, int h ) +{ + if( dstride == sstride ) + { + memcpy( dst, src, dstride * h ); + return dst + dstride * h; + } + int lbytes = dstride <= sstride ? dstride : sstride; + while( --h >= 0 ) + { + memcpy( dst, src, lbytes ); + src += sstride; + dst += dstride; + } + return dst; +} +#endif + static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in ) { AVPicture pic_in; @@ -137,6 +189,15 @@ static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in ) av_picture_crop( &pic_crop, &pic_in, in->f.fmt, pv->crop[0], pv->crop[2] ); +#ifdef USE_OPENCL + // Use bicubic OpenCL scaling when selected and when downsampling < 4:1; + if ((pv->job->use_opencl) && (pv->width_out * 4 > pv->width_in) && (in->cl.buffer != NULL) && (out->cl.buffer != NULL)) + { + hb_ocl_scale(in, out, pv->crop, pv->os); + } + else + { +#endif if ( !pv->context || pv->width_in != in->f.width || pv->height_in != in->f.height || @@ -164,7 +225,9 @@ static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in ) pic_crop.linesize, 0, in->f.height - (pv->crop[0] + pv->crop[1]), pic_out.data, pic_out.linesize); - +#ifdef USE_OPENCL + } +#endif out->s = in->s; hb_buffer_move_subs( out, in ); return out; @@ -198,6 +261,18 @@ static int hb_crop_scale_work( hb_filter_object_t * filter, pv->width_out = in->f.width - (pv->crop[2] + pv->crop[3]); pv->height_out = in->f.height - (pv->crop[0] + pv->crop[1]); } +#ifdef USE_OPENCL + if ( (in->f.fmt == pv->pix_fmt_out && + !pv->crop[0] && !pv->crop[1] && !pv->crop[2] && !pv->crop[3] && + in->f.width == pv->width_out && in->f.height == pv->height_out) && + (pv->use_decomb == 0) && (pv->use_detelecine == 0) || + (pv->use_dxva && in->f.width == pv->width_out && in->f.height == pv->height_out) ) + { + *buf_out = in; + *buf_in = NULL; + return HB_FILTER_OK; + } +#else if ( in->f.fmt == pv->pix_fmt_out && !pv->crop[0] && !pv->crop[1] && !pv->crop[2] && !pv->crop[3] && in->f.width == pv->width_out && in->f.height == pv->height_out ) @@ -206,6 +281,8 @@ static int hb_crop_scale_work( hb_filter_object_t * filter, *buf_in = NULL; return HB_FILTER_OK; } +#endif + *buf_out = crop_scale( pv, in ); return HB_FILTER_OK; diff --git a/libhb/deca52.c b/libhb/deca52.c index 3c78d157c..f5dac1f96 100644 --- a/libhb/deca52.c +++ b/libhb/deca52.c @@ -336,7 +336,11 @@ static hb_buffer_t* Decode(hb_work_object_t *w) } double frame_dur = (6. * 256. * 90000.) / pv->rate; - double pts = (ipts != -1) ? (double)ipts : pv->next_expected_pts; + double pts; + if (hb_gui_use_hwd_flag == 1 && ipts != -1) + pts = ((double)ipts >= pv->next_expected_pts) ? (double)ipts : pv->next_expected_pts; + else + pts = (ipts != -1) ? (double)ipts : pv->next_expected_pts; /* AC3 passthrough: don't decode the AC3 frame */ if (audio->config.out.codec == HB_ACODEC_AC3_PASS) diff --git a/libhb/decavcodec.c b/libhb/decavcodec.c index 0e760b8db..84ad5103c 100644 --- a/libhb/decavcodec.c +++ b/libhb/decavcodec.c @@ -42,6 +42,10 @@ #include "hbffmpeg.h" #include "audio_resample.h" +#ifdef USE_HWD +#include "vadxva2.h" +#endif + #ifdef USE_QSV #include "enc_qsv.h" #include "qsv_common.h" @@ -104,7 +108,11 @@ struct hb_work_private_s int sws_pix_fmt; int cadence[12]; int wait_for_keyframe; - +#ifdef USE_HWD + hb_va_dxva2_t *dxva2; + uint8_t *dst_frame; + hb_oclscale_t *opencl_scale; +#endif hb_audio_resample_t *resample; #ifdef USE_QSV @@ -381,6 +389,22 @@ static void closePrivData( hb_work_private_t ** ppv ) hb_list_empty( &pv->list ); } hb_audio_resample_free(pv->resample); + +#ifdef USE_HWD + if ( pv->opencl_scale ) + { + free( pv->opencl_scale ); + } + + if ( pv->dxva2 ) + { +#ifdef USE_OPENCL + CL_FREE( pv->dxva2->cl_mem_nv12 ); +#endif + hb_va_close( pv->dxva2 ); + } +#endif + #ifdef USE_QSV_PTS_WORKAROUND if (pv->qsv.decode && pv->qsv.pts_list != NULL) @@ -394,6 +418,7 @@ static void closePrivData( hb_work_private_t ** ppv ) hb_list_close(&pv->qsv.pts_list); } #endif + free( pv ); } *ppv = NULL; @@ -402,7 +427,9 @@ static void closePrivData( hb_work_private_t ** ppv ) static void decavcodecClose( hb_work_object_t * w ) { hb_work_private_t * pv = w->private_data; - +#ifdef USE_HWD + if( pv->dst_frame ) free( pv->dst_frame ); +#endif if ( pv ) { closePrivData( &pv ); @@ -686,11 +713,47 @@ static hb_buffer_t *copy_frame( hb_work_private_t *pv, AVFrame *frame ) } else { - w = pv->job->title->width; - h = pv->job->title->height; + w = pv->job->title->width; + h = pv->job->title->height; } - hb_buffer_t *buf = hb_video_buffer_init( w, h ); +#ifdef USE_HWD + if (pv->dxva2 && pv->job) + { + hb_buffer_t *buf; + int ww, hh; + + buf = hb_video_buffer_init( w, h ); + ww = w; + hh = h; + + if( !pv->dst_frame ) + { + pv->dst_frame = malloc( ww * hh * 3 / 2 ); + } + if( hb_va_extract( pv->dxva2, pv->dst_frame, frame, pv->job->width, pv->job->height, pv->job->title->crop, pv->opencl_scale, pv->job->use_opencl, pv->job->use_decomb, pv->job->use_detelecine ) == HB_WORK_ERROR ) + { + hb_log( "hb_va_Extract failed!!!!!!" ); + } + w = buf->plane[0].stride; + h = buf->plane[0].height; + uint8_t *dst = buf->plane[0].data; + copy_plane( dst, pv->dst_frame, w, ww, h ); + w = buf->plane[1].stride; + h = buf->plane[1].height; + dst = buf->plane[1].data; + copy_plane( dst, pv->dst_frame + ww * hh, w, ww >> 1, h ); + w = buf->plane[2].stride; + h = buf->plane[2].height; + dst = buf->plane[2].data; + copy_plane( dst, pv->dst_frame + ww * hh +( ( ww * hh ) >> 2 ), w, ww >> 1, h ); + return buf; + } + else +#endif + { + hb_buffer_t *buf = hb_video_buffer_init( w, h ); + #ifdef USE_QSV // no need to copy the frame data when decoding with QSV to opaque memory if (pv->qsv.decode && @@ -701,52 +764,90 @@ static hb_buffer_t *copy_frame( hb_work_private_t *pv, AVFrame *frame ) } #endif - uint8_t *dst = buf->data; + uint8_t *dst = buf->data; - if (context->pix_fmt != AV_PIX_FMT_YUV420P || w != context->width || - h != context->height) - { - // have to convert to our internal color space and/or rescale - AVPicture dstpic; - hb_avpicture_fill(&dstpic, buf); - - if (pv->sws_context == NULL || - pv->sws_width != context->width || - pv->sws_height != context->height || - pv->sws_pix_fmt != context->pix_fmt) + if (context->pix_fmt != AV_PIX_FMT_YUV420P || w != context->width || + h != context->height) + { + // have to convert to our internal color space and/or rescale + AVPicture dstpic; + hb_avpicture_fill(&dstpic, buf); + + if (pv->sws_context == NULL || + pv->sws_width != context->width || + pv->sws_height != context->height || + pv->sws_pix_fmt != context->pix_fmt) + { + if (pv->sws_context != NULL) + sws_freeContext(pv->sws_context); + pv->sws_context = hb_sws_get_context(context->width, + context->height, + context->pix_fmt, + w, h, AV_PIX_FMT_YUV420P, + SWS_LANCZOS|SWS_ACCURATE_RND); + pv->sws_width = context->width; + pv->sws_height = context->height; + pv->sws_pix_fmt = context->pix_fmt; + } + sws_scale(pv->sws_context, + (const uint8_t* const *)frame->data, frame->linesize, + 0, context->height, dstpic.data, dstpic.linesize); + } + else { - if (pv->sws_context != NULL) - sws_freeContext(pv->sws_context); - pv->sws_context = hb_sws_get_context(context->width, - context->height, - context->pix_fmt, - w, h, AV_PIX_FMT_YUV420P, - SWS_LANCZOS|SWS_ACCURATE_RND); - pv->sws_width = context->width; - pv->sws_height = context->height; - pv->sws_pix_fmt = context->pix_fmt; + w = buf->plane[0].stride; + h = buf->plane[0].height; + dst = buf->plane[0].data; + copy_plane( dst, frame->data[0], w, frame->linesize[0], h ); + w = buf->plane[1].stride; + h = buf->plane[1].height; + dst = buf->plane[1].data; + copy_plane( dst, frame->data[1], w, frame->linesize[1], h ); + w = buf->plane[2].stride; + h = buf->plane[2].height; + dst = buf->plane[2].data; + copy_plane( dst, frame->data[2], w, frame->linesize[2], h ); } - sws_scale(pv->sws_context, - (const uint8_t* const *)frame->data, frame->linesize, - 0, context->height, dstpic.data, dstpic.linesize); + return buf; + } +} + +#ifdef USE_HWD + +static int get_frame_buf_hwd( AVCodecContext *context, AVFrame *frame ) +{ + + hb_work_private_t *pv = (hb_work_private_t*)context->opaque; + if ( (pv != NULL) && pv->dxva2 ) + { + int result = HB_WORK_ERROR; + hb_work_private_t *pv = (hb_work_private_t*)context->opaque; + result = hb_va_get_frame_buf( pv->dxva2, context, frame ); + if( result == HB_WORK_ERROR ) + return avcodec_default_get_buffer( context, frame ); + return 0; } else + return avcodec_default_get_buffer( context, frame ); +} + +static void hb_ffmpeg_release_frame_buf( struct AVCodecContext *p_context, AVFrame *frame ) +{ + hb_work_private_t *p_dec = (hb_work_private_t*)p_context->opaque; + int i; + if( p_dec->dxva2 ) { - w = buf->plane[0].stride; - h = buf->plane[0].height; - dst = buf->plane[0].data; - copy_plane( dst, frame->data[0], w, frame->linesize[0], h ); - w = buf->plane[1].stride; - h = buf->plane[1].height; - dst = buf->plane[1].data; - copy_plane( dst, frame->data[1], w, frame->linesize[1], h ); - w = buf->plane[2].stride; - h = buf->plane[2].height; - dst = buf->plane[2].data; - copy_plane( dst, frame->data[2], w, frame->linesize[2], h ); + hb_va_release( p_dec->dxva2, frame ); + } + else if( !frame->opaque ) + { + if( frame->type == FF_BUFFER_TYPE_INTERNAL ) + avcodec_default_release_buffer( p_context, frame ); } - return buf; + for( i = 0; i < 4; i++ ) + frame->data[i] = NULL; } +#endif static void log_chapter( hb_work_private_t *pv, int chap_num, int64_t pts ) { @@ -979,16 +1080,27 @@ static int decodeFrame( hb_work_object_t *w, uint8_t *data, int size, int sequen if ( !pv->frame_duration_set ) compute_frame_duration( pv ); + double pts; double frame_dur = pv->duration; if ( frame.repeat_pict ) { frame_dur += frame.repeat_pict * pv->field_duration; } - +#ifdef USE_HWD + if( pv->dxva2 && pv->dxva2->do_job == HB_WORK_OK ) + { + if( avp.pts>0 ) + { + if( pv->dxva2->input_pts[0] != 0 && pv->dxva2->input_pts[1] == 0 ) + frame.pkt_pts = pv->dxva2->input_pts[0]; + else + frame.pkt_pts = pv->dxva2->input_pts[0]<pv->dxva2->input_pts[1] ? pv->dxva2->input_pts[0] : pv->dxva2->input_pts[1]; + } + } +#endif // If there was no pts for this frame, assume constant frame rate // video & estimate the next frame time from the last & duration. - double pts; - if (frame.pkt_pts == AV_NOPTS_VALUE) + if (frame.pkt_pts == AV_NOPTS_VALUE || hb_gui_use_hwd_flag == 1) { pts = pv->pts_next; } @@ -996,6 +1108,7 @@ static int decodeFrame( hb_work_object_t *w, uint8_t *data, int size, int sequen { pts = frame.pkt_pts; } + pv->pts_next = pts + frame_dur; if ( frame.top_field_first ) @@ -1252,6 +1365,25 @@ static int decavcodecvInit( hb_work_object_t * w, hb_job_t * job ) pv->context->workaround_bugs = FF_BUG_AUTODETECT; pv->context->err_recognition = AV_EF_CRCCHECK; pv->context->error_concealment = FF_EC_GUESS_MVS|FF_EC_DEBLOCK; +#ifdef USE_HWD + if ( pv->job && job->use_hwd && hb_use_dxva( pv->title ) ) + { + pv->dxva2 = hb_va_create_dxva2( pv->dxva2, w->codec_param ); + if( pv->dxva2 && pv->dxva2->do_job == HB_WORK_OK ) + { + hb_va_new_dxva2( pv->dxva2, pv->context ); + pv->context->slice_flags |= SLICE_FLAG_ALLOW_FIELD; + pv->context->opaque = pv; + pv->context->get_buffer = get_frame_buf_hwd; + pv->context->release_buffer = hb_ffmpeg_release_frame_buf; + pv->context->get_format = hb_ffmpeg_get_format; + pv->opencl_scale = ( hb_oclscale_t * )malloc( sizeof( hb_oclscale_t ) ); + memset( pv->opencl_scale, 0, sizeof( hb_oclscale_t ) ); + pv->threads = 1; + } + } +#endif + #ifdef USE_QSV if (pv->qsv.decode) @@ -1488,6 +1620,16 @@ static int decavcodecvWork( hb_work_object_t * w, hb_buffer_t ** buf_in, pv->new_chap = in->s.new_chap; pv->chap_time = pts >= 0? pts : pv->pts_next; } +#ifdef USE_HWD + if( pv->dxva2 && pv->dxva2->do_job == HB_WORK_OK ) + { + if( pv->dxva2->input_pts[0] <= pv->dxva2->input_pts[1] ) + pv->dxva2->input_pts[0] = pts; + else if( pv->dxva2->input_pts[0] > pv->dxva2->input_pts[1] ) + pv->dxva2->input_pts[1] = pts; + pv->dxva2->input_dts = dts; + } +#endif decodeVideo( w, in->data, in->size, in->sequence, pts, dts, in->s.frametype ); hb_buffer_close( &in ); *buf_out = link_buf_list( pv ); @@ -1746,7 +1888,6 @@ hb_work_object_t hb_decavcodecv = .info = decavcodecvInfo, .bsinfo = decavcodecvBSInfo }; - static void decodeAudio(hb_audio_t *audio, hb_work_private_t *pv, uint8_t *data, int size, int64_t pts) { diff --git a/libhb/detelecine.c b/libhb/detelecine.c index 91c97e878..2fdc48038 100644 --- a/libhb/detelecine.c +++ b/libhb/detelecine.c @@ -485,7 +485,7 @@ static void pullup_print_aff_and_breaks(struct pullup_context * c, int i; struct pullup_field * f0 = f; const char aff_l[] = "+..", aff_r[] = "..+"; - printf( "\naffinity: " ); + hb_log( "affinity: " ); for( i = 0; i < 4; i++ ) { printf( "%c%d%c", diff --git a/libhb/dxva2api.c b/libhb/dxva2api.c new file mode 100644 index 000000000..004cd681a --- /dev/null +++ b/libhb/dxva2api.c @@ -0,0 +1,38 @@ +/* dxva2api.c + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + */ +#ifdef USE_HWD +#include "dxva2api.h" + +__inline float hb_dx_fixedtofloat( const DXVA2_Fixed32 _fixed_ ) +{ + return (FLOAT)_fixed_.Value + (FLOAT)_fixed_.Fraction / 0x10000; +} + +__inline const DXVA2_Fixed32 hb_dx_fixed32_opaque_alpha() +{ + DXVA2_Fixed32 _fixed_; + _fixed_.Fraction = 0; + _fixed_.Value = 0; + _fixed_.ll = 1; + return _fixed_; +} + + +__inline DXVA2_Fixed32 hb_dx_floattofixed( const float _float_ ) +{ + DXVA2_Fixed32 _fixed_; + _fixed_.Fraction = LOWORD( _float_ * 0x10000 ); + _fixed_.Value = HIWORD( _float_ * 0x10000 ); + return _fixed_; +} +#endif diff --git a/libhb/dxva2api.h b/libhb/dxva2api.h new file mode 100644 index 000000000..915fc0085 --- /dev/null +++ b/libhb/dxva2api.h @@ -0,0 +1,822 @@ +/* dxva2api.h + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + */ + + +#ifndef _DXVA2API_H +#define _DXVA2API_H +#ifdef USE_HWD +#define MINGW_DXVA2API_H_VERSION (2) + +#if __GNUC__ >= 3 +#pragma GCC system_header +#endif + +#include <objbase.h> +#include <d3d9.h> + +/* Define it to allow using nameless struct/union (non C99 compliant) to match + * the official documentation. */ +//#define DXVA2API_USE_BITFIELDS + +/****************STRUCTURES******************/ +#pragma pack(push, 1) + +#define DXVA2API_USE_BITFIELDS + +typedef struct _DXVA2_ExtendedFormat { +#ifdef DXVA2API_USE_BITFIELDS + union { + struct { + UINT SampleFormat : 8; + UINT VideoChromaSubsampling : 4; + UINT NominalRange : 3; + UINT VideoTransferMatrix : 3; + UINT VideoLighting : 4; + UINT VideoPrimaries : 5; + UINT VideoTransferFunction : 5; + }; + UINT value; + }; +#else + UINT value; +#endif +} DXVA2_ExtendedFormat; + +typedef struct _DXVA2_Frequency { + UINT Numerator; + UINT Denominator; +} DXVA2_Frequency; + +typedef struct _DXVA2_VideoDesc { + UINT SampleWidth; + UINT SampleHeight; + DXVA2_ExtendedFormat SampleFormat; + D3DFORMAT Format; + DXVA2_Frequency InputSampleFreq; + DXVA2_Frequency OutputFrameFreq; + UINT UABProtectionLevel; + UINT Reserved; +} DXVA2_VideoDesc; + +typedef struct _DXVA2_ConfigPictureDecode { + GUID guidConfigBitstreamEncryption; + GUID guidConfigMBcontrolEncryption; + GUID guidConfigResidDiffEncryption; + UINT ConfigBitstreamRaw; + UINT ConfigMBcontrolRasterOrder; + UINT ConfigResidDiffHost; + UINT ConfigSpatialResid8; + UINT ConfigResid8Subtraction; + UINT ConfigSpatialHost8or9Clipping; + UINT ConfigSpatialResidInterleaved; + UINT ConfigIntraResidUnsigned; + UINT ConfigResidDiffAccelerator; + UINT ConfigHostInverseScan; + UINT ConfigSpecificIDCT; + UINT Config4GroupedCoefs; + USHORT ConfigMinRenderTargetBuffCount; + USHORT ConfigDecoderSpecific; +} DXVA2_ConfigPictureDecode; + +typedef struct _DXVA2_DecodeBufferDesc { + DWORD CompressedBufferType; + UINT BufferIndex; + UINT DataOffset; + UINT DataSize; + UINT FirstMBaddress; + UINT NumMBsInBuffer; + UINT Width; + UINT Height; + UINT Stride; + UINT ReservedBits; + PVOID pvPVPState; +} DXVA2_DecodeBufferDesc; + +typedef struct _DXVA2_DecodeExtensionData { + UINT Function; + PVOID pPrivateInputData; + UINT PrivateInputDataSize; + PVOID pPrivateOutputData; + UINT PrivateOutputDataSize; +} DXVA2_DecodeExtensionData; + +typedef struct _DXVA2_DecodeExecuteParams { + UINT NumCompBuffers; + DXVA2_DecodeBufferDesc *pCompressedBuffers; + DXVA2_DecodeExtensionData *pExtensionData; +} DXVA2_DecodeExecuteParams; + +enum { + DXVA2_VideoDecoderRenderTarget = 0, + DXVA2_VideoProcessorRenderTarget= 1, + DXVA2_VideoSoftwareRenderTarget = 2 +}; + +enum { + DXVA2_PictureParametersBufferType = 0, + DXVA2_MacroBlockControlBufferType = 1, + DXVA2_ResidualDifferenceBufferType = 2, + DXVA2_DeblockingControlBufferType = 3, + DXVA2_InverseQuantizationMatrixBufferType = 4, + DXVA2_SliceControlBufferType = 5, + DXVA2_BitStreamDateBufferType = 6, + DXVA2_MotionVectorBuffer = 7, + DXVA2_FilmGrainBuffer = 8 +}; + +/* DXVA MPEG-I/II and VC-1 */ +typedef struct _DXVA_PictureParameters { + USHORT wDecodedPictureIndex; + USHORT wDeblockedPictureIndex; + USHORT wForwardRefPictureIndex; + USHORT wBackwardRefPictureIndex; + USHORT wPicWidthInMBminus1; + USHORT wPicHeightInMBminus1; + UCHAR bMacroblockWidthMinus1; + UCHAR bMacroblockHeightMinus1; + UCHAR bBlockWidthMinus1; + UCHAR bBlockHeightMinus1; + UCHAR bBPPminus1; + UCHAR bPicStructure; + UCHAR bSecondField; + UCHAR bPicIntra; + UCHAR bPicBackwardPrediction; + UCHAR bBidirectionalAveragingMode; + UCHAR bMVprecisionAndChromaRelation; + UCHAR bChromaFormat; + UCHAR bPicScanFixed; + UCHAR bPicScanMethod; + UCHAR bPicReadbackRequests; + UCHAR bRcontrol; + UCHAR bPicSpatialResid8; + UCHAR bPicOverflowBlocks; + UCHAR bPicExtrapolation; + UCHAR bPicDeblocked; + UCHAR bPicDeblockConfined; + UCHAR bPic4MVallowed; + UCHAR bPicOBMC; + UCHAR bPicBinPB; + UCHAR bMV_RPS; + UCHAR bReservedBits; + USHORT wBitstreamFcodes; + USHORT wBitstreamPCEelements; + UCHAR bBitstreamConcealmentNeed; + UCHAR bBitstreamConcealmentMethod; +} DXVA_PictureParameters, *LPDXVA_PictureParameters; + +typedef struct _DXVA_QmatrixData { + BYTE bNewQmatrix[4]; + WORD Qmatrix[4][8 * 8]; +} DXVA_QmatrixData, *LPDXVA_QmatrixData; + +typedef struct _DXVA_SliceInfo { + USHORT wHorizontalPosition; + USHORT wVerticalPosition; + UINT dwSliceBitsInBuffer; + UINT dwSliceDataLocation; + UCHAR bStartCodeBitOffset; + UCHAR bReservedBits; + USHORT wMBbitOffset; + USHORT wNumberMBsInSlice; + USHORT wQuantizerScaleCode; + USHORT wBadSliceChopping; +} DXVA_SliceInfo, *LPDXVA_SliceInfo; + +/* DXVA H264 */ +typedef struct { +#ifdef DXVA2API_USE_BITFIELDS + union { + struct { + UCHAR Index7Bits : 7; + UCHAR AssociatedFlag : 1; + }; + UCHAR bPicEntry; + }; +#else + UCHAR bPicEntry; +#endif +} DXVA_PicEntry_H264; + + +typedef struct { + USHORT wFrameWidthInMbsMinus1; + USHORT wFrameHeightInMbsMinus1; + DXVA_PicEntry_H264 CurrPic; + UCHAR num_ref_frames; +#ifdef DXVA2API_USE_BITFIELDS + union { + struct { + USHORT field_pic_flag : 1; + USHORT MbaffFrameFlag : 1; + USHORT residual_colour_transform_flag : 1; + USHORT sp_for_switch_flag : 1; + USHORT chroma_format_idc : 2; + USHORT RefPicFlag : 1; + USHORT constrained_intra_pred_flag : 1; + USHORT weighted_pred_flag : 1; + USHORT weighted_bipred_idc : 2; + USHORT MbsConsecutiveFlag : 1; + USHORT frame_mbs_only_flag : 1; + USHORT transform_8x8_mode_flag : 1; + USHORT MinLumaBipredSize8x8Flag : 1; + USHORT IntraPicFlag : 1; + }; + USHORT wBitFields; + }; +#else + USHORT wBitFields; +#endif + UCHAR bit_depth_luma_minus8; + UCHAR bit_depth_chroma_minus8; + USHORT Reserved16Bits; + UINT StatusReportFeedbackNumber; + DXVA_PicEntry_H264 RefFrameList[16]; + INT CurrFieldOrderCnt[2]; + INT FieldOrderCntList[16][2]; + CHAR pic_init_qs_minus26; + CHAR chroma_qp_index_offset; + CHAR second_chroma_qp_index_offset; + UCHAR ContinuationFlag; + CHAR pic_init_qp_minus26; + UCHAR num_ref_idx_l0_active_minus1; + UCHAR num_ref_idx_l1_active_minus1; + UCHAR Reserved8BitsA; + USHORT FrameNumList[16]; + + UINT UsedForReferenceFlags; + USHORT NonExistingFrameFlags; + USHORT frame_num; + UCHAR log2_max_frame_num_minus4; + UCHAR pic_order_cnt_type; + UCHAR log2_max_pic_order_cnt_lsb_minus4; + UCHAR delta_pic_order_always_zero_flag; + UCHAR direct_8x8_inference_flag; + UCHAR entropy_coding_mode_flag; + UCHAR pic_order_present_flag; + UCHAR num_slice_groups_minus1; + UCHAR slice_group_map_type; + UCHAR deblocking_filter_control_present_flag; + UCHAR redundant_pic_cnt_present_flag; + UCHAR Reserved8BitsB; + USHORT slice_group_change_rate_minus1; + UCHAR SliceGroupMap[810]; +} DXVA_PicParams_H264; + +typedef struct { + UCHAR bScalingLists4x4[6][16]; + UCHAR bScalingLists8x8[2][64]; +} DXVA_Qmatrix_H264; + + +typedef struct { + UINT BSNALunitDataLocation; + UINT SliceBytesInBuffer; + USHORT wBadSliceChopping; + USHORT first_mb_in_slice; + USHORT NumMbsForSlice; + USHORT BitOffsetToSliceData; + UCHAR slice_type; + UCHAR luma_log2_weight_denom; + UCHAR chroma_log2_weight_denom; + + UCHAR num_ref_idx_l0_active_minus1; + UCHAR num_ref_idx_l1_active_minus1; + CHAR slice_alpha_c0_offset_div2; + CHAR slice_beta_offset_div2; + UCHAR Reserved8Bits; + DXVA_PicEntry_H264 RefPicList[2][32]; + SHORT Weights[2][32][3][2]; + CHAR slice_qs_delta; + CHAR slice_qp_delta; + UCHAR redundant_pic_cnt; + UCHAR direct_spatial_mv_pred_flag; + UCHAR cabac_init_idc; + UCHAR disable_deblocking_filter_idc; + USHORT slice_id; +} DXVA_Slice_H264_Long; + +typedef struct { + UINT BSNALunitDataLocation; + UINT SliceBytesInBuffer; + USHORT wBadSliceChopping; +} DXVA_Slice_H264_Short; + +typedef struct { + USHORT wFrameWidthInMbsMinus1; + USHORT wFrameHeightInMbsMinus1; + DXVA_PicEntry_H264 InPic; + DXVA_PicEntry_H264 OutPic; + USHORT PicOrderCnt_offset; + INT CurrPicOrderCnt; + UINT StatusReportFeedbackNumber; + UCHAR model_id; + UCHAR separate_colour_description_present_flag; + UCHAR film_grain_bit_depth_luma_minus8; + UCHAR film_grain_bit_depth_chroma_minus8; + UCHAR film_grain_full_range_flag; + UCHAR film_grain_colour_primaries; + UCHAR film_grain_transfer_characteristics; + UCHAR film_grain_matrix_coefficients; + UCHAR blending_mode_id; + UCHAR log2_scale_factor; + UCHAR comp_model_present_flag[4]; + UCHAR num_intensity_intervals_minus1[4]; + UCHAR num_model_values_minus1[4]; + UCHAR intensity_interval_lower_bound[3][16]; + UCHAR intensity_interval_upper_bound[3][16]; + SHORT comp_model_value[3][16][8]; +} DXVA_FilmGrainChar_H264; + +typedef struct { + union { + struct { + USHORT Fraction; + SHORT Value; + }; + LONG ll; + }; +}DXVA2_Fixed32; + +typedef struct { + UCHAR Cr; + UCHAR Cb; + UCHAR Y; + UCHAR Alpha; +}DXVA2_AYUVSample8; + +typedef struct { + USHORT Cr; + USHORT Cb; + USHORT Y; + USHORT Alpha; +}DXVA2_AYUVSample16; + +typedef struct { + DXVA2_Fixed32 MinValue; + DXVA2_Fixed32 MaxValue; + DXVA2_Fixed32 DefaultValue; + DXVA2_Fixed32 StepSize; +}DXVA2_ValueRange; + +typedef struct { + DXVA2_Fixed32 Brightness; + DXVA2_Fixed32 Contrast; + DXVA2_Fixed32 Hue; + DXVA2_Fixed32 Saturation; +}DXVA2_ProcAmpValues; + +typedef struct { + DXVA2_Fixed32 Level; + DXVA2_Fixed32 Threshold; + DXVA2_Fixed32 Radius; +}DXVA2_FilterValues; + +typedef struct { + UINT DeviceCaps; + D3DPOOL InputPool; + UINT NumForwardRefSamples; + UINT NumBackwardRefSamples; + UINT Reserved; + UINT DeinterlaceTechnology; + UINT ProcAmpControlCaps; + UINT VideoProcessorOperations; + UINT NoiseFilterTechnology; + UINT DetailFilterTechnology; +}DXVA2_VideoProcessorCaps; + +#ifndef _REFERENCE_TIME_ +#define _REFERENCE_TIME_ +typedef long long int64_t; +typedef int64_t REFERENCE_TIME; +#endif + +typedef struct { + REFERENCE_TIME Start; + REFERENCE_TIME End; + DXVA2_ExtendedFormat SampleFormat; + IDirect3DSurface9 *SrcSurface; + RECT SrcRect; + RECT DstRect; + DXVA2_AYUVSample8 Pal[16]; + DXVA2_Fixed32 PlanarAlpha; + DWORD SampleData; +}DXVA2_VideoSample; + + +typedef struct { + REFERENCE_TIME TargetFrame; + RECT TargetRect; + SIZE ConstrictionSize; + UINT StreamingFlags; + DXVA2_AYUVSample16 BackgroundColor; + DXVA2_ExtendedFormat DestFormat; + DXVA2_ProcAmpValues ProcAmpValues; + DXVA2_Fixed32 Alpha; + DXVA2_FilterValues NoiseFilterLuma; + DXVA2_FilterValues NoiseFilterChroma; + DXVA2_FilterValues DetailFilterLuma; + DXVA2_FilterValues DetailFilterChroma; + DWORD DestData; +} DXVA2_VideoProcessBltParams; + +#pragma pack(pop) + +/*************INTERFACES************/ +#ifdef __cplusplus +extern "C" { +#endif +#define _COM_interface struct +typedef _COM_interface IDirectXVideoDecoderService IDirectXVideoDecoderService; +typedef _COM_interface IDirectXVideoDecoder IDirectXVideoDecoder; + +#undef INTERFACE +#define INTERFACE IDirectXVideoDecoder +DECLARE_INTERFACE_( IDirectXVideoDecoder, IUnknown ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( GetVideoDecoderService ) ( THIS_ IDirectXVideoDecoderService** ) PURE; + STDMETHOD( GetCreationParameters ) ( THIS_ GUID*, DXVA2_VideoDesc*, DXVA2_ConfigPictureDecode*, IDirect3DSurface9***, UINT* ) PURE; + STDMETHOD( GetBuffer ) ( THIS_ UINT, void**, UINT* ) PURE; + STDMETHOD( ReleaseBuffer ) ( THIS_ UINT ) PURE; + STDMETHOD( BeginFrame ) ( THIS_ IDirect3DSurface9 *, void* ) PURE; + STDMETHOD( EndFrame ) ( THIS_ HANDLE * ) PURE; + STDMETHOD( Execute ) ( THIS_ const DXVA2_DecodeExecuteParams* ) PURE; + + +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoDecoder_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirectXVideoDecoder_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirectXVideoDecoder_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirectXVideoDecoder_BeginFrame( p, a, b ) (p)->lpVtbl->BeginFrame( p, a, b ) +#define IDirectXVideoDecoder_EndFrame( p, a ) (p)->lpVtbl->EndFrame( p, a ) +#define IDirectXVideoDecoder_Execute( p, a ) (p)->lpVtbl->Execute( p, a ) +#define IDirectXVideoDecoder_GetBuffer( p, a, b, c ) (p)->lpVtbl->GetBuffer( p, a, b, c ) +#define IDirectXVideoDecoder_GetCreationParameters( p, a, b, c, d, e ) (p)->lpVtbl->GetCreationParameters( p, a, b, c, d, e ) +#define IDirectXVideoDecoder_GetVideoDecoderService( p, a ) (p)->lpVtbl->GetVideoDecoderService( p, a ) +#define IDirectXVideoDecoder_ReleaseBuffer( p, a ) (p)->lpVtbl->ReleaseBuffer( p, a ) +#else +#define IDirectXVideoDecoder_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirectXVideoDecoder_AddRef( p ) (p)->AddRef() +#define IDirectXVideoDecoder_Release( p ) (p)->Release() +#define IDirectXVideoDecoder_BeginFrame( p, a, b ) (p)->BeginFrame( a, b ) +#define IDirectXVideoDecoder_EndFrame( p, a ) (p)->EndFrame( a ) +#define IDirectXVideoDecoder_Execute( p, a ) (p)->Execute( a ) +#define IDirectXVideoDecoder_GetBuffer( p, a, b, c ) (p)->GetBuffer( a, b, c ) +#define IDirectXVideoDecoder_GetCreationParameters( p, a, b, c, d, e ) (p)->GetCreationParameters( a, b, c, d, e ) +#define IDirectXVideoDecoder_GetVideoDecoderService( p, a ) (p)->GetVideoDecoderService( a ) +#define IDirectXVideoDecoder_ReleaseBuffer( p, a ) (p)->ReleaseBuffer( a ) +#endif + +#undef INTERFACE +#define INTERFACE IDirectXVideoAccelerationService +DECLARE_INTERFACE_( IDirectXVideoAccelerationService, IUnknown ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( CreateSurface ) ( THIS_ UINT, UINT, UINT, D3DFORMAT, D3DPOOL, DWORD, DWORD, IDirect3DSurface9**, HANDLE* ) PURE; + +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoAccelerationService_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirectXVideoAccelerationService_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirectXVideoAccelerationService_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirectXVideoAccelerationService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->lpVtbl->CreateSurface( p, a, b, c, d, e, f, g, h, i ) +#else +#define IDirectXVideoAccelerationService_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirectXVideoAccelerationService_AddRef( p ) (p)->AddRef() +#define IDirectXVideoAccelerationService_Release( p ) (p)->Release() +#define IDirectXVideoAccelerationService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->CreateSurface( a, b, c, d, e, f, g, h, i ) +#endif + +#undef INTERFACE +#define INTERFACE IDirectXVideoDecoderService +DECLARE_INTERFACE_( IDirectXVideoDecoderService, IDirectXVideoAccelerationService ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( CreateSurface ) ( THIS_ UINT, UINT, UINT, D3DFORMAT, D3DPOOL, DWORD, DWORD, IDirect3DSurface9**, HANDLE* ) PURE; + STDMETHOD( GetDecoderDeviceGuids ) ( THIS_ UINT*, GUID ** ) PURE; + STDMETHOD( GetDecoderRenderTargets ) ( THIS_ REFGUID, UINT*, D3DFORMAT** ) PURE; + STDMETHOD( GetDecoderConfigurations ) ( THIS_ REFGUID, const DXVA2_VideoDesc*, IUnknown*, UINT*, DXVA2_ConfigPictureDecode** ) PURE; + STDMETHOD( CreateVideoDecoder ) ( THIS_ REFGUID, const DXVA2_VideoDesc*, DXVA2_ConfigPictureDecode*, IDirect3DSurface9**, UINT, IDirectXVideoDecoder** ) PURE; +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoDecoderService_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirectXVideoDecoderService_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirectXVideoDecoderService_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirectXVideoDecoderService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->lpVtbl->CreateSurface( p, a, b, c, d, e, f, g, h, i ) +#define IDirectXVideoDecoderService_CreateVideoDecoder( p, a, b, c, d, e, f ) (p)->lpVtbl->CreateVideoDecoder( p, a, b, c, d, e, f ) +#define IDirectXVideoDecoderService_GetDecoderConfigurations( p, a, b, c, d, e ) (p)->lpVtbl->GetDecoderConfigurations( p, a, b, c, d, e ) +#define IDirectXVideoDecoderService_GetDecoderDeviceGuids( p, a, b ) (p)->lpVtbl->GetDecoderDeviceGuids( p, a, b ) +#define IDirectXVideoDecoderService_GetDecoderRenderTargets( p, a, b, c ) (p)->lpVtbl->GetDecoderRenderTargets( p, a, b, c ) +#else +#define IDirectXVideoDecoderService_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirectXVideoDecoderService_AddRef( p ) (p)->AddRef() +#define IDirectXVideoDecoderService_Release( p ) (p)->Release() +#define IDirectXVideoDecoderService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->CreateSurface( a, b, c, d, e, f, g, h, i ) +#define IDirectXVideoDecoderService_CreateVideoDecoder( p, a, b, c, d, e, f ) (p)->CreateVideoDecoder( a, b, c, d, e, f ) +#define IDirectXVideoDecoderService_GetDecoderConfigurations( p, a, b, c, d, e ) (p)->GetDecoderConfigurations( a, b, c, d, e ) +#define IDirectXVideoDecoderService_GetDecoderDeviceGuids( p, a, b ) (p)->GetDecoderDeviceGuids( a, b ) +#define IDirectXVideoDecoderService_GetDecoderRenderTargets( p, a, b, c ) (p)->GetDecoderRenderTargets( a, b, c ) +#endif + +#undef INTERFACE +#define INTERFACE IDirect3DDeviceManager9 +DECLARE_INTERFACE_( IDirect3DDeviceManager9, IUnknown ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( ResetDevice ) ( THIS_ IDirect3DDevice9*, UINT ) PURE; + STDMETHOD( OpenDeviceHandle ) ( THIS_ HANDLE* ) PURE; + STDMETHOD( CloseDeviceHandle ) ( THIS_ HANDLE ) PURE; + STDMETHOD( TestDevice ) ( THIS_ HANDLE ) PURE; + STDMETHOD( LockDevice ) ( THIS_ HANDLE, IDirect3DDevice9**, BOOL ) PURE; + STDMETHOD( UnlockDevice ) ( THIS_ HANDLE, BOOL ) PURE; + STDMETHOD( GetVideoService ) ( THIS_ HANDLE, REFIID, void** ) PURE; +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirect3DDeviceManager9_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirect3DDeviceManager9_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirect3DDeviceManager9_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirect3DDeviceManager9_ResetDevice( p, a, b ) (p)->lpVtbl->ResetDevice( p, a, b ) +#define IDirect3DDeviceManager9_OpenDeviceHandle( p, a ) (p)->lpVtbl->OpenDeviceHandle( p, a ) +#define IDirect3DDeviceManager9_CloseDeviceHandle( p, a ) (p)->lpVtbl->CloseDeviceHandle( p, a ) +#define IDirect3DDeviceManager9_TestDevice( p, a ) (p)->lpVtbl->TestDevice( p, a ) +#define IDirect3DDeviceManager9_LockDevice( p, a, b, c ) (p)->lpVtbl->LockDevice( p, a, b, c ) +#define IDirect3DDeviceManager9_UnlockDevice( p, a, b ) (p)->lpVtbl->UnlockDevice( p, a, b ) +#define IDirect3DDeviceManager9_GetVideoService( p, a, b, c ) (p)->lpVtbl->GetVideoService( p, a, b, c ) +#else +#define IDirect3DDeviceManager9_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirect3DDeviceManager9_AddRef( p ) (p)->AddRef() +#define IDirect3DDeviceManager9_Release( p ) (p)->Release() +#define IDirect3DDeviceManager9_ResetDevice( p, a, b ) (p)->ResetDevice( a, b ) +#define IDirect3DDeviceManager9_OpenDeviceHandle( p, a ) (p)->OpenDeviceHandle( a ) +#define IDirect3DDeviceManager9_CloseDeviceHandle( p, a ) (p)->CloseDeviceHandle( a ) +#define IDirect3DDeviceManager9_TestDevice( p, a ) (p)->TestDevice( a ) +#define IDirect3DDeviceManager9_LockDevice( p, a, b, c ) (p)->LockDevice( a, b, c ) +#define IDirect3DDeviceManager9_UnlockDevice( p, a, b ) (p)->UnlockDevice( a, b ) +#define IDirect3DDeviceManager9_GetVideoService( p, a, b, c ) (p)->GetVideoService( a, b, c ) +#endif + +typedef _COM_interface IDirectXVideoProcessorService IDirectXVideoProcessorService; +typedef _COM_interface IDirectXVideoProcessor IDirectXVideoProcessor; + +#undef INTERFACE +#define INTERFACE IDirectXVideoProcessor +DECLARE_INTERFACE_( IDirectXVideoProcessor, IUnknown ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( GetVideoProcessorService ) ( THIS_ IDirectXVideoProcessorService** ) PURE; + STDMETHOD( GetCreationParameters ) ( THIS_ GUID*, DXVA2_VideoDesc*, D3DFORMAT*, UINT* ) PURE; + STDMETHOD( GetVideoProcessorCaps ) ( THIS_ DXVA2_VideoProcessorCaps* ) PURE; + STDMETHOD( GetProcAmpRange ) ( THIS_ UINT, DXVA2_ValueRange* ) PURE; + STDMETHOD( GetFilterPropertyRange ) ( THIS_ UINT, DXVA2_ValueRange* ) PURE; + STDMETHOD( VideoProcessBlt ) ( THIS_ IDirect3DSurface9*, DXVA2_VideoProcessBltParams*, DXVA2_VideoSample*, UINT, HANDLE* ) PURE; +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoProcessor_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirectXVideoProcessor_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirectXVideoProcessor_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirectXVideoProcessor_GetVideoProcessorService( p, a ) (p)->lpVtbl->GetVideoProcessorService( p, a ) +#define IDirectXVideoProcessor_GetCreationParameters( p, a, b, c, d ) (p)->lpVtbl->GetCreationParameters( p, a, b, c, d ) +#define IDirectXVideoProcessor_GetVideoProcessorCaps( p, a ) (p)->lpVtbl->GetVideoProcessorCaps( p, a ) +#define IDirectXVideoProcessor_GetProcAmpRange( p, a, b ) (p)->lpVtbl->GetProcAmpRange( p, a, b ) +#define IDirectXVideoProcessor_GetFilterPropertyRange( p, a, b ) (p)->lpVtbl->GetFilterPropertyRange( p, a, b ) +#define IDirectXVideoProcessor_VideoProcessBlt( p, a, b, c, d, e ) (p)->lpVtbl->VideoProcessBlt( p, a, b, c, d, e ) +#else +#define IDirectXVideoProcessor_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirectXVideoProcessor_AddRef( p ) (p)->AddRef() +#define IDirectXVideoProcessor_Release( p ) (p)->Release() +#define IDirectXVideoProcessor_GetVideoProcessorService( p, a ) (p)->GetVideoProcessorService( a ) +#define IDirectXVideoProcessor_GetCreationParameters( p, a, b, c, d ) (p)->GetCreationParameters( a, b, c, d ) +#define IDirectXVideoProcessor_GetVideoProcessorCaps( p, a ) (p)->GetVideoProcessorCaps( a ) +#define IDirectXVideoProcessor_GetProcAmpRange( p, a, b ) (p)->GetProcAmpRange( a, b ) +#define IDirectXVideoProcessor_GetFilterPropertyRange( p, a, b ) (p)->GetFilterPropertyRange( a, b ) +#define IDirectXVideoProcessor_VideoProcessBlt( p, a, b, c, d, e ) (p)->VideoProcessBlt( a, b, c, d, e ) +#endif + + +#undef INTERFACE +#define INTERFACE IDirectXVideoProcessorService +DECLARE_INTERFACE_( IDirectXVideoProcessorService, IDirectXVideoAccelerationService ) +{ + STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE; + STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE; + STDMETHOD_( ULONG, Release ) ( THIS ) PURE; + STDMETHOD( CreateSurface ) ( THIS_ UINT, UINT, UINT, D3DFORMAT, D3DPOOL, DWORD, DWORD, IDirect3DSurface9**, HANDLE* ) PURE; + STDMETHOD( RegisterVideoProcessorSoftwareDevice ) ( THIS_ void* ) PURE; + STDMETHOD( GetVideoProcessorDeviceGuids ) ( THIS_ DXVA2_VideoDesc*, UINT, GUID** ) PURE; + STDMETHOD( GetVideoProcessorRenderTargets ) ( THIS_ REFGUID, DXVA2_VideoDesc*, UINT*, D3DFORMAT** ) PURE; + STDMETHOD( GetVideoProcessorSubStreamFormats ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT*, D3DFORMAT** ) PURE; + STDMETHOD( GetVideoProcessorCaps ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, DXVA2_VideoProcessorCaps* ) PURE; + STDMETHOD( GetProcAmpRange ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT, DXVA2_ValueRange* ) PURE; + STDMETHOD( GetFilterPropertyRange ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT, DXVA2_ValueRange* ) PURE; + STDMETHOD( CreateVideoProcessor ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT, IDirectXVideoProcessor** ) PURE; +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoProcessorService_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b ) +#define IDirectXVideoProcessorService_AddRef( p ) (p)->lpVtbl->AddRef( p ) +#define IDirectXVideoProcessorService_Release( p ) (p)->lpVtbl->Release( p ) +#define IDirectXVideoProcessorService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->lpVtbl->CreateSurface( p, a, b, c, d, e, f, g, h, i ) +#define IDirectXVideoProcessorService_RegisterVideoProcessorSoftwareDevice( p, a ) (p)->lpVtbl->RegisterVideoProcessorSoftwareDevice( p, a ) +#define IDirectXVideoProcessorService_GetVideoProcessorDeviceGuids( p, a, b, c ) (p)->lpVtbl->GetVideoProcessorDeviceGuids( p, a, b, c ) +#define IDirectXVideoProcessorService_GetVideoProcessorRenderTargets( p, a, b, c, d ) (p)->lpVtbl->GetVideoProcessorRenderTargets( p, a, b, c, d ) +#define IDirectXVideoProcessorService_GetVideoProcessorSubStreamFormats( p, a, b, c, d, e ) (p)->lpVtbl->GetVideoProcessorSubStreamFormats( p, a, b, c, d, e ) +#define IDirectXVideoProcessorService_GetVideoProcessorCaps( p, a, b, c, d ) (p)->lpVtbl->GetVideoProcessorCaps( p, a, b, c, d ) +#define IDirectXVideoProcessorService_GetProcAmpRange( p, a, b, c, d, e ) (p)->lpVtbl->GetProcAmpRange( p, a, b, c, d, e ) +#define IDirectXVideoProcessorService_GetFilterPropertyRange( p, a, b, c, d, e ) (p)->lpVtbl->GetFilterPropertyRange( p, a, b, c, d, e ) +#define IDirectXVideoProcessorService_CreateVideoProcessor( p, a, b, c, d, e ) (p)->lpVtbl->CreateVideoProcessor( p, a, b, c, d, e ) +#else +#define IDirectXVideoProcessorService_QueryInterface( p, a, b ) (p)->QueryInterface( a, b ) +#define IDirectXVideoProcessorService_AddRef( p ) (p)->AddRef() +#define IDirectXVideoProcessorService_Release( p ) (p)->Release() +#define IDirectXVideoProcessorService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->CreateSurface( a, b, c, d, e, f, g, h, i ) +#define IDirectXVideoProcessorService_RegisterVideoProcessorSoftwareDevice( p, a ) (p)->RegisterVideoProcessorSoftwareDevice( a ) +#define IDirectXVideoProcessorService_GetVideoProcessorDeviceGuids( p, a, b, c ) (p)->GetVideoProcessorDeviceGuids( a, b, c ) +#define IDirectXVideoProcessorService_GetVideoProcessorRenderTargets( p, a, b, c, d ) (p)->GetVideoProcessorRenderTargets( a, b, c, d ) +#define IDirectXVideoProcessorService_GetVideoProcessorSubStreamFormats( p, a, b, c, d, e ) (p)->GetVideoProcessorSubStreamFormats( a, b, c, d, e ) +#define IDirectXVideoProcessorService_GetVideoProcessorCaps( p, a, b, c, d ) (p)->GetVideoProcessorCaps( a, b, c, d ) +#define IDirectXVideoProcessorService_GetProcAmpRange( p, a, b, c, d, e ) (p)->GetProcAmpRange( a, b, c, d, e ) +#define IDirectXVideoProcessorService_GetFilterPropertyRange( p, a, b, c, d, e ) (p)->GetFilterPropertyRange( a, b, c, d, e ) +#define IDirectXVideoProcessorService_CreateVideoProcessor( p, a, b, c, d, e ) (p)->CreateVideoProcessor( a, b, c, d, e ) +#endif + + +/***************************************************************************************************** +************************DXVA Video Processor******************************************************** +*******************************************************************************************************/ + + + +/*#undef INTERFACE +#define INTERFACE IDirectXVideoService +DECLARE_INTERFACE_(IDirectXVideoService,IUnknown) +{ + STDMETHOD(DXVA2CreateVideoService)(IDirect3DDevice9*, REFIID, void**) PURE; +}; + +#if !defined(__cplusplus) || defined(CINTERFACE) +#define IDirectXVideoService_DXVA2CreateVideoService(a,b,c) DXVA2CreateVideoService(a,b,c) +#else +#define IDirectXVideoService_DXVA2CreateVideoService(a,b,c) DXVA2CreateVideoService(a,b,c) +#endif*/ + + +#ifdef __cplusplus +}; +#endif + +#ifdef __cplusplus +extern "C" HRESULT WINAPI DXVA2CreateVideoService( IDirect3DDevice9 *, + REFIID riid, + void **ppService ); +#else +extern HRESULT WINAPI DXVA2CreateVideoService( IDirect3DDevice9 *, + REFIID riid, + void **ppService ); +#endif + +typedef +enum _DXVA2_VideoChromaSubSampling +{ DXVA2_VideoChromaSubsamplingMask = 0xf, + DXVA2_VideoChromaSubsampling_Unknown = 0, + DXVA2_VideoChromaSubsampling_ProgressiveChroma = 0x8, + DXVA2_VideoChromaSubsampling_Horizontally_Cosited = 0x4, + DXVA2_VideoChromaSubsampling_Vertically_Cosited = 0x2, + DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes = 0x1, + DXVA2_VideoChromaSubsampling_MPEG2 = ( DXVA2_VideoChromaSubsampling_Horizontally_Cosited | DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes ), + DXVA2_VideoChromaSubsampling_MPEG1 = DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes, + DXVA2_VideoChromaSubsampling_DV_PAL = ( DXVA2_VideoChromaSubsampling_Horizontally_Cosited | DXVA2_VideoChromaSubsampling_Vertically_Cosited ), + DXVA2_VideoChromaSubsampling_Cosited = ( ( DXVA2_VideoChromaSubsampling_Horizontally_Cosited | DXVA2_VideoChromaSubsampling_Vertically_Cosited ) | DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes )} DXVA2_VideoChromaSubSampling; + +typedef +enum _DXVA2_NominalRange +{ DXVA2_NominalRangeMask = 0x7, + DXVA2_NominalRange_Unknown = 0, + DXVA2_NominalRange_Normal = 1, + DXVA2_NominalRange_Wide = 2, + DXVA2_NominalRange_0_255 = 1, + DXVA2_NominalRange_16_235 = 2, + DXVA2_NominalRange_48_208 = 3} DXVA2_NominalRange; + +typedef +enum _DXVA2_VideoLighting +{ DXVA2_VideoLightingMask = 0xf, + DXVA2_VideoLighting_Unknown = 0, + DXVA2_VideoLighting_bright = 1, + DXVA2_VideoLighting_office = 2, + DXVA2_VideoLighting_dim = 3, + DXVA2_VideoLighting_dark = 4} DXVA2_VideoLighting; + +typedef +enum _DXVA2_VideoPrimaries +{ DXVA2_VideoPrimariesMask = 0x1f, + DXVA2_VideoPrimaries_Unknown = 0, + DXVA2_VideoPrimaries_reserved = 1, + DXVA2_VideoPrimaries_BT709 = 2, + DXVA2_VideoPrimaries_BT470_2_SysM = 3, + DXVA2_VideoPrimaries_BT470_2_SysBG = 4, + DXVA2_VideoPrimaries_SMPTE170M = 5, + DXVA2_VideoPrimaries_SMPTE240M = 6, + DXVA2_VideoPrimaries_EBU3213 = 7, + DXVA2_VideoPrimaries_SMPTE_C = 8} DXVA2_VideoPrimaries; + +typedef +enum _DXVA2_VideoTransferFunction +{ DXVA2_VideoTransFuncMask = 0x1f, + DXVA2_VideoTransFunc_Unknown = 0, + DXVA2_VideoTransFunc_10 = 1, + DXVA2_VideoTransFunc_18 = 2, + DXVA2_VideoTransFunc_20 = 3, + DXVA2_VideoTransFunc_22 = 4, + DXVA2_VideoTransFunc_709 = 5, + DXVA2_VideoTransFunc_240M = 6, + DXVA2_VideoTransFunc_sRGB = 7, + DXVA2_VideoTransFunc_28 = 8} DXVA2_VideoTransferFunction; + +typedef +enum _DXVA2_SampleFormat +{ DXVA2_SampleFormatMask = 0xff, + DXVA2_SampleUnknown = 0, + DXVA2_SampleProgressiveFrame = 2, + DXVA2_SampleFieldInterleavedEvenFirst = 3, + DXVA2_SampleFieldInterleavedOddFirst = 4, + DXVA2_SampleFieldSingleEven = 5, + DXVA2_SampleFieldSingleOdd = 6, + DXVA2_SampleSubStream = 7} DXVA2_SampleFormat; + +typedef +enum _DXVA2_VideoTransferMatrix +{ DXVA2_VideoTransferMatrixMask = 0x7, + DXVA2_VideoTransferMatrix_Unknown = 0, + DXVA2_VideoTransferMatrix_BT709 = 1, + DXVA2_VideoTransferMatrix_BT601 = 2, + DXVA2_VideoTransferMatrix_SMPTE240M = 3} DXVA2_VideoTransferMatrix; + +enum __MIDL___MIDL_itf_dxva2api_0000_0000_0004 +{ DXVA2_NoiseFilterLumaLevel = 1, + DXVA2_NoiseFilterLumaThreshold = 2, + DXVA2_NoiseFilterLumaRadius = 3, + DXVA2_NoiseFilterChromaLevel = 4, + DXVA2_NoiseFilterChromaThreshold = 5, + DXVA2_NoiseFilterChromaRadius = 6, + DXVA2_DetailFilterLumaLevel = 7, + DXVA2_DetailFilterLumaThreshold = 8, + DXVA2_DetailFilterLumaRadius = 9, + DXVA2_DetailFilterChromaLevel = 10, + DXVA2_DetailFilterChromaThreshold = 11, + DXVA2_DetailFilterChromaRadius = 12}; + +enum __MIDL___MIDL_itf_dxva2api_0000_0000_0008 +{ DXVA2_VideoProcess_None = 0, + DXVA2_VideoProcess_YUV2RGB = 0x1, + DXVA2_VideoProcess_StretchX = 0x2, + DXVA2_VideoProcess_StretchY = 0x4, + DXVA2_VideoProcess_AlphaBlend = 0x8, + DXVA2_VideoProcess_SubRects = 0x10, + DXVA2_VideoProcess_SubStreams = 0x20, + DXVA2_VideoProcess_SubStreamsExtended = 0x40, + DXVA2_VideoProcess_YUV2RGBExtended = 0x80, + DXVA2_VideoProcess_AlphaBlendExtended = 0x100, + DXVA2_VideoProcess_Constriction = 0x200, + DXVA2_VideoProcess_NoiseFilter = 0x400, + DXVA2_VideoProcess_DetailFilter = 0x800, + DXVA2_VideoProcess_PlanarAlpha = 0x1000, + DXVA2_VideoProcess_LinearScaling = 0x2000, + DXVA2_VideoProcess_GammaCompensated = 0x4000, + DXVA2_VideoProcess_MaintainsOriginalFieldData = 0x8000, + DXVA2_VideoProcess_Mask = 0xffff}; + + + +__inline float hb_dx_fixedtofloat( const DXVA2_Fixed32 _fixed_ ); + +__inline const DXVA2_Fixed32 hb_dx_fixed32_opaque_alpha(); + +__inline DXVA2_Fixed32 hb_dx_floattofixed( const float _float_ ); +#endif +#endif //_DXVA2API_H diff --git a/libhb/fifo.c b/libhb/fifo.c index be69616f1..bd1a45e40 100644 --- a/libhb/fifo.c +++ b/libhb/fifo.c @@ -242,7 +242,14 @@ void hb_buffer_pool_free( void ) if( b->data ) { freed += b->alloc; - free( b->data ); +#ifdef USE_OPENCL + if (b->cl.buffer != NULL) { + if (hb_cl_free_mapped_buffer(b->cl.buffer, b->data) == 0) + hb_log("bad free: %.16x -> buffer %.16x map %.16x", b, b->cl.buffer, b->data); + } + else +#endif + free( b->data ); } free( b ); count++; @@ -273,7 +280,7 @@ static hb_fifo_t *size_to_pool( int size ) return NULL; } -hb_buffer_t * hb_buffer_init( int size ) +hb_buffer_t * hb_buffer_init_internal( int size , int needsMapped ) { hb_buffer_t * b; // Certain libraries (hrm ffmpeg) expect buffers passed to them to @@ -288,6 +295,18 @@ hb_buffer_t * hb_buffer_init( int size ) { b = hb_fifo_get( buffer_pool ); +#ifdef USE_OPENCL + if (b && (needsMapped != 0) && (b->cl.buffer == NULL)) + { + // We need a mapped OpenCL buffer and that is not what we got out of the pool. + // Ditch it. It will get replaced with what we need. + if (b->data) + free(b->data); + free(b); + b = NULL; + } +#endif + if( b ) { /* @@ -295,6 +314,12 @@ hb_buffer_t * hb_buffer_init( int size ) * didn't have to do this. */ uint8_t *data = b->data; +#ifdef USE_OPENCL + cl_mem buffer = b->cl.buffer; + cl_event last_event = b->cl.last_event; + int loc = b->cl.buffer_location; +#endif + memset( b, 0, sizeof(hb_buffer_t) ); b->alloc = buffer_pool->buffer_size; b->size = size; @@ -302,6 +327,11 @@ hb_buffer_t * hb_buffer_init( int size ) b->s.start = -1; b->s.stop = -1; b->s.renderOffset = -1; +#ifdef USE_OPENCL + b->cl.buffer = buffer; + b->cl.last_event = last_event; + b->cl.buffer_location = loc; +#endif return( b ); } } @@ -320,6 +350,20 @@ hb_buffer_t * hb_buffer_init( int size ) if (size) { +#ifdef USE_OPENCL + b->cl.last_event = NULL; + b->cl.buffer_location = HOST; + + if (needsMapped != 0) + { + int status; + status = hb_cl_create_mapped_buffer(&b->cl.buffer, &b->data, b->alloc); + //hb_log("buf: %.16x -> buffer %.16x map %.16x size %d", b, b->cl.buffer, b->data, size); + } + else { + b->cl.buffer = NULL; +#endif + #if defined( SYS_DARWIN ) || defined( SYS_FREEBSD ) || defined( SYS_MINGW ) b->data = malloc( b->alloc ); #elif defined( SYS_CYGWIN ) @@ -328,6 +372,10 @@ hb_buffer_t * hb_buffer_init( int size ) #else b->data = memalign( 16, b->alloc ); #endif +#ifdef USE_OPENCL + } +#endif + if( !b->data ) { hb_log( "out of memory" ); @@ -344,6 +392,11 @@ hb_buffer_t * hb_buffer_init( int size ) return b; } +hb_buffer_t * hb_buffer_init( int size ) +{ + return hb_buffer_init_internal(size, 0); +} + void hb_buffer_realloc( hb_buffer_t * b, int size ) { if ( size > b->alloc || b->data == NULL ) @@ -361,6 +414,7 @@ void hb_buffer_realloc( hb_buffer_t * b, int size ) void hb_buffer_reduce( hb_buffer_t * b, int size ) { + if ( size < b->alloc / 8 || b->data == NULL ) { hb_buffer_t * tmp = hb_buffer_init( size ); @@ -374,6 +428,7 @@ void hb_buffer_reduce( hb_buffer_t * b, int size ) hb_buffer_t * hb_buffer_dup( const hb_buffer_t * src ) { + hb_buffer_t * buf; if ( src == NULL ) @@ -470,8 +525,11 @@ hb_buffer_t * hb_frame_buffer_init( int pix_fmt, int width, int height ) hb_image_height_stride( pix_fmt, height, p ); } } - +#ifdef USE_OPENCL + buf = hb_buffer_init_internal( size , hb_use_buffers() ); +#else buf = hb_buffer_init( size ); +#endif if( buf == NULL ) return NULL; @@ -524,12 +582,22 @@ void hb_buffer_swap_copy( hb_buffer_t *src, hb_buffer_t *dst ) uint8_t *data = dst->data; int size = dst->size; int alloc = dst->alloc; +#ifdef USE_OPENCL + cl_mem buffer = dst->cl.buffer; + cl_event last_event = dst->cl.last_event; + int loc = dst->cl.buffer_location; +#endif *dst = *src; src->data = data; src->size = size; src->alloc = alloc; +#ifdef USE_OPENCL + src->cl.buffer = buffer; + src->cl.last_event = last_event; + src->cl.buffer_location = loc; +#endif } // Frees the specified buffer list. @@ -557,7 +625,14 @@ void hb_buffer_close( hb_buffer_t ** _b ) // free the buf if( b->data ) { - free( b->data ); +#ifdef USE_OPENCL + if (b->cl.buffer != NULL) { + if (hb_cl_free_mapped_buffer(b->cl.buffer, b->data) == 0) + hb_log("bad free2: %.16x -> buffer %.16x map %.16x", b, b->cl.buffer, b->data); + } + else +#endif + free( b->data ); hb_lock(buffers.lock); buffers.allocated -= b->alloc; hb_unlock(buffers.lock); diff --git a/libhb/hb.c b/libhb/hb.c index a9d9e417b..f6e3cb0e0 100644 --- a/libhb/hb.c +++ b/libhb/hb.c @@ -440,6 +440,11 @@ hb_handle_t * hb_init( int verbose, int update_check ) h->interjob = calloc( sizeof( hb_interjob_t ), 1 ); + /* opencl */ +#ifdef USE_OPENCL + //hb_opencl_init(); // FIXME: Ensure gui instances call this or hb_get_opencl_env() during startup if needed. +#endif + /* Start library thread */ hb_log( "hb_init: starting libhb thread" ); h->die = 0; diff --git a/libhb/hbffmpeg.h b/libhb/hbffmpeg.h index 36319de09..776eec61a 100644 --- a/libhb/hbffmpeg.h +++ b/libhb/hbffmpeg.h @@ -15,6 +15,7 @@ #include "libavutil/opt.h" #include "libswscale/swscale.h" #include "libavresample/avresample.h" +#include "common.h" #define HB_FFMPEG_THREADS_AUTO (-1) // let hb_avcodec_open() decide thread_count diff --git a/libhb/internal.h b/libhb/internal.h index 86878eb17..e4d5ed71c 100644 --- a/libhb/internal.h +++ b/libhb/internal.h @@ -121,6 +121,15 @@ struct hb_buffer_s void *filter_details; } qsv_details; +#ifdef USE_OPENCL + struct cl_data + { + cl_mem buffer; + cl_event last_event; + enum { HOST, DEVICE } buffer_location; + } cl; +#endif + // PICTURESUB subtitle packets: // Video packets (after processing by the hb_sync_video work-object): diff --git a/libhb/module.defs b/libhb/module.defs index b161f4c7f..e06c6331b 100644 --- a/libhb/module.defs +++ b/libhb/module.defs @@ -45,6 +45,14 @@ endif ifeq (1,$(FEATURE.fdk_aac)) LIBHB.GCC.D += USE_FDK_AAC endif +ifeq (1,$(FEATURE.opencl)) +LIBHB.GCC.D += USE_OPENCL +endif + +ifeq (1,$(FEATURE.hwd)) +LIBHB.GCC.D += USE_HWD +endif + ifeq (1,$(FEATURE.libav_aac)) LIBHB.GCC.D += USE_LIBAV_AAC endif @@ -63,6 +71,11 @@ endif LIBHB.GCC.D += __LIBHB__ USE_PTHREAD LIBHB.GCC.I += $(LIBHB.build/) $(CONTRIB.build/)include +ifeq (1,$(FEATURE.opencl)) +LIBHB.GCC.I += $(AMDAPPSDKROOT)/include +LIBHB.GCC.l += OpenCL +endif + ifeq ($(BUILD.system),cygwin) LIBHB.GCC.D += SYS_CYGWIN else ifeq ($(BUILD.system),darwin) diff --git a/libhb/oclnv12toyuv.c b/libhb/oclnv12toyuv.c new file mode 100644 index 000000000..19188e5a3 --- /dev/null +++ b/libhb/oclnv12toyuv.c @@ -0,0 +1,285 @@ +/* oclnv12toyuv.c + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + */ + +#ifdef USE_OPENCL +#ifdef USE_HWD +#include "vadxva2.h" +#include "oclnv12toyuv.h" + +/** + * It creates are opencl bufs w is input frame width, h is input frame height +*/ +static int hb_nv12toyuv_create_cl_buf( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ); + +/** + * It creates are opencl kernel. kernel name is nv12toyuv +*/ +static int hb_nv12toyuv_create_cl_kernel( KernelEnv *kenv, hb_va_dxva2_t *dxva2 ); + +/** + * It set opencl arg, input data,output data, input width, output height +*/ +static int hb_nv12toyuv_setkernelarg( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ); + +/** + * It initialize nv12 to yuv kernel. +*/ +static int hb_init_nv12toyuv_ocl( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ); + +/** + * Run nv12 to yuv kernel. + */ +static int hb_nv12toyuv( void **userdata, KernelEnv *kenv ); + +/** + * register nv12 to yuv kernel. + */ +static int hb_nv12toyuv_reg_kernel( void ); + +/** + * It creates are opencl bufs w is input frame width, h is input frame height + */ +static int hb_nv12toyuv_create_cl_buf( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ) +{ + cl_int status = CL_SUCCESS; + int in_bytes = w*h*3/2; + CREATEBUF( dxva2->cl_mem_nv12, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, in_bytes ); + CREATEBUF( dxva2->cl_mem_yuv, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, in_bytes ); + return 0; +} + +/** + * It creates are opencl kernel. kernel name is nv12toyuv + */ +static int hb_nv12toyuv_create_cl_kernel( KernelEnv *kenv, hb_va_dxva2_t *dxva2 ) +{ + int ret; + dxva2->nv12toyuv = clCreateKernel( kenv->program, "nv12toyuv", &ret ); + return ret; +} + +/** + * It set opencl arg, input data,output data, input width, output height + */ +static int hb_nv12toyuv_setkernelarg( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ) +{ + int arg = 0, status; + kenv->kernel = dxva2->nv12toyuv; + OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(cl_mem), &dxva2->cl_mem_nv12 ); + OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(cl_mem), &dxva2->cl_mem_yuv ); + OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(int), &w ); + OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(int), &h ); + return 0; +} + +/** + * It initialize nv12 to yuv kernel. + */ +static int hb_init_nv12toyuv_ocl( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 ) +{ + if( !dxva2->nv12toyuv ) + { + if( hb_nv12toyuv_create_cl_buf( kenv, w, h, dxva2 ) ) + { + hb_log( "OpenCL: nv12toyuv_create_cl_buf fail" ); + return -1; + } + if (!dxva2->nv12toyuv_tmp_in) + { + dxva2->nv12toyuv_tmp_in = malloc (w*h*3/2); + } + + if (!dxva2->nv12toyuv_tmp_out) + { + dxva2->nv12toyuv_tmp_out = malloc (w*h*3/2); + } + + hb_nv12toyuv_create_cl_kernel( kenv, dxva2 ); + } + return 0; +} + +/** + * copy_plane + * @param dst - + * @param src - + * @param dstride - + * @param sstride - + * @param h - + */ +static uint8_t *copy_plane( uint8_t *dst, uint8_t* src, int dstride, int sstride, + int h ) +{ + if ( dstride == sstride ) + { + memcpy( dst, src, dstride * h ); + return dst + dstride * h; + } + + int lbytes = dstride <= sstride? dstride : sstride; + while ( --h >= 0 ) + { + memcpy( dst, src, lbytes ); + src += sstride; + dst += dstride; + } + + return dst; +} + +/** + * Run nv12 to yuv kernel. + */ +static int hb_nv12toyuv( void **userdata, KernelEnv *kenv ) +{ + int status; + int w = (int)userdata[0]; + int h = (int)userdata[1]; + uint8_t *bufi1 = userdata[2]; + int *crop = userdata[3]; + hb_va_dxva2_t *dxva2 = userdata[4]; + + uint8_t *bufi2 = userdata[5]; + int p = (int)userdata[6]; + int decomb = (int)userdata[7]; + int detelecine = (int)userdata[8]; + int i; + if( hb_init_nv12toyuv_ocl( kenv, w, h, dxva2 ) ) + { + return -1; + } + + if( hb_nv12toyuv_setkernelarg( kenv, w, h, dxva2 ) ) + { + return -1; + } + + int in_bytes = w*h*3/2; + if( kenv->isAMD ) + { + void *data = clEnqueueMapBuffer( kenv->command_queue, dxva2->cl_mem_nv12, CL_MAP_WRITE_INVALIDATE_REGION, CL_TRUE, 0, in_bytes, 0, NULL, NULL, NULL ); + + for ( i = 0; i < dxva2->height; i++ ) + { + memcpy( data + i * dxva2->width, bufi1 + i * p, dxva2->width ); + if ( i < dxva2->height >> 1 ) + { + memcpy( data + ( dxva2->width * dxva2->height ) + i * dxva2->width, bufi2 + i * p, dxva2->width ); + } + } + clEnqueueUnmapMemObject( kenv->command_queue, dxva2->cl_mem_nv12, data, 0, NULL, NULL ); + } + else + { + uint8_t *tmp = (uint8_t*)malloc( dxva2->width * dxva2->height * 3 / 2 ); + for( i = 0; i < dxva2->height; i++ ) + { + memcpy( tmp + i * dxva2->width, bufi1 + i * p, dxva2->width ); + if( i < dxva2->height >> 1 ) + { + memcpy( tmp + (dxva2->width * dxva2->height) + i * dxva2->width, bufi2 + i * p, dxva2->width ); + } + } + OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, dxva2->cl_mem_nv12, CL_TRUE, 0, in_bytes, tmp, 0, NULL, NULL ); + free( tmp ); + } + + size_t gdim[2] = {w>>1, h>>1}; + OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, gdim, NULL, 0, NULL, NULL ); + + if( (crop[0] || crop[1] || crop[2] || crop[3]) && (decomb == 0) && (detelecine == 0) ) + { + AVPicture pic_in; + AVPicture pic_crop; + clEnqueueReadBuffer( kenv->command_queue, dxva2->cl_mem_yuv, CL_TRUE, 0, in_bytes, dxva2->nv12toyuv_tmp_out, 0, NULL, NULL ); + hb_buffer_t *in = hb_video_buffer_init( w, h ); + + int wmp = in->plane[0].stride; + int hmp = in->plane[0].height; + copy_plane( in->plane[0].data, dxva2->nv12toyuv_tmp_out, wmp, w, hmp ); + wmp = in->plane[1].stride; + hmp = in->plane[1].height; + copy_plane( in->plane[1].data, dxva2->nv12toyuv_tmp_out + w * h, wmp, w>>1, hmp ); + wmp = in->plane[2].stride; + hmp = in->plane[2].height; + copy_plane( in->plane[2].data, dxva2->nv12toyuv_tmp_out + w * h +( ( w * h )>>2 ), wmp, w>>1, hmp ); + + hb_avpicture_fill( &pic_in, in ); + av_picture_crop( &pic_crop, &pic_in, in->f.fmt, crop[0], crop[2] ); + int i, ww = w - ( crop[2] + crop[3] ), hh = h - ( crop[0] + crop[1] ); + for( i = 0; i< hh >> 1; i++ ) + { + memcpy( dxva2->nv12toyuv_tmp_in + ( ( i << 1 ) + 0 ) * ww, pic_crop.data[0]+ ( ( i << 1 ) + 0 ) * pic_crop.linesize[0], ww ); + memcpy( dxva2->nv12toyuv_tmp_in + ( ( i << 1 ) + 1 ) * ww, pic_crop.data[0]+ ( ( i << 1 ) + 1 ) * pic_crop.linesize[0], ww ); + memcpy( dxva2->nv12toyuv_tmp_in + ( ww * hh ) + i * ( ww >> 1 ), pic_crop.data[1] + i * pic_crop.linesize[1], ww >> 1 ); + memcpy( dxva2->nv12toyuv_tmp_in + ( ww * hh ) + ( ( ww * hh )>>2 ) + i * ( ww >> 1 ), pic_crop.data[2] + i * pic_crop.linesize[2], ww >> 1 ); + } + + if( kenv->isAMD ) + { + void *data = clEnqueueMapBuffer( kenv->command_queue, dxva2->cl_mem_yuv, CL_MAP_WRITE_INVALIDATE_REGION, CL_TRUE, 0, ww * hh * 3 / 2, 0, NULL, NULL, NULL ); + memcpy( data, dxva2->nv12toyuv_tmp_in, ww * hh * 3 / 2 ); + clEnqueueUnmapMemObject( kenv->command_queue, dxva2->cl_mem_yuv, data, 0, NULL, NULL ); + } + else + { + OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, dxva2->cl_mem_yuv, CL_TRUE, 0, in_bytes, dxva2->nv12toyuv_tmp_in, 0, NULL, NULL ); + } + + hb_buffer_close( &in ); + } + return 0; +} +/** + * register nv12 to yuv kernel. + */ +static int hb_nv12toyuv_reg_kernel( void ) +{ + int st = hb_register_kernel_wrapper( "nv12toyuv", hb_nv12toyuv ); + if( !st ) + { + hb_log( "OpenCL: register kernel[%s] failed", "nv12toyuv" ); + return -1; + } + return 0; +} +/** + * nv12 to yuv interface + * bufi is input frame of nv12, w is input frame width, h is input frame height + */ +int hb_ocl_nv12toyuv( uint8_t *bufi[], int p, int w, int h, int *crop, hb_va_dxva2_t *dxva2, int decomb, int detelecine ) +{ + void *userdata[9]; + userdata[0] = (void*)w; + userdata[1] = (void*)h; + userdata[2] = bufi[0]; + userdata[3] = crop; + userdata[4] = dxva2; + userdata[5] = bufi[1]; + userdata[6] = (void*)p; + userdata[7] = decomb; + userdata[8] = detelecine; + + if( hb_nv12toyuv_reg_kernel() ) + { + return -1; + } + + if( hb_run_kernel( "nv12toyuv", userdata ) ) + { + hb_log( "OpenCL: run kernel[nv12toyuv] failed" ); + return -1; + } + return 0; +} +#endif +#endif diff --git a/libhb/oclnv12toyuv.h b/libhb/oclnv12toyuv.h new file mode 100644 index 000000000..86f7673bb --- /dev/null +++ b/libhb/oclnv12toyuv.h @@ -0,0 +1,35 @@ +/* oclnv12toyuv.h + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + */ + +#ifdef USE_OPENCL +#ifndef RENDER_CL_H +#define RENDER_CL_H + +#if defined(__APPLE__) +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +#include "common.h" +#include "openclwrapper.h" + +/** + * nv12 to yuv interface + * bufi is input frame of nv12, w is input frame width, h is input frame height + */ +#ifdef USE_HWD +int hb_ocl_nv12toyuv( uint8_t *bufi[], int p, int w, int h, int *crop, hb_va_dxva2_t *dxva2, int decomb, int detelecine ); +#endif +#endif +#endif diff --git a/libhb/oclscale.c b/libhb/oclscale.c new file mode 100644 index 000000000..904183340 --- /dev/null +++ b/libhb/oclscale.c @@ -0,0 +1,271 @@ +/* oclscale.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+ */
+
+#ifdef USE_OPENCL
+
+#include <math.h> +#include "common.h" +#include "openclwrapper.h" +#define FILTER_LEN 4 + +#define _A -0.5f + +cl_float cubic(cl_float x) +{ + if (x < 0) + x = -x; + + if (x < 1) + return (_A + 2.0f) * (x * x * x) - (_A + 3.0f) * (x * x) + 0 + 1; + else if (x < 2) + return (_A) * (x * x * x) - (5.0f * _A) * (x * x) + (8.0f * _A) * x - (4.0f * _A); + else + return 0; +} + + +cl_float *hb_bicubic_weights(cl_float scale, int length) +{ + cl_float *weights = (cl_float*) malloc(length * sizeof(cl_float) * 4); + + int i; // C rocks + cl_float *out = weights; + for (i = 0; i < length; ++i) + { + cl_float x = i / scale; + cl_float dx = x - (int)x; + *out++ = cubic(-dx - 1.0f); + *out++ = cubic(-dx); + *out++ = cubic(-dx + 1.0f); + *out++ = cubic(-dx + 2.0f); + } + return weights; +} + +int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv); + +/** +* executive scale using opencl +* get filter args +* create output buffer +* create horizontal filter buffer +* create vertical filter buffer +* create kernels +*/ +int hb_ocl_scale_func( void **data, KernelEnv *kenv ) +{ + cl_int status; + + cl_mem in_buf = data[0]; + cl_mem out_buf = data[1]; + int crop_top = data[2]; + int crop_bottom = data[3]; + int crop_left = data[4]; + int crop_right = data[5]; + int in_frame_w = (int)data[6]; + int in_frame_h = (int)data[7]; + int out_frame_w = (int)data[8]; + int out_frame_h = (int)data[9]; + hb_oclscale_t *os = data[10]; + hb_buffer_t *in = data[11]; + hb_buffer_t *out = data[12]; + + if (os->initialized == 0) + { + hb_log( "Scaling With OpenCL" ); + if (kenv->isAMD != 0) + hb_log( "Using Zero Copy"); + // create the block kernel + cl_int status; + os->m_kernel = clCreateKernel( kenv->program, "frame_scale", &status ); + + os->initialized = 1; + } + + { + // Use the new kernel + cl_event events[5]; + int eventCount = 0; + + if (kenv->isAMD == 0) { + status = clEnqueueUnmapMemObject(kenv->command_queue, in->cl.buffer, in->data, 0, NULL, &events[eventCount++]); + status = clEnqueueUnmapMemObject(kenv->command_queue, out->cl.buffer, out->data, 0, NULL, &events[eventCount++]); + } + + cl_int srcPlaneOffset0 = in->plane[0].data - in->data; + cl_int srcPlaneOffset1 = in->plane[1].data - in->data; + cl_int srcPlaneOffset2 = in->plane[2].data - in->data; + cl_int srcRowWords0 = in->plane[0].stride; + cl_int srcRowWords1 = in->plane[1].stride; + cl_int srcRowWords2 = in->plane[2].stride; + cl_int dstPlaneOffset0 = out->plane[0].data - out->data; + cl_int dstPlaneOffset1 = out->plane[1].data - out->data; + cl_int dstPlaneOffset2 = out->plane[2].data - out->data; + cl_int dstRowWords0 = out->plane[0].stride; + cl_int dstRowWords1 = out->plane[1].stride; + cl_int dstRowWords2 = out->plane[2].stride; + + if (crop_top != 0 || crop_bottom != 0 || crop_left != 0 || crop_right != 0) { + srcPlaneOffset0 += crop_left + crop_top * srcRowWords0; + srcPlaneOffset1 += crop_left / 2 + (crop_top / 2) * srcRowWords1; + srcPlaneOffset2 += crop_left / 2 + (crop_top / 2) * srcRowWords2; + in_frame_w = in_frame_w - crop_right - crop_left; + in_frame_h = in_frame_h - crop_bottom - crop_top; + } + + cl_float xscale = (out_frame_w * 1.0f) / in_frame_w; + cl_float yscale = (out_frame_h * 1.0f) / in_frame_h; + setupScaleWeights(xscale, yscale, out_frame_w, out_frame_h, os, kenv); + + OCLCHECK( clSetKernelArg, os->m_kernel, 0, sizeof(cl_mem), &out_buf ); + OCLCHECK( clSetKernelArg, os->m_kernel, 1, sizeof(cl_mem), &in_buf ); + OCLCHECK( clSetKernelArg, os->m_kernel, 2, sizeof(cl_float), &xscale ); + OCLCHECK( clSetKernelArg, os->m_kernel, 3, sizeof(cl_float), &yscale ); + OCLCHECK( clSetKernelArg, os->m_kernel, 4, sizeof(cl_int), &srcPlaneOffset0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 5, sizeof(cl_int), &srcPlaneOffset1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 6, sizeof(cl_int), &srcPlaneOffset2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 7, sizeof(cl_int), &dstPlaneOffset0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 8, sizeof(cl_int), &dstPlaneOffset1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 9, sizeof(cl_int), &dstPlaneOffset2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 10, sizeof(cl_int), &srcRowWords0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 11, sizeof(cl_int), &srcRowWords1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 12, sizeof(cl_int), &srcRowWords2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 13, sizeof(cl_int), &dstRowWords0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 14, sizeof(cl_int), &dstRowWords1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 15, sizeof(cl_int), &dstRowWords2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 16, sizeof(int), &in_frame_w ); // FIXME: type mismatch + OCLCHECK( clSetKernelArg, os->m_kernel, 17, sizeof(int), &in_frame_h ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 18, sizeof(int), &out_frame_w ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 19, sizeof(int), &out_frame_h ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 20, sizeof(cl_mem), &os->bicubic_x_weights ); + OCLCHECK( clSetKernelArg, os->m_kernel, 21, sizeof(cl_mem), &os->bicubic_y_weights ); + + size_t workOffset[] = { 0, 0, 0 }; + size_t globalWorkSize[] = { 1, 1, 1 }; + size_t localWorkSize[] = { 1, 1, 1 }; + + int xgroups = (out_frame_w + 63) / 64; + int ygroups = (out_frame_h + 15) / 16; + + localWorkSize[0] = 64; + localWorkSize[1] = 1; + localWorkSize[2] = 1; + globalWorkSize[0] = xgroups * 64; + globalWorkSize[1] = ygroups; + globalWorkSize[2] = 3; + + OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, os->m_kernel, 3, workOffset, globalWorkSize, localWorkSize, eventCount, (eventCount == 0) ? NULL : &events[0], &events[eventCount] ); + ++eventCount; + + if (kenv->isAMD == 0) { + in->data = clEnqueueMapBuffer(kenv->command_queue, in->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, in->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount], &status); + out->data = clEnqueueMapBuffer(kenv->command_queue, out->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, out->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount + 1], &status); + eventCount += 2; + } + + clFlush(kenv->command_queue); + clWaitForEvents(eventCount, &events[0]); + int i; + for (i = 0; i < eventCount; ++i) + clReleaseEvent(events[i]); + } + + return 1; +} + +int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) { + cl_int status; + if (os->xscale != xscale || os->width < width) { + cl_float *xweights = hb_bicubic_weights(xscale, width); + CL_FREE(os->bicubic_x_weights); + CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4); + OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL ); + os->width = width; + os->xscale = xscale; + free(xweights); + } + + if ((os->yscale != yscale) || (os->height < height)) { + cl_float *yweights = hb_bicubic_weights(yscale, height); + CL_FREE(os->bicubic_y_weights); + CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4); + OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL ); + os->height = height; + os->yscale = yscale; + free(yweights); + } + return 0; +} + + +/** +* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm +* parameter: +* inputFrameBuffer: the source video frame opencl buffer
+* outputdata: the destination video frame buffer
+* inputWidth: the width of the source video frame
+* inputHeight: the height of the source video frame
+* outputWidth: the width of destination video frame +* outputHeight: the height of destination video frame +*/ + + +static int s_scale_init_flag = 0; + +int do_scale_init() +{ + if ( s_scale_init_flag==0 ) + { + int st = hb_register_kernel_wrapper( "frame_scale", hb_ocl_scale_func ); + if( !st ) + { + hb_log( "register kernel[%s] failed", "frame_scale" ); + return 0; + } + s_scale_init_flag++; + } + return 1; +} + + +int hb_ocl_scale(hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os) +{ + void *data[13]; + + if (do_scale_init() == 0) + return 0; + + data[0] = in->cl.buffer; + data[1] = out->cl.buffer; + data[2] = (void*)(crop[0]); + data[3] = (void*)(crop[1]); + data[4] = (void*)(crop[2]); + data[5] = (void*)(crop[3]); + data[6] = (void*)(in->f.width); + data[7] = (void*)(in->f.height); + data[8] = (void*)(out->f.width); + data[9] = (void*)(out->f.height); + data[10] = os; + data[11] = in; + data[12] = out; + + if( !hb_run_kernel( "frame_scale", data ) ) + hb_log( "run kernel[%s] failed", "frame_scale" ); + return 0; +} + + + + + +#endif diff --git a/libhb/openclkernels.h b/libhb/openclkernels.h new file mode 100644 index 000000000..f324aafdd --- /dev/null +++ b/libhb/openclkernels.h @@ -0,0 +1,771 @@ +/* openclkernels.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+ */
+
+#ifndef USE_EXTERNAL_KERNEL
+
+#define KERNEL( ... )# __VA_ARGS__
+
+
+char *kernel_src_hscale = KERNEL (
+
+ typedef unsigned char fixed8;
+
+/*******************************************************************************************************
+dst: Horizontal scale destination;
+src: YUV content in opencl buf;
+hf_Y: Horizontal filter coefficients for Y planes;
+hf_UV: Horizontal filter coefficients for UV planes;
+hi_Y: Horizontal filter index for Y planes;
+hi_UV: Horizontal filter index for UV planes;
+stride: Src width;
+filter_len: Length of filter;
+********************************************************************************************************/
+ kernel void frame_h_scale (
+ global fixed8 *src,
+ global float *hf_Y,
+ global float *hf_UV,
+ global int *hi_Y,
+ global int *hi_UV,
+ global fixed8 *dst,
+ int stride, //src_width
+ int filter_len
+ )
+ {
+ int x = get_global_id( 0 );
+ int y = get_global_id( 1 );
+ int width = get_global_size( 0 );
+ int height = get_global_size( 1 );
+ float result_Y = 0, result_U = 0, result_V = 0;
+ int i = 0;
+
+ global fixed8 *src_Y = src;
+ global fixed8 *src_U = src_Y + stride * height;
+ global fixed8 *src_V = src_U + (stride >> 1) * (height >> 1);
+
+ global fixed8 *dst_Y = dst;
+ global fixed8 *dst_U = dst_Y + width * height;
+ global fixed8 *dst_V = dst_U + (width >> 1) * (height >> 1);
+
+ int xy = y * width + x;
+ global fixed8 *rowdata_Y = src_Y + (y * stride);
+ for( int i = 0; i < filter_len; i++ )
+ {
+ result_Y += ( hf_Y[x + i * width] * rowdata_Y[hi_Y[x] + i]);
+ }
+ dst_Y[xy] = result_Y;
+
+ if( y < (height >> 1) && x < (width >> 1) )
+ {
+ int xy = y * (width >> 1) + x;
+ global fixed8 *rowdata_U = src_U + (y * (stride >> 1));
+ global fixed8 *rowdata_V = src_V + (y * (stride >> 1));
+ for( i = 0; i < filter_len; i++ )
+ {
+ result_U += ( hf_UV[x + i * (width >> 1)] * rowdata_U[hi_UV[x] + i]);
+ result_V += ( hf_UV[x + i * (width >> 1)] * rowdata_V[hi_UV[x] + i]);
+ }
+ dst_U[xy] = result_U;
+ dst_V[xy] = result_V;
+ }
+ }
+ );
+
+/*******************************************************************************************************
+dst: Vertical scale destination;
+src: YUV content in opencl buf;
+hf_Y: Vertical filter coefficients for Y planes;
+hf_UV: Vertical filter coefficients for UV planes;
+hi_Y: Vertical filter index for Y planes;
+hi_UV: Vertical filter index for UV planes;
+stride: Src height;
+filter_len: Length of filter;
+********************************************************************************************************/
+char *kernel_src_vscale = KERNEL (
+
+ kernel void frame_v_scale (
+ global fixed8 *src,
+ global float *vf_Y,
+ global float *vf_UV,
+ global int *vi_Y,
+ global int *vi_UV,
+ global fixed8 *dst,
+ int src_height,
+ int filter_len
+ )
+ {
+ int x = get_global_id( 0 );
+ int y = get_global_id( 1 );
+ int width = get_global_size( 0 );
+ int height = get_global_size( 1 );
+ float result_Y = 0, result_U = 0, result_V = 0;
+ int i = 0;
+
+ global fixed8 *src_Y = src;
+ global fixed8 *src_U = src_Y + src_height * width;
+ global fixed8 *src_V = src_U + (src_height >> 1) * (width >> 1);
+
+ global fixed8 *dst_Y = dst;
+ global fixed8 *dst_U = dst_Y + height * width;
+ global fixed8 *dst_V = dst_U + (height >> 1) * (width >> 1);
+
+ int xy = y * width + x;
+ for( i = 0; i < filter_len; i++ )
+ {
+ result_Y += vf_Y[y + i * height] * src_Y[(vi_Y[y] + i) * width + x];
+ }
+ dst_Y[xy] = result_Y;
+
+ if( y < (height >> 1) && x < (width >> 1) )
+ {
+ int xy = y * (width >> 1) + x;
+ for( i = 0; i < filter_len; i++ )
+ {
+ result_U += vf_UV[y + i * (height >> 1)] * src_U[(vi_UV[y] + i) * (width >> 1) + x];
+ result_V += vf_UV[y + i * (height >> 1)] * src_V[(vi_UV[y] + i) * (width >> 1) + x];
+ }
+ dst_U[xy] = result_U;
+ dst_V[xy] = result_V;
+ }
+ }
+ );
+
+/*******************************************************************************************************
+input: Input buffer;
+output: Output buffer;
+w: Width of frame;
+h: Height of frame;
+********************************************************************************************************/
+char *kernel_src_nvtoyuv = KERNEL (
+
+ kernel void nv12toyuv ( global char *input, global char* output, int w, int h )
+ {
+ int x = get_global_id( 0 );
+ int y = get_global_id( 1 );
+ int idx = y * (w >> 1) + x;
+ vstore4((vload4( 0, input + (idx << 2))), 0, output + (idx << 2)); //Y
+ char2 uv = vload2( 0, input + (idx << 1) + w * h );
+ output[idx + w * h] = uv.s0;
+ output[idx + w * h + ((w * h) >> 2)] = uv.s1;
+ }
+ );
+
+/*******************************************************************************************************
+dst: Horizontal scale destination;
+src: YUV content in opencl buf;
+yfilter: Opencl memory of horizontal filter coefficients for luma/alpha planes;
+yfilterPos: Opencl memory of horizontal filter starting positions for each dst[i] for luma/alpha planes;
+yfilterSize: Horizontal filter size for luma/alpha pixels;
+cfilter: Opencl memory of horizontal filter coefficients for chroma planes;
+cfilterPos: Opencl memory of horizontal filter starting positions for each dst[i] for chroma planes;
+cfilterSize: Horizontal filter size for chroma pixels;
+dstStride: Width of destination luma/alpha planes;
+dstChrStride: Width of destination chroma planes;
+********************************************************************************************************/
+
+char *kernel_src_hscaleall = KERNEL (
+
+ kernel void hscale_all_opencl (
+ global short *dst,
+ const global unsigned char *src,
+ const global short *yfilter,
+ const global int *yfilterPos,
+ int yfilterSize,
+ const global short *cfilter,
+ const global int *cfilterPos,
+ int cfilterSize,
+ int dstWidth,
+ int dstHeight,
+ int srcWidth,
+ int srcHeight,
+ int dstStride,
+ int dstChrStride,
+ int srcStride,
+ int srcChrStride)
+ {
+ int w = get_global_id(0);
+ int h = get_global_id(1);
+
+ int chrWidth = get_global_size(0);
+ int chrHeight = get_global_size(1);
+
+ int srcPos1 = h * srcStride + yfilterPos[w];
+ int srcPos2 = h * srcStride + yfilterPos[w + chrWidth];
+ int srcPos3 = (h + (srcHeight >> 1)) * srcStride + yfilterPos[w];
+ int srcPos4 = (h + (srcHeight >> 1)) * srcStride + yfilterPos[w + chrWidth];
+ int srcc1Pos = srcStride * srcHeight + (h) * (srcChrStride) + cfilterPos[w];
+ int srcc2Pos = srcc1Pos + ((srcChrStride)*(chrHeight));
+
+ int val1 = 0;
+ int val2 = 0;
+ int val3 = 0;
+ int val4 = 0;
+ int val5 = 0;
+ int val6 = 0;
+
+ int filterPos1 = yfilterSize * w;
+ int filterPos2 = yfilterSize * (w + chrWidth);
+ int cfilterPos1 = cfilterSize * w;
+
+ int j;
+ for (j = 0; j < yfilterSize; j++)
+ {
+ val1 += src[srcPos1 + j] * yfilter[filterPos1+ j];
+ val2 += src[srcPos2 + j] * yfilter[filterPos2 + j];
+ val3 += src[srcPos3 + j] * yfilter[filterPos1 + j];
+ val4 += src[srcPos4 + j] * yfilter[filterPos2 + j];
+ val5 += src[srcc1Pos+j] * cfilter[cfilterPos1 + j];
+ val6 += src[srcc2Pos+j] * cfilter[cfilterPos1 + j];
+ }
+ int dstPos1 = h *dstStride;
+ int dstPos2 = (h + chrHeight) * dstStride;
+
+ dst[dstPos1 + w] = ((val1 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val1 >> 7));
+ dst[dstPos1 + w + chrWidth] = ((val2 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val2 >> 7));
+ dst[dstPos2 + w] = ((val3 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val3 >> 7));
+ dst[dstPos2 + w + chrWidth] = ((val4 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val4 >> 7));
+
+ int dstPos3 = h * (dstChrStride) + w + dstStride * dstHeight;
+ int dstPos4 = h * (dstChrStride) + w + dstStride * dstHeight + ((dstChrStride) * chrHeight);
+ dst[dstPos3] = ((val5 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val5 >> 7));
+ dst[dstPos4] = ((val6 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val6 >> 7));
+ }
+ );
+
+char *kernel_src_hscalefast = KERNEL (
+
+ kernel void hscale_fast_opencl (
+ global short *dst,
+ const global unsigned char *src,
+ int xInc,
+ int chrXInc,
+ int dstWidth,
+ int dstHeight,
+ int srcWidth,
+ int srcHeight,
+ int dstStride,
+ int dstChrStride,
+ int srcStride,
+ int srcChrStride)
+ {
+
+ int w = get_global_id(0);
+ int h = get_global_id(1);
+
+ int chrWidth = get_global_size(0);
+ int chrHeight = get_global_size(1);
+ int xpos1 = 0;
+ int xpos2 = 0;
+ int xx = xpos1 >> 16;
+ int xalpha = (xpos1 & 0xFFFF) >> 9;
+ dst[h * dstStride + w] = (src[h * srcStride + xx] << 7) + (src[h * srcStride + xx + 1] -src[h * srcStride + xx]) * xalpha;
+ int lowpart = h + (chrHeight);
+ dst[lowpart * dstStride + w] = (src[lowpart * srcStride + xx] << 7) + (src[lowpart * srcStride + xx + 1] - src[lowpart * srcStride + xx]) * xalpha;
+
+ int inv_i = w * xInc >> 16;
+ if( inv_i >= srcWidth - 1)
+ {
+ dst[h*dstStride + w] = src[h*srcStride + srcWidth-1]*128;
+ dst[lowpart*dstStride + w] = src[lowpart*srcStride + srcWidth - 1] * 128;
+ }
+
+ int rightpart = w + (chrWidth);
+ xx = xpos2 >> 16;
+ xalpha = (xpos2 & 0xFFFF) >> 9;
+ dst[h * dstStride + rightpart] = (src[h *srcStride + xx] << 7) + (src[h * srcStride + xx + 1] - src[h * srcStride + xx]) * xalpha;
+ dst[lowpart * dstStride + rightpart] = (src[lowpart * srcStride + xx] << 7) + (src[lowpart * srcStride + xx + 1] - src[lowpart * srcStride + xx]) * xalpha;
+ inv_i = rightpart * xInc >> 16;
+ if( inv_i >= srcWidth - 1)
+ {
+ dst[h * dstStride + rightpart] = src[h * srcStride + srcWidth - 1] * 128;
+ dst[lowpart * dstStride + rightpart] = src[lowpart * srcStride + srcWidth - 1] * 128;
+ }
+
+ int xpos = 0;
+ xpos = chrXInc * w;
+ xx = xpos >> 16;
+ xalpha = (xpos & 0xFFFF) >> 9;
+ src += srcStride * srcHeight;
+ dst += dstStride * dstHeight;
+ dst[h * (dstChrStride) + w] = (src[h * (srcChrStride) + xx] * (xalpha^127) + src[h * (srcChrStride) + xx + 1] * xalpha);
+ inv_i = w * xInc >> 16;
+ if( inv_i >= (srcWidth >> 1) - 1)
+ {
+ dst[h * (dstChrStride) + w] = src[h * (srcChrStride) + (srcWidth >> 1) -1]*128;
+ }
+
+ xpos = chrXInc * (w);
+ xx = xpos >> 16;
+ src += srcChrStride * srcHeight >> 1;
+ dst += (dstChrStride * chrHeight);
+ dst[h * (dstChrStride) + w] = (src[h * (srcChrStride) + xx] * (xalpha^127) + src[h * (srcChrStride) + xx + 1 ] * xalpha);
+
+ if( inv_i >= (srcWidth >> 1) - 1)
+ {
+ //v channel:
+ dst[h * (dstChrStride) + w] = src[h * (srcChrStride) + (srcWidth >> 1) -1] * 128;
+ }
+ }
+ );
+
+char *kernel_src_vscalealldither = KERNEL (
+
+ kernel void vscale_all_dither_opencl (
+ global unsigned char *dst,
+ const global short *src,
+ const global short *yfilter,
+ int yfilterSize,
+ const global short *cfilter,
+ int cfilterSize,
+ const global int *yfilterPos,
+ const global int *cfilterPos,
+ int dstWidth,
+ int dstHeight,
+ int srcWidth,
+ int srcHeight,
+ int dstStride,
+ int dstChrStride,
+ int srcStride,
+ int srcChrStride)
+ {
+ const unsigned char hb_dither_8x8_128[8][8] = {
+ { 36, 68, 60, 92, 34, 66, 58, 90, },
+ { 100, 4, 124, 28, 98, 2, 122, 26, },
+ { 52, 84, 44, 76, 50, 82, 42, 74, },
+ { 116, 20, 108, 12, 114, 18, 106, 10, },
+ { 32, 64, 56, 88, 38, 70, 62, 94, },
+ { 96, 0, 120, 24, 102, 6, 126, 30, },
+ { 48, 80, 40, 72, 54, 86, 46, 78, },
+ { 112, 16, 104, 8, 118, 22, 110, 14, },
+ };
+
+
+ int w = get_global_id(0);
+ int h = get_global_id(1);
+
+ int chrWidth = get_global_size(0);
+ int chrHeight = get_global_size(1);
+ const unsigned char *local_up_dither;
+ const unsigned char *local_down_dither;
+
+ local_up_dither = hb_dither_8x8_128[h & 7];
+ local_down_dither = hb_dither_8x8_128[(h + chrHeight) & 7];
+
+ //yscale;
+ int srcPos1 = (yfilterPos[h]) * srcStride + w;
+ int srcPos2 = (yfilterPos[h]) * srcStride + w + (chrWidth);
+ int srcPos3 = (yfilterPos[h + chrHeight]) * srcStride + w;
+ int srcPos4 = (yfilterPos[h + chrHeight]) * srcStride + w + chrWidth;
+ int src1Pos = dstStride * srcHeight + (cfilterPos[h]) * dstChrStride + (w);
+ int src2Pos = dstStride * srcHeight + (dstChrStride*(srcHeight>>1)) + (cfilterPos[h]) * dstChrStride + w;
+
+ int val1 = (local_up_dither[w & 7] << 12); //y offset is 0;
+ int val2 = (local_up_dither[(w + chrWidth) & 7] << 12);
+ int val3 = (local_down_dither[w &7] << 12);
+ int val4 = (local_down_dither[(w + chrWidth) & 7] << 12);
+ int val5 = (local_up_dither[w & 7] << 12);
+ int val6 = (local_up_dither[(w + 3) & 7] << 12); // 3 is offset of the chrome channel.
+
+ int j;
+ int filterPos1 = h * yfilterSize;
+ int filterPos2 = ( h + chrHeight ) * yfilterSize;
+ for(j = 0; j < yfilterSize; j++)
+ {
+ val1 += src[srcPos1] * yfilter[filterPos1 + j];
+ srcPos1 += srcStride;
+ val2 += src[srcPos2] * yfilter[filterPos1 + j];
+ srcPos2 += srcStride;
+ val3 += src[srcPos3] * yfilter[filterPos2 + j];
+ srcPos3 += srcStride;
+ val4 += src[srcPos4] * yfilter[filterPos2 + j];
+ srcPos4 += srcStride;
+ val5 += src[src1Pos] * cfilter[filterPos1 + j];
+ val6 += src[src2Pos] * cfilter[filterPos1 + j];
+ src1Pos += dstChrStride;
+ src2Pos += dstChrStride;
+ }
+ dst[h * dstStride + w] = (((val1 >> 19)&(~0xFF)) ? ((-(val1 >> 19)) >> 31) : (val1 >> 19));
+ dst[h * dstStride + w + chrWidth] = (((val2 >> 19)&(~0xFF)) ? ((-(val2 >> 19)) >> 31) : (val2 >> 19));
+ dst[(h + chrHeight) * dstStride + w] = (((val3 >> 19)&(~0xFF)) ? ((-(val3 >> 19)) >> 31) : (val3 >> 19));
+ dst[(h + chrHeight) * dstStride + w + chrWidth] = (((val4 >> 19)&(~0xFF)) ? ((-(val4 >> 19)) >> 31) : (val4 >> 19));
+
+ int dst1Pos = dstStride * dstHeight + h*(dstChrStride)+(w);
+ int dst2Pos = (dstChrStride * chrHeight) + dst1Pos;
+ dst[dst1Pos] = (((val5 >> 19)&(~0xFF)) ? ((-(val5 >> 19)) >> 31) : (val5 >> 19));
+ dst[dst2Pos] = (((val6 >> 19)&(~0xFF)) ? ((-(val6 >> 19)) >> 31) : (val6 >> 19));
+ }
+ );
+
+char *kernel_src_vscaleallnodither = KERNEL (
+
+ kernel void vscale_all_nodither_opencl (
+ global unsigned char *dst,
+ const global short *src,
+ const global short *yfilter,
+ int yfilterSize,
+ const global short *cfilter,
+ int cfilterSize,
+ const global int *yfilterPos,
+ const global int *cfilterPos,
+ int dstWidth,
+ int dstHeight,
+ int srcWidth,
+ int srcHeight,
+ int dstStride,
+ int dstChrStride,
+ int srcStride,
+ int srcChrStride)
+ {
+ const unsigned char hb_sws_pb_64[8] = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+ };
+
+ int w = get_global_id(0);
+ int h = get_global_id(1);
+
+ int chrWidth = get_global_size(0);
+ int chrHeight = get_global_size(1);
+ const unsigned char *local_up_dither;
+ const unsigned char *local_down_dither;
+
+ local_up_dither = hb_sws_pb_64;
+ local_down_dither = hb_sws_pb_64;
+
+
+ //yscale;
+ int srcPos1 = (yfilterPos[h]) * srcStride + w;
+ int srcPos2 = (yfilterPos[h]) * srcStride + w + (chrWidth);
+ int srcPos3 = (yfilterPos[h + chrHeight]) * srcStride + w;
+ int srcPos4 = (yfilterPos[h + chrHeight]) * srcStride + w + chrWidth;
+ int src1Pos = dstStride * srcHeight + (cfilterPos[h]) * dstChrStride + (w);
+ int src2Pos = dstStride * srcHeight + (dstChrStride*(srcHeight>>1)) + (cfilterPos[h]) * dstChrStride + w;
+
+ int val1 = (local_up_dither[w & 7] << 12); //y offset is 0;
+ int val2 = (local_up_dither[(w + chrWidth) & 7] << 12);
+ int val3 = (local_down_dither[w &7] << 12);
+ int val4 = (local_down_dither[(w + chrWidth) & 7] << 12);
+ int val5 = (local_up_dither[w & 7] << 12);
+ int val6 = (local_up_dither[(w + 3) & 7] << 12); // 3 is offset of the chrome channel.
+
+
+ int j;
+ int filterPos1 = h * yfilterSize;
+ int filterPos2 = ( h + chrHeight ) * yfilterSize;
+ for(j = 0; j < yfilterSize; j++)
+ {
+ val1 += src[srcPos1] * yfilter[filterPos1 + j];
+ srcPos1 += srcStride;
+ val2 += src[srcPos2] * yfilter[filterPos1 + j];
+ srcPos2 += srcStride;
+ val3 += src[srcPos3] * yfilter[filterPos2 + j];
+ srcPos3 += srcStride;
+ val4 += src[srcPos4] * yfilter[filterPos2 + j];
+ srcPos4 += srcStride;
+ val5 += src[src1Pos] * cfilter[filterPos1 + j];
+ val6 += src[src2Pos] * cfilter[filterPos1 + j];
+ src1Pos += dstChrStride;
+ src2Pos += dstChrStride;
+ }
+ dst[h * dstStride + w] = (((val1 >> 19)&(~0xFF)) ? ((-(val1 >> 19)) >> 31) : (val1 >> 19));
+ dst[h * dstStride + w + chrWidth] = (((val2 >> 19)&(~0xFF)) ? ((-(val2 >> 19)) >> 31) : (val2 >> 19));
+ dst[(h + chrHeight) * dstStride + w] = (((val3 >> 19)&(~0xFF)) ? ((-(val3 >> 19)) >> 31) : (val3 >> 19));
+ dst[(h + chrHeight) * dstStride + w + chrWidth] = (((val4 >> 19)&(~0xFF)) ? ((-(val4 >> 19)) >> 31) : (val4 >> 19));;
+
+ int dst1Pos = dstStride * dstHeight + h * (dstChrStride) + (w);
+ int dst2Pos = (dstChrStride * chrHeight) + dst1Pos;
+ dst[dst1Pos] = (((val5 >> 19)&(~0xFF)) ? ((-(val5 >> 19)) >> 31) : (val5 >> 19));
+ dst[dst2Pos] = (((val6 >> 19)&(~0xFF)) ? ((-(val6 >> 19)) >> 31) : (val6 >> 19));
+ }
+ );
+
+char *kernel_src_vscalefast = KERNEL (
+
+ kernel void vscale_fast_opencl (
+ global unsigned char *dst,
+ const global short *src,
+ const global int *yfilterPos,
+ const global int *cfilterPos,
+ int dstWidth,
+ int dstHeight,
+ int srcWidth,
+ int srcHeight,
+ int dstStride,
+ int dstChrStride,
+ int srcStride,
+ int srcChrStride)
+ {
+ const unsigned char hb_sws_pb_64[8] = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+ };
+
+ int w = get_global_id(0);
+ int h = get_global_id(1);
+
+ int chrWidth = get_global_size(0);
+ int chrHeight = get_global_size(1);
+
+ const unsigned char *local_up_dither;
+ const unsigned char *local_down_dither;
+
+ local_up_dither = hb_sws_pb_64;
+ local_down_dither = hb_sws_pb_64;
+
+
+ int rightpart = w + chrWidth;
+ int bh = h + chrHeight; // bottom part
+ short val1 = (src[(yfilterPos[h]) * dstStride + w] + local_up_dither[(w + 0) & 7]) >> 7; //lum offset is 0;
+ short val2 = (src[(yfilterPos[h]) * dstStride + rightpart] + local_up_dither[rightpart & 7]) >> 7;
+ short val3 = (src[(yfilterPos[bh]) * dstStride + w] + local_down_dither[w & 7]) >> 7;
+ short val4 = (src[(yfilterPos[bh]) * dstStride + rightpart] + local_down_dither[rightpart & 7]) >> 7;
+ dst[h * dstStride + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));
+ dst[h * dstStride + rightpart] = ((val2&(~0xFF)) ? ((-val2) >> 31) : (val2));
+ dst[bh * dstStride + w] = ((val3&(~0xFF)) ? ((-val3) >> 31) : (val3));
+ dst[bh * dstStride + rightpart] = ((val4&(~0xFF)) ? ((-val4) >> 31) : (val4));
+
+ src += dstStride * srcHeight;
+ dst += dstStride * dstHeight;
+ val1 = (src[cfilterPos[h] * (dstChrStride) + w] + local_up_dither[ w & 7]) >> 7;
+ dst[h * (dstChrStride) + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));
+
+ src += dstChrStride * (srcHeight >> 1);
+ dst += dstChrStride * chrHeight;
+ val1 = (src[cfilterPos[h] * dstChrStride + w] + local_up_dither[ (w + 3) & 7] ) >> 7;
+ dst[h * dstChrStride + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));
+
+ } + ); + +char *kernel_src_scale = KERNEL ( + +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void frame_scale(__global uchar *dst, + __global const uchar *src, + const float xscale, + const float yscale, + const int srcPlaneOffset0, + const int srcPlaneOffset1, + const int srcPlaneOffset2, + const int dstPlaneOffset0, + const int dstPlaneOffset1, + const int dstPlaneOffset2, + const int srcRowWords0, + const int srcRowWords1, + const int srcRowWords2, + const int dstRowWords0, + const int dstRowWords1, + const int dstRowWords2, + const int srcWidth, + const int srcHeight, + const int dstWidth, + const int dstHeight, + __global const float4* restrict xweights, + __global const float4* restrict yweights + ) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Abort work items outside the dst image bounds. + + if ((get_group_id(0) * 64 >= (dstWidth >> ((z == 0) ? 0 : 1))) || (get_group_id(1) * 16 >= (dstHeight >> ((z == 0) ? 0 : 1)))) + return; + + const int srcPlaneOffset = (z == 0) ? srcPlaneOffset0 : ((z == 1) ? srcPlaneOffset1 : srcPlaneOffset2); + const int dstPlaneOffset = (z == 0) ? dstPlaneOffset0 : ((z == 1) ? dstPlaneOffset1 : dstPlaneOffset2); + const int srcRowWords = (z == 0) ? srcRowWords0: ((z == 1) ? srcRowWords1 : srcRowWords2); + const int dstRowWords = (z == 0) ? dstRowWords0: ((z == 1) ? dstRowWords1 : dstRowWords2); + + __local uchar pixels[64 * 36]; + const int localRowPixels = 64; + const int groupHeight = 16; // src pixel height output by the workgroup + const int ypad = 2; + const int localx = get_local_id(0); + + const int globalStartRow = floor((get_group_id(1) * groupHeight) / yscale); + const int globalRowCount = ceil(groupHeight / yscale) + 2 * ypad; + + float4 weights = xweights[x]; + int4 woffs = floor(x / xscale); + woffs += (int4)(-1, 0, 1, 2); + woffs = clamp(woffs, 0, (srcWidth >> ((z == 0) ? 0 : 1)) - 1); + const int maxy = (srcHeight >> ((z == 0) ? 0 : 1)) - 1; + + // Scale x from global into LDS + + for (int i = 0; i <= globalRowCount; ++i) { + int4 offs = srcPlaneOffset + clamp(globalStartRow - ypad + i, 0, maxy) * srcRowWords; + offs += woffs; + pixels[localx + i * localRowPixels] = convert_uchar(clamp(round(dot(weights, + (float4)(src[offs.x], src[offs.y], src[offs.z], src[offs.w]))), 0.0f, 255.0f)); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scale y from LDS into global + + if (x >= dstWidth >> ((z == 0) ? 0 : 1)) + return; + + int off = dstPlaneOffset + x + (get_group_id(1) * groupHeight) * dstRowWords; + + for (int i = 0; i < groupHeight; ++i) { + if (y >= dstHeight >> ((z == 0) ? 0 : 1)) + break; + int localy = floor((get_group_id(1) * groupHeight + i) / yscale); + localy = localy - globalStartRow + ypad; + int loff = localx + localy * localRowPixels; + dst[off] = convert_uchar(clamp(round(dot(yweights[get_group_id(1) * groupHeight + i], + (float4)(pixels[loff - localRowPixels], pixels[loff], pixels[loff + localRowPixels] + , pixels[loff + localRowPixels * 2]))), 0.0f, 255.0f)); + off += dstRowWords; + } +} +); + + +char *kernel_src_yadif_filter = KERNEL( + void filter_v6( + global unsigned char *dst, + global unsigned char *prev,
+ global unsigned char *cur,
+ global unsigned char *next,
+ int x,
+ int y,
+ int width,
+ int height,
+ int parity,
+ int inlinesize,
+ int outlinesize,
+ int inmode,
+ int uvflag
+ )
+ {
+
+ int flag = uvflag * (y >=height) * height;
+ int prefs = select(-(inlinesize), inlinesize,((y+1) - flag) <height);
+ int mrefs = select(inlinesize, -(inlinesize),y - flag);
+ int mode = select(inmode,2,(y - flag==1) || (y - flag + 2==height));
+ int score;
+
+ global unsigned char *prev2 = parity ? prev : cur ;
+ global unsigned char *next2 = parity ? cur : next;
+ int index = x + y * inlinesize;
+ int outindex = x + y * outlinesize;
+ int c = cur[index + mrefs];
+ int d = (prev2[index] + next2[index])>>1;
+ int e = cur[index + prefs];
+ int temporal_diff0 = abs((prev2[index]) - (next2[index]));
+ int temporal_diff1 =(abs(prev[index + mrefs] - c) + abs(prev[index + prefs] - e) )>>1;
+ int temporal_diff2 =(abs(next[index + mrefs] - c) + abs(next[index + prefs] - e) )>>1;
+ int diff = max(max(temporal_diff0>>1, temporal_diff1), temporal_diff2);
+ int spatial_pred = (c+e)>>1;
+ int spatial_score = abs(cur[index + mrefs-1] - cur[index + prefs-1]) + abs(c-e) + abs(cur[index + mrefs+1] - cur[index + prefs+1]) - 1;
+ //check -1
+ score = abs(cur[index + mrefs-2] - cur[index + prefs])
+ + abs(cur[index + mrefs-1] - cur[index + prefs+1])
+ + abs(cur[index + mrefs] - cur[index + prefs+2]);
+ if (score < spatial_score)
+ {
+ spatial_score= score;
+ spatial_pred= (cur[index + mrefs-1] + cur[index + prefs+1])>>1;
+ }
+ //check -2
+ score = abs(cur[index + mrefs-3] - cur[index + prefs+1])
+ + abs(cur[index + mrefs-2] - cur[index + prefs+2])
+ + abs(cur[index + mrefs-1] - cur[index + prefs+3]);
+ if (score < spatial_score)
+ {
+ spatial_score= score;
+ spatial_pred= (cur[index + mrefs-2] + cur[index + prefs+2])>>1;
+ }
+ //check 1
+ score = abs(cur[index + mrefs] - cur[index + prefs-2])
+ + abs(cur[index + mrefs+1] - cur[index + prefs-1])
+ + abs(cur[index + mrefs+2] - cur[index + prefs]);
+ if (score < spatial_score)
+ {
+ spatial_score= score;
+ spatial_pred= (cur[index + mrefs+1] + cur[index + prefs-1])>>1;
+ }
+ //check 2
+ score = abs(cur[index + mrefs+1] - cur[index + prefs-3])
+ + abs(cur[index + mrefs+2] - cur[index + prefs-2])
+ + abs(cur[index + mrefs+3] - cur[index + prefs-1]);
+ if (score < spatial_score)
+ {
+ spatial_score= score;
+ spatial_pred= (cur[index + mrefs+2] + cur[index + prefs-2])>>1;
+ }
+ if (mode < 2)
+ {
+ int b = (prev2[index + (mrefs<<1)] + next2[index + (mrefs<<1)])>>1;
+ int f = (prev2[index + (prefs<<1)] + next2[index + (prefs<<1)])>>1;
+ int diffmax = max(max(d-e, d-c), min(b-c, f-e));
+ int diffmin = min(min(d-e, d-c), max(b-c, f-e));
+
+ diff = max(max(diff, diffmin), -diffmax);
+ }
+ if (spatial_pred > d + diff)
+ {
+ spatial_pred = d + diff;
+ }
+ else if (spatial_pred < d - diff)
+ {
+ spatial_pred = d - diff;
+ }
+
+ dst[outindex] = spatial_pred;
+ }
+
+ kernel void yadif_filter(
+ global unsigned char *dst,
+ global unsigned char *prev,
+ global unsigned char *cur,
+ global unsigned char *next,
+ int parity,
+ int inlinesizeY,
+ int inlinesizeUV,
+ int outlinesizeY,
+ int outlinesizeUV,
+ int mode)
+ {
+ int x=get_global_id(0);
+ int y=(get_global_id(1)<<1) + (!parity);
+ int width=(get_global_size(0)<<1)/3;
+ int height=get_global_size(1)<<1;
+
+
+ global unsigned char *dst_Y=dst;
+ global unsigned char *dst_U=dst_Y+height*outlinesizeY;
+
+ global unsigned char *prev_Y=prev;
+ global unsigned char *prev_U=prev_Y+height*inlinesizeY;
+
+ global unsigned char *cur_Y=cur;
+ global unsigned char *cur_U=cur_Y+height*inlinesizeY;
+
+ global unsigned char *next_Y=next;
+ global unsigned char *next_U=next_Y+height*inlinesizeY;
+
+ if(x < width)
+ {
+ filter_v6(dst_Y,prev_Y,cur_Y,next_Y,x,y,width,height,parity,inlinesizeY,outlinesizeY,mode,0);
+ }
+ else
+ {
+ x = x - width;
+ filter_v6(dst_U,prev_U,cur_U,next_U,x,y,width>>1,height>>1,parity,inlinesizeUV,outlinesizeUV,mode,1);
+ }
+ }
+ );
+
+#endif
diff --git a/libhb/openclwrapper.c b/libhb/openclwrapper.c new file mode 100644 index 000000000..9a7e9888d --- /dev/null +++ b/libhb/openclwrapper.c @@ -0,0 +1,1261 @@ +/* openclwrapper.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+ */
+
+#ifdef USE_OPENCL
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "openclwrapper.h"
+#include "openclkernels.h"
+
+//#define USE_EXTERNAL_KERNEL
+#ifdef SYS_MINGW
+#include <windows.h>
+#endif
+
+#if defined(__APPLE__)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#if defined(_MSC_VER)
+#define strcasecmp strcmpi
+#endif
+
+#define MAX_KERNEL_STRING_LEN 64
+#define MAX_CLFILE_NUM 50
+#define MAX_CLKERNEL_NUM 200
+#define MAX_CLFILE_PATH 255
+#define MAX_KERNEL_NUM 50
+#define MAX_KERNEL_NAME_LEN 64
+
+#ifndef INVALID_HANDLE_VALUE
+#define INVALID_HANDLE_VALUE NULL
+#endif
+
+//#define THREAD_PRIORITY_TIME_CRITICAL 15
+
+enum VENDOR
+{
+ AMD = 0,
+ Intel,
+ NVIDIA,
+ others
+};
+typedef struct _GPUEnv
+{
+ //share vb in all modules in hb library
+ cl_platform_id platform;
+ cl_device_type dType;
+ cl_context context;
+ cl_device_id * devices;
+ cl_device_id dev;
+ cl_command_queue command_queue;
+ cl_kernel kernels[MAX_CLFILE_NUM];
+ cl_program programs[MAX_CLFILE_NUM]; //one program object maps one kernel source file
+ char kernelSrcFile[MAX_CLFILE_NUM][256]; //the max len of kernel file name is 256
+ int file_count; // only one kernel file
+
+ char kernel_names[MAX_CLKERNEL_NUM][MAX_KERNEL_STRING_LEN+1];
+ cl_kernel_function kernel_functions[MAX_CLKERNEL_NUM];
+ int kernel_count;
+ int isUserCreated; // 1: created , 0:no create and needed to create by opencl wrapper
+ enum VENDOR vendor;
+}GPUEnv;
+
+typedef struct
+{
+ char kernelName[MAX_KERNEL_NAME_LEN+1];
+ char * kernelStr;
+}hb_kernel_node;
+ +static GPUEnv gpu_env; +static int isInited = 0; +static int useBuffers = 0; +static hb_kernel_node gKernels[MAX_KERNEL_NUM]; + +#define ADD_KERNEL_CFG( idx, s, p ){\ + strcpy( gKernels[idx].kernelName, s );\
+ gKernels[idx].kernelStr = p;\
+ strcpy( gpu_env.kernel_names[idx], s );\
+ gpu_env.kernel_count++; }
+
+
+/**
+ * hb_confirm_gpu_type
+ */
+int hb_confirm_gpu_type()
+{
+ int status = 1;
+ unsigned int i, j;
+ cl_uint numPlatforms = 0;
+ status = clGetPlatformIDs(0,NULL,&numPlatforms);
+ if(status != 0)
+ {
+ goto end;
+ }
+ if(numPlatforms > 0)
+ {
+ cl_platform_id* platforms = (cl_platform_id* )malloc (numPlatforms * sizeof(cl_platform_id));
+ status = clGetPlatformIDs (numPlatforms, platforms, NULL);
+ if (status != 0)
+ {
+ goto end;
+ }
+ for (i=0; i < numPlatforms; i++)
+ {
+ char pbuff[100];
+ cl_uint numDevices;
+ status = clGetPlatformInfo( platforms[i],
+ CL_PLATFORM_VENDOR,
+ sizeof (pbuff),
+ pbuff,
+ NULL);
+ if (status)
+ continue;
+ status = clGetDeviceIDs( platforms[i],
+ CL_DEVICE_TYPE_GPU ,
+ 0 ,
+ NULL ,
+ &numDevices);
+
+ cl_device_id *devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
+ status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
+ for (j = 0; j < numDevices; j++)
+ {
+ char dbuff[100];
+ status = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(dbuff), dbuff, NULL);
+ if (!strcmp(dbuff, "Advanced Micro Devices, Inc.") ||
+ !strcmp(dbuff, "Intel(R) Corporation") ||
+#ifdef __APPLE__
+ !strcmp(dbuff, "AMD") ||
+ /* MacBook Pro, AMD ATI Radeon HD 6750M, OS X 10.8.3 */
+ !strcmp(dbuff, "NVIDIA") ||
+ /* MacBook Pro, NVIDIA GeForce GT 330M, OS X 10.7.4 */
+#endif
+ !strcmp(dbuff, "NVIDIA Corporation"))
+ {
+ return 0;
+ }
+ }
+
+ if ( status != CL_SUCCESS )
+ continue;
+ if( numDevices )
+ break;
+ }
+ free( platforms );
+ }
+ end:
+ return -1;
+}
+
+/**
+ * hb_regist_opencl_kernel
+ */ +int hb_regist_opencl_kernel() +{ + //if( !gpu_env.isUserCreated ) + // memset( &gpu_env, 0, sizeof(gpu_env) ); + //Comment for posterity: When in doubt just zero out a structure full of pointers to allocated resources. + + gpu_env.file_count = 0; //argc; + gpu_env.kernel_count = 0UL; + + ADD_KERNEL_CFG( 0, "frame_scale", NULL ) + ADD_KERNEL_CFG( 1, "yadif_filter", NULL ) + + return 0; +} +
+/**
+ * hb_regist_opencl_kernel
+ * @param filename -
+ * @param source -
+ * @param gpu_info -
+ * @param int idx -
+ */
+int hb_convert_to_string( const char *filename, char **source, GPUEnv *gpu_info, int idx )
+{
+ int file_size;
+ size_t result;
+ FILE * file = NULL;
+ file_size = 0;
+ result = 0;
+ file = fopen( filename, "rb+" );
+
+ if( file!=NULL )
+ {
+ fseek( file, 0, SEEK_END );
+
+ file_size = ftell( file );
+ rewind( file );
+ *source = (char*)malloc( sizeof(char) * file_size + 1 );
+ if( *source == (char*)NULL )
+ {
+ return(0);
+ }
+ result = fread( *source, 1, file_size, file );
+ if( result != file_size )
+ {
+ free( *source );
+ return(0);
+ }
+ (*source)[file_size] = '\0';
+ fclose( file );
+
+ return(1);
+ }
+ return(0);
+}
+
+/**
+ * hb_binary_generated
+ * @param context -
+ * @param cl_file_name -
+ * @param fhandle -
+ */
+int hb_binary_generated( cl_context context, const char * cl_file_name, FILE ** fhandle )
+{
+ int i = 0;
+ cl_int status;
+ cl_uint numDevices;
+ cl_device_id *devices;
+ char * str = NULL;
+ FILE * fd = NULL;
+
+ status = clGetContextInfo( context,
+ CL_CONTEXT_NUM_DEVICES,
+ sizeof(numDevices),
+ &numDevices,
+ NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: Get context info failed" );
+ return 0;
+ }
+
+ devices = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );
+ if( devices == NULL )
+ {
+ hb_log( "OpenCL: No device found" );
+ return 0;
+ }
+
+ /* grab the handles to all of the devices in the context. */
+ status = clGetContextInfo( context,
+ CL_CONTEXT_DEVICES,
+ sizeof(cl_device_id) * numDevices,
+ devices,
+ NULL );
+
+ status = 0;
+ /* dump out each binary into its own separate file. */
+ for (i = 0; i < numDevices; i++)
+ {
+ char fileName[256] = { 0 };
+ char cl_name[128] = { 0 };
+ if (devices[i])
+ {
+ char deviceName[1024];
+ status = clGetDeviceInfo(devices[i],
+ CL_DEVICE_NAME,
+ sizeof(deviceName),
+ deviceName,
+ NULL);
+
+ str = (char*)strstr(cl_file_name, ".cl");
+ memcpy(cl_name, cl_file_name, str - cl_file_name);
+ cl_name[str - cl_file_name] = '\0';
+ sprintf(fileName, "./%s - %s.bin", cl_name, deviceName);
+ fd = fopen(fileName, "rb");
+ status = fd != NULL;
+ }
+ }
+
+ if( devices != NULL )
+ {
+ free( devices );
+ devices = NULL;
+ }
+
+ if( fd != NULL )
+ *fhandle = fd;
+
+ return status;
+}
+
+/**
+ * hb_write_binary_to_file
+ * @param fileName -
+ * @param birary -
+ * @param numBytes -
+ */
+int hb_write_binary_to_file( const char* fileName, const char* birary, size_t numBytes )
+{
+ FILE *output = NULL;
+ output = fopen( fileName, "wb" );
+ if( output == NULL )
+ return 0;
+
+ fwrite( birary, sizeof(char), numBytes, output );
+ fclose( output );
+
+ return 1;
+}
+
+/**
+ * hb_generat_bin_from_kernel_source
+ * @param program -
+ * @param cl_file_name -
+ */
+int hb_generat_bin_from_kernel_source( cl_program program, const char * cl_file_name )
+{
+ int i = 0;
+ cl_int status;
+ cl_uint numDevices;
+ size_t *binarySizes;
+ cl_device_id *devices;
+ char **binaries;
+ char *str = NULL;
+
+ status = clGetProgramInfo( program,
+ CL_PROGRAM_NUM_DEVICES,
+ sizeof(numDevices),
+ &numDevices,
+ NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_NUM_DEVICES failed");
+ return 0;
+ }
+
+ devices = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );
+ if( devices == NULL )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: no device found");
+ return 0;
+ }
+
+ /* grab the handles to all of the devices in the program. */
+ status = clGetProgramInfo( program,
+ CL_PROGRAM_DEVICES,
+ sizeof(cl_device_id) * numDevices,
+ devices,
+ NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_DEVICES failed");
+ return 0;
+ }
+
+ /* figure out the sizes of each of the binaries. */
+ binarySizes = (size_t*)malloc( sizeof(size_t) * numDevices );
+
+ status = clGetProgramInfo( program,
+ CL_PROGRAM_BINARY_SIZES,
+ sizeof(size_t) * numDevices,
+ binarySizes, NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_BINARY_SIZES failed");
+ return 0;
+ }
+
+ /* copy over all of the generated binaries. */
+ binaries = (char**)malloc( sizeof(char *) * numDevices );
+ if( binaries == NULL )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: malloc for binaries failed");
+ return 0;
+ }
+
+ for( i = 0; i < numDevices; i++ )
+ {
+ if( binarySizes[i] != 0 )
+ {
+ binaries[i] = (char*)malloc( sizeof(char) * binarySizes[i] );
+ if( binaries[i] == NULL )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: malloc for binaries[%d] failed", i);
+ return 0;
+ }
+ }
+ else
+ {
+ binaries[i] = NULL;
+ }
+ }
+
+ status = clGetProgramInfo( program,
+ CL_PROGRAM_BINARIES,
+ sizeof(char *) * numDevices,
+ binaries,
+ NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_BINARIES failed");
+ return 0;
+ }
+
+ /* dump out each binary into its own separate file. */
+ for (i = 0; i < numDevices; i++)
+ {
+ char fileName[256] = {0};
+ char cl_name[128] = {0};
+ if (binarySizes[i])
+ {
+ char deviceName[1024];
+ status = clGetDeviceInfo(devices[i],
+ CL_DEVICE_NAME,
+ sizeof(deviceName),
+ deviceName,
+ NULL);
+
+ str = (char*)strstr( cl_file_name, (char*)".cl" );
+ memcpy(cl_name, cl_file_name, str - cl_file_name);
+ cl_name[str - cl_file_name] = '\0';
+ sprintf(fileName, "./%s - %s.bin", cl_name, deviceName);
+
+ if (!hb_write_binary_to_file(fileName, binaries[i], binarySizes[i]))
+ {
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: unable to write kernel, writing to temporary directory instead.");
+ return 0;
+ }
+ }
+ }
+
+ // Release all resouces and memory
+ for( i = 0; i < numDevices; i++ )
+ {
+ if( binaries[i] != NULL )
+ {
+ free( binaries[i] );
+ binaries[i] = NULL;
+ }
+ }
+
+ if( binaries != NULL )
+ {
+ free( binaries );
+ binaries = NULL;
+ }
+
+ if( binarySizes != NULL )
+ {
+ free( binarySizes );
+ binarySizes = NULL;
+ }
+
+ if( devices != NULL )
+ {
+ free( devices );
+ devices = NULL;
+ }
+ return 1;
+}
+
+
+/**
+ * hb_init_opencl_attr
+ * @param env -
+ */
+int hb_init_opencl_attr( OpenCLEnv * env )
+{
+ if( gpu_env.isUserCreated )
+ return 1;
+
+ gpu_env.context = env->context;
+ gpu_env.platform = env->platform;
+ gpu_env.dev = env->devices;
+ gpu_env.command_queue = env->command_queue;
+
+ gpu_env.isUserCreated = 1;
+
+ return 0;
+}
+
+/**
+ * hb_create_kernel
+ * @param kernelname -
+ * @param env -
+ */
+int hb_create_kernel( char * kernelname, KernelEnv * env )
+{
+ int status;
+ env->kernel = clCreateKernel( gpu_env.programs[0], kernelname, &status );
+ env->context = gpu_env.context;
+ env->command_queue = gpu_env.command_queue;
+ return status != CL_SUCCESS ? 1 : 0;
+}
+
+/**
+ * hb_release_kernel
+ * @param env -
+ */
+int hb_release_kernel( KernelEnv * env )
+{
+ int status = clReleaseKernel( env->kernel );
+ return status != CL_SUCCESS ? 1 : 0;
+}
+
+/**
+ * hb_init_opencl_env + * @param gpu_info - + */ + +static int init_once = 0; +int hb_init_opencl_env( GPUEnv *gpu_info ) +{ + size_t length; + cl_int status;
+ cl_uint numPlatforms, numDevices;
+ cl_platform_id *platforms;
+ cl_context_properties cps[3];
+ char platformName[100];
+ unsigned int i; + void *handle = INVALID_HANDLE_VALUE; + + + if (init_once != 0) + return 0; + else + init_once = 1; + /* + * Have a look at the available platforms. + */ + if( !gpu_info->isUserCreated )
+ {
+ status = clGetPlatformIDs( 0, NULL, &numPlatforms );
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: OpenCL device platform not found." );
+ return(1);
+ }
+
+ gpu_info->platform = NULL;
+ if( 0 < numPlatforms )
+ {
+ platforms = (cl_platform_id*)malloc(
+ numPlatforms * sizeof(cl_platform_id));
+ if( platforms == (cl_platform_id*)NULL )
+ {
+ return(1);
+ }
+ status = clGetPlatformIDs( numPlatforms, platforms, NULL );
+
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: Specific opencl platform not found." );
+ return(1);
+ }
+
+ for( i = 0; i < numPlatforms; i++ )
+ {
+ status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
+ sizeof(platformName), platformName,
+ NULL );
+
+ if( status != CL_SUCCESS )
+ {
+ continue;
+ }
+ gpu_info->platform = platforms[i];
+
+ if (!strcmp(platformName, "Advanced Micro Devices, Inc.") ||
+ !strcmp(platformName, "AMD"))
+ gpu_info->vendor = AMD;
+ else
+ gpu_info->vendor = others;
+
+ gpu_info->platform = platforms[i];
+
+ status = clGetDeviceIDs( gpu_info->platform /* platform */,
+ CL_DEVICE_TYPE_GPU /* device_type */,
+ 0 /* num_entries */,
+ NULL /* devices */,
+ &numDevices );
+
+ if( status != CL_SUCCESS )
+ {
+ continue;
+ }
+
+ if( numDevices )
+ break;
+
+ }
+ free( platforms );
+ }
+
+ if( NULL == gpu_info->platform )
+ {
+ hb_log( "OpenCL: No OpenCL-compatible GPU found." );
+ return(1);
+ }
+
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: No OpenCL-compatible GPU found." );
+ return(1);
+ }
+
+ /*
+ * Use available platform.
+ */
+ cps[0] = CL_CONTEXT_PLATFORM;
+ cps[1] = (cl_context_properties)gpu_info->platform;
+ cps[2] = 0;
+ /* Check for GPU. */
+ gpu_info->dType = CL_DEVICE_TYPE_GPU;
+ gpu_info->context = clCreateContextFromType(
+ cps, gpu_info->dType, NULL, NULL, &status );
+
+ if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )
+ {
+ gpu_info->dType = CL_DEVICE_TYPE_CPU;
+ gpu_info->context = clCreateContextFromType(
+ cps, gpu_info->dType, NULL, NULL, &status );
+ }
+
+ if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )
+ {
+ gpu_info->dType = CL_DEVICE_TYPE_DEFAULT;
+ gpu_info->context = clCreateContextFromType(
+ cps, gpu_info->dType, NULL, NULL, &status );
+ }
+
+ if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )
+ {
+ hb_log( "OpenCL: Unable to create opencl context." );
+ return(1);
+ }
+
+ /* Detect OpenCL devices. */
+ /* First, get the size of device list data */
+ status = clGetContextInfo( gpu_info->context, CL_CONTEXT_DEVICES,
+ 0, NULL, &length );
+ if((status != CL_SUCCESS) || (length == 0))
+ {
+ hb_log( "OpenCL: Unable to get the list of devices in context." );
+ return(1);
+ }
+
+ /* Now allocate memory for device list based on the size we got earlier */
+ gpu_info->devices = (cl_device_id*)malloc( length );
+ if( gpu_info->devices == (cl_device_id*)NULL )
+ {
+ return(1);
+ }
+
+ /* Now, get the device list data */
+ status = clGetContextInfo( gpu_info->context, CL_CONTEXT_DEVICES, length,
+ gpu_info->devices, NULL );
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: Unable to get the device list data in context." );
+ return(1);
+ }
+
+ /* Create OpenCL command queue. */
+ gpu_info->command_queue = clCreateCommandQueue( gpu_info->context,
+ gpu_info->devices[0],
+ 0, &status );
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: Unable to create opencl command queue." );
+ return(1);
+ }
+ }
+
+ if( clGetCommandQueueInfo( gpu_info->command_queue,
+ CL_QUEUE_THREAD_HANDLE_AMD, sizeof(handle),
+ &handle, NULL ) == CL_SUCCESS && handle != INVALID_HANDLE_VALUE )
+ {
+#ifdef SYS_MINGW
+ SetThreadPriority( handle, THREAD_PRIORITY_TIME_CRITICAL );
+#endif
+ }
+
+ return 0;
+}
+
+
+/**
+ * hb_release_opencl_env
+ * @param gpu_info -
+ */
+int hb_release_opencl_env( GPUEnv *gpu_info )
+{
+ if( !isInited )
+ return 1;
+ int i;
+
+ for( i = 0; i<gpu_env.file_count; i++ )
+ {
+ if( gpu_env.programs[i] ) ;
+ {
+ clReleaseProgram( gpu_env.programs[i] );
+ gpu_env.programs[i] = NULL;
+ }
+ }
+
+ if( gpu_env.command_queue )
+ {
+ clReleaseCommandQueue( gpu_env.command_queue );
+ gpu_env.command_queue = NULL;
+ }
+
+ if( gpu_env.context )
+ {
+ clReleaseContext( gpu_env.context );
+ gpu_env.context = NULL;
+ }
+
+ isInited = 0;
+ gpu_info->isUserCreated = 0;
+ return 1;
+}
+
+
+/**
+ * hb_register_kernel_wrapper
+ * @param kernel_name -
+ * @param function -
+ */
+int hb_register_kernel_wrapper( const char *kernel_name, cl_kernel_function function )
+{
+ int i;
+ for( i = 0; i < gpu_env.kernel_count; i++ )
+ {
+ if( strcasecmp( kernel_name, gpu_env.kernel_names[i] ) == 0 )
+ {
+ gpu_env.kernel_functions[i] = function;
+ return(1);
+ }
+ }
+ return(0);
+}
+
+/**
+ * hb_cached_of_kerner_prg
+ * @param gpu_env -
+ * @param cl_file_name -
+ */
+int hb_cached_of_kerner_prg( const GPUEnv *gpu_env, const char * cl_file_name )
+{
+ int i;
+ for( i = 0; i < gpu_env->file_count; i++ )
+ {
+ if( strcasecmp( gpu_env->kernelSrcFile[i], cl_file_name ) == 0 )
+ {
+ if( gpu_env->programs[i] != NULL )
+ return(1);
+ }
+ }
+
+ return(0);
+}
+
+/**
+ * hb_compile_kernel_file
+ * @param filename -
+ * @param gpu_info -
+ * @param indx -
+ * @param build_option -
+ */
+int hb_compile_kernel_file( const char *filename, GPUEnv *gpu_info,
+ int indx, const char *build_option )
+{
+ cl_int status;
+ size_t length;
+ char *source_str;
+ const char *source;
+ size_t source_size[1];
+ char *buildLog = NULL;
+ int b_error, binary_status, binaryExisted;
+ char * binary;
+ cl_uint numDevices;
+ cl_device_id *devices;
+ FILE * fd;
+ FILE * fd1;
+ int idx;
+
+ if( hb_cached_of_kerner_prg( gpu_info, filename ) == 1 )
+ return (1);
+
+ idx = gpu_info->file_count;
+
+#ifdef USE_EXTERNAL_KERNEL
+ status = hb_convert_to_string( filename, &source_str, gpu_info, idx );
+ if( status == 0 ) + return(0); +#else + int kernel_src_size = strlen(kernel_src_scale) + strlen(kernel_src_yadif_filter); + +// char *scale_src; +// status = hb_convert_to_string("./scale_kernels.cl", &scale_src, gpu_info, idx); +// if (status != 0) +// kernel_src_size += strlen(scale_src); + + source_str = (char*)malloc( kernel_src_size + 2 ); + strcpy( source_str, kernel_src_scale ); +// strcat( source_str, scale_src ); // + strcat( source_str, kernel_src_yadif_filter ); +#endif + + source = source_str;
+ source_size[0] = strlen( source );
+
+ if ((binaryExisted = hb_binary_generated(gpu_info->context, filename, &fd)) == 1)
+ {
+ status = clGetContextInfo(gpu_info->context,
+ CL_CONTEXT_NUM_DEVICES,
+ sizeof(numDevices),
+ &numDevices,
+ NULL);
+ if (status != CL_SUCCESS)
+ {
+ hb_log("OpenCL: Unable to get the number of devices in context.");
+ return 0;
+ }
+
+ devices = (cl_device_id*)malloc(sizeof(cl_device_id) * numDevices);
+ if (devices == NULL)
+ return 0;
+
+ length = 0;
+ b_error = 0;
+ b_error |= fseek(fd, 0, SEEK_END) < 0;
+ b_error |= (length = ftell(fd)) <= 0;
+ b_error |= fseek(fd, 0, SEEK_SET) < 0;
+ if (b_error)
+ return 0;
+
+ binary = (char*)calloc(length + 2, sizeof(char));
+ if (binary == NULL)
+ return 0;
+
+ b_error |= fread(binary, 1, length, fd) != length;
+#if 0 // this doesn't work under OS X and/or with some non-AMD GPUs
+ if (binary[length-1] != '\n')
+ binary[length++] = '\n;
+#endif
+
+ if (b_error)
+ return 0;
+
+ /* grab the handles to all of the devices in the context. */
+ status = clGetContextInfo(gpu_info->context,
+ CL_CONTEXT_DEVICES,
+ sizeof(cl_device_id) * numDevices,
+ devices,
+ NULL);
+
+ gpu_info->programs[idx] = clCreateProgramWithBinary(gpu_info->context,
+ numDevices,
+ devices,
+ &length,
+ (const unsigned char**)&binary,
+ &binary_status,
+ &status);
+
+ fclose(fd);
+ free(devices);
+ fd = NULL;
+ devices = NULL;
+ }
+ else
+ {
+ /* create a CL program using the kernel source */
+ gpu_info->programs[idx] = clCreateProgramWithSource(
+ gpu_info->context, 1, &source, source_size, &status );
+ }
+
+ if((gpu_info->programs[idx] == (cl_program)NULL) || (status != CL_SUCCESS)){
+ hb_log( "OpenCL: Unable to get list of devices in context." );
+ return(0);
+ }
+
+ /* create a cl program executable for all the devices specified */
+ if( !gpu_info->isUserCreated )
+ {
+ status = clBuildProgram( gpu_info->programs[idx], 1, gpu_info->devices,
+ build_option, NULL, NULL );
+ }
+ else
+ {
+ status = clBuildProgram( gpu_info->programs[idx], 1, &(gpu_info->dev),
+ build_option, NULL, NULL );
+ }
+
+ if( status != CL_SUCCESS )
+ {
+ if( !gpu_info->isUserCreated )
+ {
+ status = clGetProgramBuildInfo( gpu_info->programs[idx],
+ gpu_info->devices[0],
+ CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
+ }
+ else
+ {
+ status = clGetProgramBuildInfo( gpu_info->programs[idx],
+ gpu_info->dev,
+ CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
+ }
+
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: Unable to get GPU build information." );
+ return(0);
+ }
+
+ buildLog = (char*)malloc( length );
+ if( buildLog == (char*)NULL )
+ {
+ return(0);
+ }
+
+ if( !gpu_info->isUserCreated )
+ {
+ status = clGetProgramBuildInfo( gpu_info->programs[idx], gpu_info->devices[0],
+ CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
+ }
+ else
+ {
+ status = clGetProgramBuildInfo( gpu_info->programs[idx], gpu_info->dev,
+ CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
+ }
+
+ fd1 = fopen( "kernel-build.log", "w+" );
+ if( fd1 != NULL ) {
+ fwrite( buildLog, sizeof(char), length, fd1 );
+ fclose( fd1 );
+ }
+
+ free( buildLog );
+ return(0);
+ }
+
+ strcpy( gpu_env.kernelSrcFile[idx], filename );
+ + if (binaryExisted != 1) + { + //hb_generat_bin_from_kernel_source(gpu_env.programs[idx], filename); + } + + gpu_info->file_count += 1; +
+ return(1);
+}
+
+
+/**
+ * hb_get_kernel_env_and_func
+ * @param kernel_name -
+ * @param env -
+ * @param function -
+ */
+int hb_get_kernel_env_and_func( const char *kernel_name,
+ KernelEnv *env,
+ cl_kernel_function *function )
+{
+ int i;
+ for( i = 0; i < gpu_env.kernel_count; i++ )
+ {
+ if( strcasecmp( kernel_name, gpu_env.kernel_names[i] ) == 0 )
+ {
+ env->context = gpu_env.context;
+ env->command_queue = gpu_env.command_queue;
+ env->program = gpu_env.programs[0];
+ env->kernel = gpu_env.kernels[i];
+ env->isAMD = ( gpu_env.vendor == AMD ) ? 1 : 0;
+ *function = gpu_env.kernel_functions[i];
+ return(1);
+ }
+ }
+ return(0);
+}
+
+/**
+ * hb_get_kernel_env_and_func
+ * @param kernel_name -
+ * @param userdata -
+ */
+int hb_run_kernel( const char *kernel_name, void **userdata )
+{
+ KernelEnv env;
+ cl_kernel_function function;
+ int status;
+ memset( &env, 0, sizeof(KernelEnv));
+ status = hb_get_kernel_env_and_func( kernel_name, &env, &function );
+ strcpy( env.kernel_name, kernel_name );
+ if( status == 1 )
+ {
+ return(function( userdata, &env ));
+ }
+
+ return(0);
+}
+
+/**
+ * hb_init_opencl_run_env
+ * @param argc -
+ * @param argv -
+ * @param build_option -
+ */
+int hb_init_opencl_run_env( int argc, char **argv, const char *build_option )
+{
+ int status = 0;
+ if( MAX_CLKERNEL_NUM <= 0 )
+ {
+ return 1;
+ }
+
+ if((argc > MAX_CLFILE_NUM) || (argc<0))
+ {
+ return 1;
+ }
+
+ if( !isInited )
+ {
+ hb_regist_opencl_kernel();
+
+ /*initialize devices, context, comand_queue*/
+ status = hb_init_opencl_env( &gpu_env );
+ if( status )
+ return(1);
+
+ /*initialize program, kernel_name, kernel_count*/
+ status = hb_compile_kernel_file("hb-opencl-kernels.cl",
+ &gpu_env, 0, build_option);
+
+ if( status == 0 || gpu_env.kernel_count == 0 )
+ {
+ return(1);
+ + } + + useBuffers = 1; + isInited = 1; + } + + return(0);
+}
+
+/**
+ * hb_release_opencl_run_env
+ */
+int hb_release_opencl_run_env()
+{
+ return hb_release_opencl_env( &gpu_env );
+}
+
+/**
+ * hb_opencl_stats
+ */
+int hb_opencl_stats()
+{
+ return isInited;
+}
+
+/**
+ * hb_get_opencl_env
+ */
+int hb_get_opencl_env()
+{
+ int i = 0;
+ cl_int status;
+ cl_uint numDevices;
+ cl_device_id *devices;
+
+ /*initialize devices, context, comand_queue*/
+ status = hb_init_opencl_env( &gpu_env );
+ if( status )
+ return(1);
+ status = clGetContextInfo( gpu_env.context,
+ CL_CONTEXT_NUM_DEVICES,
+ sizeof(numDevices),
+ &numDevices,
+ NULL );
+ if( status != CL_SUCCESS )
+ return 0;
+
+ devices = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );
+ if( devices == NULL )
+ return 0;
+
+ /* grab the handles to all of the devices in the context. */
+ status = clGetContextInfo( gpu_env.context,
+ CL_CONTEXT_DEVICES,
+ sizeof(cl_device_id) * numDevices,
+ devices,
+ NULL );
+
+ for (i = 0; i < numDevices; i++)
+ {
+ if (devices[i] != NULL)
+ {
+ char deviceVendor[100], deviceName[1024], driverVersion[1024];
+ clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(deviceVendor),
+ deviceVendor, NULL);
+ clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(deviceName),
+ deviceName, NULL);
+ clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(driverVersion),
+ driverVersion, NULL);
+ hb_log("hb_get_opencl_env: GPU #%d, Device Vendor: %s", i + 1, deviceVendor);
+ hb_log("hb_get_opencl_env: GPU #%d, Device Name: %s", i + 1, deviceName);
+ hb_log("hb_get_opencl_env: GPU #%d, Driver Version: %s", i + 1, driverVersion);
+ }
+ }
+
+ if( devices != NULL )
+ {
+ free( devices );
+ devices = NULL;
+ }
+
+ return status;
+}
+
+/**
+ * hb_create_buffer
+ * @param cl_inBuf -
+ * @param flags -
+ * @param size -
+ */
+int hb_create_buffer( cl_mem *cl_Buf, int flags, int size )
+{
+ int status;
+ *cl_Buf = clCreateBuffer( gpu_env.context, (flags), (size), NULL, &status );
+
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: clCreateBuffer error '%d'", status );
+ return 0;
+ }
+
+ return 1;
+}
+
+
+/**
+ * hb_read_opencl_buffer
+ * @param cl_inBuf -
+ * @param outbuf -
+ * @param size -
+ */
+int hb_read_opencl_buffer( cl_mem cl_inBuf, unsigned char *outbuf, int size )
+{
+ int status;
+
+ status = clEnqueueReadBuffer( gpu_env.command_queue, cl_inBuf, CL_TRUE, 0, size, outbuf, 0, 0, 0 );
+ if( status != CL_SUCCESS )
+ {
+ hb_log( "OpenCL: av_read_opencl_buffer error '%d'", status );
+ return 0;
+ }
+
+ return 1; +} + +int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size) +{ + int status; + int flags = CL_MEM_ALLOC_HOST_PTR; + //cl_event event; + *mem = clCreateBuffer(gpu_env.context, flags, size, NULL, &status); + *addr = clEnqueueMapBuffer(gpu_env.command_queue, *mem, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL/*&event*/, &status); + + //hb_log("\t **** context: %.8x cmdqueue: %.8x cl_mem: %.8x mapaddr: %.8x size: %d status: %d", gpu_env.context, gpu_env.command_queue, mem, addr, size, status); + + return (status == CL_SUCCESS) ? 1 : 0; +} + +int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr) +{ + cl_event event; + int status = clEnqueueUnmapMemObject(gpu_env.command_queue, mem, addr, 0, NULL, &event); + if (status == CL_SUCCESS) + clWaitForEvents(1, &event); + else + hb_log("hb_free_mapped_buffer: error %d", status); + return (status == CL_SUCCESS) ? 1 : 0; +} + +void hb_opencl_init() +{ + hb_get_opencl_env(); +} + +int hb_use_buffers() +{ + return useBuffers; +} + +int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb) +{ + int status = clEnqueueCopyBuffer(gpu_env.command_queue, + src_buffer,
+ dst_buffer,
+ src_offset, dst_offset, cb,
+ 0, 0, 0);
+ if( status != CL_SUCCESS )
+ {
+ av_log(NULL,AV_LOG_ERROR, "hb_read_opencl_buffer error '%d'\n", status );
+ return 0;
+ }
+ return 1;
+}
+
+int hb_read_opencl_frame_buffer(cl_mem cl_inBuf,unsigned char *Ybuf,unsigned char *Ubuf,unsigned char *Vbuf,int linesize0,int linesize1,int linesize2,int height)
+{
+
+ int chrH = -(-height >> 1);
+ unsigned char *temp = (unsigned char *)av_malloc(sizeof(uint8_t) * (linesize0 * height + linesize1 * chrH * 2));
+ if(hb_read_opencl_buffer(cl_inBuf,temp,sizeof(uint8_t)*(linesize0 + linesize1)*height))
+ {
+ memcpy(Ybuf,temp,linesize0 * height);
+ memcpy(Ubuf,temp + linesize0 * height,linesize1 *chrH);
+ memcpy(Vbuf,temp + linesize0 * height + linesize1 * chrH,linesize2 * chrH);
+
+ }
+ av_free(temp);
+
+ return 1;
+}
+
+int hb_write_opencl_frame_buffer(cl_mem cl_inBuf,unsigned char *Ybuf,unsigned char *Ubuf,unsigned char *Vbuf,int linesize0,int linesize1,int linesize2,int height,int offset)
+{
+ int status;
+ void *mapped = clEnqueueMapBuffer( gpu_env.command_queue, cl_inBuf, CL_TRUE,CL_MAP_WRITE, 0, sizeof(uint8_t) * (linesize0 + linesize1)*height + offset, 0, NULL, NULL, NULL );
+ uint8_t *temp = (uint8_t *)mapped;
+ temp += offset;
+ memcpy(temp,Ybuf,sizeof(uint8_t) * linesize0 * height);
+ memcpy(temp + sizeof(uint8_t) * linesize0 * height,Ubuf,sizeof(uint8_t) * linesize1 * height/2);
+ memcpy(temp + sizeof(uint8_t) * (linesize0 * height + linesize1 * height/2),Vbuf,sizeof(uint8_t) * linesize2 * height/2);
+ clEnqueueUnmapMemObject(gpu_env.command_queue, cl_inBuf, mapped, 0, NULL, NULL );
+ return 1;
+}
+
+cl_command_queue hb_get_command_queue()
+{
+ return gpu_env.command_queue;
+}
+
+cl_context hb_get_context()
+{
+ return gpu_env.context;
+}
+#endif
diff --git a/libhb/openclwrapper.h b/libhb/openclwrapper.h new file mode 100644 index 000000000..8436c3a9c --- /dev/null +++ b/libhb/openclwrapper.h @@ -0,0 +1,89 @@ +/* openclwrapper.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+
+ */
+#ifndef __OPENCL_WRAPPER_H
+#define __OPENCL_WRAPPER_H
+#ifdef USE_OPENCL
+#include "common.h"
+
+//support AMD opencl
+#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
+#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
+
+typedef struct _KernelEnv
+{
+ cl_context context;
+ cl_command_queue command_queue;
+ cl_program program;
+ cl_kernel kernel;
+ char kernel_name[150];
+ int isAMD;
+}KernelEnv;
+
+typedef struct _OpenCLEnv
+{
+ cl_platform_id platform;
+ cl_context context;
+ cl_device_id devices;
+ cl_command_queue command_queue;
+}OpenCLEnv;
+
+
+//user defined, this is function wrapper which is used to set the input parameters ,
+//luanch kernel and copy data from GPU to CPU or CPU to GPU.
+typedef int (*cl_kernel_function)( void **userdata, KernelEnv *kenv );
+
+// registe a wapper for running the kernel specified by the kernel name
+int hb_register_kernel_wrapper( const char *kernel_name, cl_kernel_function function );
+
+// run kernel , user call this function to luanch kernel.
+// kernel_name: this kernel name is used to find the kernel in opencl runtime environment
+// userdata: this userdata is the all parameters for running the kernel specified by kernel name
+int hb_run_kernel( const char *kernel_name, void **userdata );
+
+// init the run time environment , this function must be called befor calling any function related to opencl
+// the argc must be set zero , argv must be set NULL, build_option is the options for build the kernel.
+int hb_init_opencl_run_env( int argc, char **argv, const char *build_option );
+
+//relase all resource about the opencl , this function must be called after calling any functions related to opencl
+int hb_release_opencl_run_env();
+
+// get the opencl status , 0: not init ; 1, inited; this function is used the check whether or not the opencl run time has been created
+int hb_opencl_stats();
+
+// update opencl run time environments , such as commandqueue , platforme, context. program
+int hb_init_opencl_attr( OpenCLEnv * env );
+
+// create kernel object by a kernel name on the specified opencl run time indicated by env parameter
+int hb_create_kernel( char * kernelname, KernelEnv * env );
+
+// release kernel object which is generated by calling the hb_create_kernel api +int hb_release_kernel( KernelEnv * env ); + +void hb_opencl_init(); + +int hb_get_opencl_env(); + +int hb_create_buffer(cl_mem *cl_Buf,int flags,int size); + +int hb_read_opencl_buffer(cl_mem cl_inBuf,unsigned char *outbuf,int size); + +int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size); + +int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr); + +int hb_use_buffers(); + +int hb_confirm_gpu_type(); +#endif +#endif diff --git a/libhb/scale.c b/libhb/scale.c new file mode 100644 index 000000000..9dcff28ac --- /dev/null +++ b/libhb/scale.c @@ -0,0 +1,1020 @@ +/* scale.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+
+ */
+
+#ifdef USE_OPENCL
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "hb.h"
+#include "scale.h"
+#include "scale_kernel.h"
+#include "libavutil/pixdesc.h"
+
+#define isScaleRGBinInt(x) \
+ ( \
+ (x)==AV_PIX_FMT_RGB48BE || \
+ (x)==AV_PIX_FMT_RGB48LE || \
+ (x)==AV_PIX_FMT_RGB32 || \
+ (x)==AV_PIX_FMT_RGB32_1 || \
+ (x)==AV_PIX_FMT_RGB24 || \
+ (x)==AV_PIX_FMT_RGB565BE || \
+ (x)==AV_PIX_FMT_RGB565LE || \
+ (x)==AV_PIX_FMT_RGB555BE || \
+ (x)==AV_PIX_FMT_RGB555LE || \
+ (x)==AV_PIX_FMT_RGB444BE || \
+ (x)==AV_PIX_FMT_RGB444LE || \
+ (x)==AV_PIX_FMT_RGB8 || \
+ (x)==AV_PIX_FMT_RGB4 || \
+ (x)==AV_PIX_FMT_RGB4_BYTE || \
+ (x)==AV_PIX_FMT_MONOBLACK || \
+ (x)==AV_PIX_FMT_MONOWHITE \
+ )
+#define isScaleBGRinInt(x) \
+ ( \
+ (x)==AV_PIX_FMT_BGR48BE || \
+ (x)==AV_PIX_FMT_BGR48LE || \
+ (x)==AV_PIX_FMT_BGR32 || \
+ (x)==AV_PIX_FMT_BGR32_1 || \
+ (x)==AV_PIX_FMT_BGR24 || \
+ (x)==AV_PIX_FMT_BGR565BE || \
+ (x)==AV_PIX_FMT_BGR565LE || \
+ (x)==AV_PIX_FMT_BGR555BE || \
+ (x)==AV_PIX_FMT_BGR555LE || \
+ (x)==AV_PIX_FMT_BGR444BE || \
+ (x)==AV_PIX_FMT_BGR444LE || \
+ (x)==AV_PIX_FMT_BGR8 || \
+ (x)==AV_PIX_FMT_BGR4 || \
+ (x)==AV_PIX_FMT_BGR4_BYTE|| \
+ (x)==AV_PIX_FMT_MONOBLACK|| \
+ (x)==AV_PIX_FMT_MONOWHITE \
+ )
+
+#define isScaleAnyRGB(x) \
+ ( \
+ isScaleRGBinInt(x) || \
+ isScaleBGRinInt(x) \
+ )
+
+#define isScaleGray(x) \
+ ((x) == AV_PIX_FMT_GRAY8 || \
+ (x) == AV_PIX_FMT_Y400A || \
+ (x) == AV_PIX_FMT_GRAY16BE || \
+ (x) == AV_PIX_FMT_GRAY16LE)
+
+static ScaleContext *g_scale;
+
+static double getScaleSplineCoeff(double a, double b, double c, double d, double dist)
+{
+ if (dist <= 1.0)
+ return ((d * dist + c) * dist + b) * dist + a;
+ else
+ return getScaleSplineCoeff(0.0,
+ b + 2.0 * c + 3.0 * d,
+ c + 3.0 * d,
+ -b - 3.0 * c - 6.0 * d,
+ dist - 1.0);
+}
+
+static int initScaleFilter(int16_t **outFilter, int32_t **filterPos,
+ int *outFilterSize, int xInc, int srcW, int dstW,
+ int filterAlign, int one, int flags, int cpu_flags,
+ ScaleVector *srcFilter, ScaleVector *dstFilter,
+ double param[2])
+{
+ int i;
+ int filterSize;
+ int filter2Size;
+ int minFilterSize;
+ int64_t *filter = NULL;
+ int64_t *filter2 = NULL;
+ const int64_t fone = 1LL << 54;
+ int ret = -1;
+
+ *filterPos = (int32_t *)av_malloc((dstW + 3) * sizeof(**filterPos));
+ if (*filterPos == NULL && ((dstW + 3) * sizeof(**filterPos)) != 0)
+ {
+ hb_log("Cannot allocate memory.");
+ goto fail;
+ }
+
+ if (FFABS(xInc - 0x10000) < 10)
+ { // unscaled
+ int i;
+ filterSize = 1;
+ filter = (int64_t *)av_mallocz(dstW * sizeof(*filter) * filterSize);
+ if (filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)
+ {
+ hb_log("Cannot allocate memory.");
+ goto fail;
+ }
+
+ for (i = 0; i < dstW; i++)
+ {
+ filter[i * filterSize] = fone;
+ (*filterPos)[i] = i;
+ }
+ }
+ else if (flags & SWS_POINT)
+ { // lame looking point sampling mode
+ int i;
+ int64_t xDstInSrc;
+ filterSize = 1;
+ filter = (int64_t *)av_malloc(dstW * sizeof(*filter) * filterSize);
+ if(filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)
+ {
+ hb_log("Cannot allocate memory.");
+ goto fail;
+ }
+
+ xDstInSrc = xInc / 2 - 0x8000;
+ for (i = 0; i < dstW; i++)
+ {
+ int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;
+
+ (*filterPos)[i] = xx;
+ filter[i] = fone;
+ xDstInSrc += xInc;
+ }
+ }
+ else if ((xInc <= (1 << 16) && (flags & SWS_AREA)) ||
+ (flags & SWS_FAST_BILINEAR))
+ { // bilinear upscale
+ int i;
+ int64_t xDstInSrc;
+ filterSize = 2;
+ filter = (int64_t *)av_malloc(dstW * sizeof(*filter) * filterSize);
+ if(filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)
+ {
+ hb_log("Cannot allocate memory.");
+ goto fail;
+ }
+
+ xDstInSrc = xInc / 2 - 0x8000;
+ for (i = 0; i < dstW; i++)
+ {
+ int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;
+ int j;
+
+ (*filterPos)[i] = xx;
+ // bilinear upscale / linear interpolate / area averaging
+ for (j = 0; j < filterSize; j++)
+ {
+ int64_t coeff= fone - FFABS(((int64_t)xx<<16) - xDstInSrc)*(fone>>16);
+ if (coeff < 0)
+ coeff = 0;
+ filter[i * filterSize + j] = coeff;
+ xx++;
+ }
+ xDstInSrc += xInc;
+ }
+ }
+ else
+ {
+ int64_t xDstInSrc;
+ int sizeFactor;
+
+ if (flags & SWS_BICUBIC)
+ sizeFactor = 4;
+ else if (flags & SWS_X)
+ sizeFactor = 8;
+ else if (flags & SWS_AREA)
+ sizeFactor = 1; // downscale only, for upscale it is bilinear
+ else if (flags & SWS_GAUSS)
+ sizeFactor = 8; // infinite ;)
+ else if (flags & SWS_LANCZOS)
+ sizeFactor = param[0] != SWS_PARAM_DEFAULT ? ceil(2 * param[0]) : 6;
+ else if (flags & SWS_SINC)
+ sizeFactor = 20; // infinite ;)
+ else if (flags & SWS_SPLINE)
+ sizeFactor = 20; // infinite ;)
+ else if (flags & SWS_BILINEAR)
+ sizeFactor = 2;
+ else
+ {
+ sizeFactor = 0; // GCC warning killer
+ assert(0);
+ }
+
+ if (xInc <= 1 << 16)
+ filterSize = 1 + sizeFactor; // upscale
+ else
+ filterSize = 1 + (sizeFactor * srcW + dstW - 1) / dstW;
+
+
+ filterSize = FFMIN(filterSize, srcW - 2);
+ filterSize = FFMAX(filterSize, 1);
+
+ filter = (int64_t *)av_malloc(dstW * sizeof(*filter) * filterSize);
+ if(filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)
+ {
+ hb_log("Cannot allocate memory.");
+ goto fail;
+ }
+
+ xDstInSrc = xInc - 0x10000;
+ for (i = 0; i < dstW; i++)
+ {
+ int xx = (xDstInSrc - ((filterSize - 2) << 16)) / (1 << 17);
+ int j;
+ (*filterPos)[i] = xx;
+ for (j = 0; j < filterSize; j++)
+ {
+ int64_t d = (FFABS(((int64_t)xx << 17) - xDstInSrc)) << 13;
+ double floatd;
+ int64_t coeff;
+
+ if (xInc > 1 << 16)
+ d = d * dstW / srcW;
+ floatd = d * (1.0 / (1 << 30));
+
+ if (flags & SWS_BICUBIC)
+ {
+ int64_t B = (param[0] != SWS_PARAM_DEFAULT ? param[0] : 0) * (1 << 24);
+ int64_t C = (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1 << 24);
+
+ if (d >= 1LL << 31)
+ {
+ coeff = 0.0;
+ }
+ else
+ {
+ int64_t dd = (d * d) >> 30;
+ int64_t ddd = (dd * d) >> 30;
+
+ if (d < 1LL << 30)
+ coeff = (12 * (1 << 24) - 9 * B - 6 * C) * ddd +
+ (-18 * (1 << 24) + 12 * B + 6 * C) * dd +
+ (6 * (1 << 24) - 2 * B) * (1 << 30);
+ else
+ coeff = (-B - 6 * C) * ddd +
+ (6 * B + 30 * C) * dd +
+ (-12 * B - 48 * C) * d +
+ (8 * B + 24 * C) * (1 << 30);
+ }
+ coeff *= fone >> (30 + 24);
+ }
+#if 0
+ else if (flags & SWS_X)
+ {
+ double p = param ? param * 0.01 : 0.3;
+ coeff = d ? sin(d * M_PI) / (d * M_PI) : 1.0;
+ coeff *= pow (2.0, -p * d * d);
+ }
+#endif
+ else if (flags & SWS_X)
+ {
+ double A = param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;
+ double c;
+
+ if (floatd < 1.0)
+ c = cos(floatd * M_PI);
+ else
+ c = -1.0;
+ if (c < 0.0)
+ c = -pow(-c, A);
+ else
+ c = pow(c, A);
+ coeff = (c * 0.5 + 0.5) * fone;
+ }
+ else if (flags & SWS_AREA)
+ {
+ int64_t d2 = d - (1 << 29);
+ if (d2 * xInc < -(1LL << (29 + 16)))
+ coeff = 1.0 * (1LL << (30 + 16));
+ else if (d2 * xInc < (1LL << (29 + 16)))
+ coeff = -d2 * xInc + (1LL << (29 + 16));
+ else
+ coeff = 0.0;
+ coeff *= fone >> (30 + 16);
+ }
+ else if (flags & SWS_GAUSS)
+ {
+ double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
+ coeff = (pow(2.0, -p * floatd * floatd)) * fone;
+ }
+ else if (flags & SWS_SINC)
+ {
+ coeff = (d ? sin(floatd * M_PI) / (floatd * M_PI) : 1.0) * fone;
+ }
+ else if (flags & SWS_LANCZOS)
+ {
+ double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
+ coeff = (d ? sin(floatd * M_PI) * sin(floatd * M_PI / p) /
+ (floatd * floatd * M_PI * M_PI / p) : 1.0) * fone;
+ if (floatd > p)
+ coeff = 0;
+ }
+ else if (flags & SWS_BILINEAR)
+ {
+ coeff = (1 << 30) - d;
+ if (coeff < 0)
+ coeff = 0;
+ coeff *= fone >> 30;
+ }
+ else if (flags & SWS_SPLINE)
+ {
+ double p = -2.196152422706632;
+ coeff = getScaleSplineCoeff(1.0, 0.0, p, -p - 1.0, floatd) * fone;
+ }
+ else
+ {
+ coeff = 0.0; // GCC warning killer
+ assert(0);
+ }
+
+ filter[i * filterSize + j] = coeff;
+ xx++;
+ }
+ xDstInSrc += 2 * xInc;
+ }
+ }
+
+ assert(filterSize > 0);
+ filter2Size = filterSize;
+ if (srcFilter)
+ filter2Size += srcFilter->length - 1;
+ if (dstFilter)
+ filter2Size += dstFilter->length - 1;
+ assert(filter2Size > 0);
+ filter2 = (int64_t *)av_mallocz(filter2Size * dstW * sizeof(*filter2));
+ if (filter2 == NULL && (filter2Size * dstW * sizeof(*filter2)) != 0)
+ {
+ hb_log("Can't alloc memory.");
+ goto fail;
+ }
+
+ for (i = 0; i < dstW; i++)
+ {
+ int j, k;
+
+ if (srcFilter)
+ {
+ for (k = 0; k < srcFilter->length; k++)
+ {
+ for (j = 0; j < filterSize; j++)
+ filter2[i * filter2Size + k + j] +=
+ srcFilter->coeff[k] * filter[i * filterSize + j];
+ }
+ }
+ else
+ {
+ for (j = 0; j < filterSize; j++)
+ filter2[i * filter2Size + j] = filter[i * filterSize + j];
+ }
+ // FIXME dstFilter
+
+ (*filterPos)[i] += (filterSize - 1) / 2 - (filter2Size - 1) / 2;
+ }
+ av_freep(&filter);
+
+ // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
+ minFilterSize = 0;
+ for (i = dstW - 1; i >= 0; i--)
+ {
+ int min = filter2Size;
+ int j;
+ int64_t cutOff = 0.0;
+
+ for (j = 0; j < filter2Size; j++)
+ {
+ int k;
+ cutOff += FFABS(filter2[i * filter2Size]);
+
+ if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)
+ break;
+
+ if (i < dstW - 1 && (*filterPos)[i] >= (*filterPos)[i + 1])
+ break;
+
+ // move filter coefficients left
+ for (k = 1; k < filter2Size; k++)
+ filter2[i * filter2Size + k - 1] = filter2[i * filter2Size + k];
+ filter2[i * filter2Size + k - 1] = 0;
+ (*filterPos)[i]++;
+ }
+
+ cutOff = 0;
+ for (j = filter2Size - 1; j > 0; j--)
+ {
+ cutOff += FFABS(filter2[i * filter2Size + j]);
+
+ if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)
+ break;
+ min--;
+ }
+
+ if (min > minFilterSize)
+ minFilterSize = min;
+ }
+
+
+ assert(minFilterSize > 0);
+ filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));
+ assert(filterSize > 0);
+ filter = (int64_t *)av_malloc(filterSize * dstW * sizeof(*filter));
+ if (filterSize >= MAX_FILTER_SIZE * 16 /
+ ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter)
+ goto fail;
+ *outFilterSize = filterSize;
+
+ if (flags & SWS_PRINT_INFO)
+ hb_log("SwScaler: reducing / aligning filtersize %d -> %d",filter2Size,filterSize);
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+
+ for (j = 0; j < filterSize; j++)
+ {
+ if (j >= filter2Size)
+ filter[i * filterSize + j] = 0;
+ else
+ filter[i * filterSize + j] = filter2[i * filter2Size + j];
+ if ((flags & SWS_BITEXACT) && j >= minFilterSize)
+ filter[i * filterSize + j] = 0;
+ }
+ }
+
+ // FIXME try to align filterPos if possible
+
+ // fix borders
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ if ((*filterPos)[i] < 0)
+ {
+ // move filter coefficients left to compensate for filterPos
+ for (j = 1; j < filterSize; j++)
+ {
+ int left = FFMAX(j + (*filterPos)[i], 0);
+ filter[i * filterSize + left] += filter[i * filterSize + j];
+ filter[i * filterSize + j] = 0;
+ }
+ (*filterPos)[i]= 0;
+ }
+
+ if ((*filterPos)[i] + filterSize > srcW)
+ {
+ int shift = (*filterPos)[i] + filterSize - srcW;
+ // move filter coefficients right to compensate for filterPos
+ for (j = filterSize - 2; j >= 0; j--)
+ {
+ int right = FFMIN(j + shift, filterSize - 1);
+ filter[i * filterSize + right] += filter[i * filterSize + j];
+ filter[i * filterSize + j] = 0;
+ }
+ (*filterPos)[i]= srcW - filterSize;
+ }
+ }
+
+ // Note the +1 is for the MMX scaler which reads over the end
+ // FF_ALLOCZ_OR_GOTO(NULL, *outFilter,
+ // *outFilterSize * (dstW + 3) * sizeof(int16_t), fail);
+ *outFilter = (int16_t *)av_mallocz(*outFilterSize * (dstW + 3) * sizeof(int16_t));
+ if( *outFilter == NULL && (*outFilterSize * (dstW + 3) * sizeof(int16_t)) != 0)
+ {
+ hb_log("Can't alloc memory");
+ goto fail;
+ }
+
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ int64_t error = 0;
+ int64_t sum = 0;
+
+ for (j = 0; j < filterSize; j++)
+ {
+ sum += filter[i * filterSize + j];
+ }
+ sum = (sum + one / 2) / one;
+ for (j = 0; j < *outFilterSize; j++)
+ {
+ int64_t v = filter[i * filterSize + j] + error;
+ int intV = ROUNDED_DIV(v, sum);
+ (*outFilter)[i * (*outFilterSize) + j] = intV;
+ error = v - intV * sum;
+ }
+ }
+
+ (*filterPos)[dstW + 0] =
+ (*filterPos)[dstW + 1] =
+ (*filterPos)[dstW + 2] = (*filterPos)[dstW - 1];
+ for (i = 0; i < *outFilterSize; i++)
+ {
+ int k = (dstW - 1) * (*outFilterSize) + i;
+ (*outFilter)[k + 1 * (*outFilterSize)] =
+ (*outFilter)[k + 2 * (*outFilterSize)] =
+ (*outFilter)[k + 3 * (*outFilterSize)] = (*outFilter)[k];
+ }
+
+ ret = 0;
+
+fail:
+ av_free(filter);
+ av_free(filter2);
+ return ret;
+}
+
+static int handle_scale_jpeg(enum PixelFormat *format)
+{
+ switch (*format)
+ {
+ case AV_PIX_FMT_YUVJ420P:
+ *format = AV_PIX_FMT_YUV420P;
+ return 1;
+ case AV_PIX_FMT_YUVJ422P:
+ *format = AV_PIX_FMT_YUV422P;
+ return 1;
+ case AV_PIX_FMT_YUVJ444P:
+ *format = AV_PIX_FMT_YUV444P;
+ return 1;
+ case AV_PIX_FMT_YUVJ440P:
+ *format = AV_PIX_FMT_YUV440P;
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static void scaleGetSubSampleFactors(int *h, int *v, enum PixelFormat format)
+{
+ *h = av_pix_fmt_descriptors[format].log2_chroma_w;
+ *v = av_pix_fmt_descriptors[format].log2_chroma_h;
+}
+
+typedef struct FormatEntry {
+ int is_supported_in, is_supported_out;
+} FormatEntry;
+
+static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 0 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 },
+ { 0, 1 }, { 1, 1 }, { 1, 1 }, { 0, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 0 }, { 1, 0 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 0 }, { 1, 1 }, { 1, 1 }, { 0, 0 },
+ { 0, 0 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
+ { 1, 1 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 0 },
+ { 1, 0 }, { 1, 0 },
+};
+
+int scale_isSupportedInput( enum PixelFormat pix_fmt )
+{
+ return (unsigned)pix_fmt < AV_PIX_FMT_NB ?
+ format_entries[pix_fmt].is_supported_in : 0;
+}
+
+int scale_isSupportedOutput( enum PixelFormat pix_fmt )
+{
+ return (unsigned)pix_fmt < AV_PIX_FMT_NB ?
+ format_entries[pix_fmt].is_supported_out : 0;
+}
+
+static void hcscale_fast_c( ScaleContext *c, int16_t *dst1, int16_t *dst2,
+ int dstWidth, const uint8_t *src1,
+ const uint8_t *src2, int srcW, int xInc )
+{
+ int i;
+ unsigned int xpos = 0;
+ for (i = 0; i < dstWidth; i++)
+ {
+ register unsigned int xx = xpos >> 16;
+ register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
+ dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha);
+ dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);
+ xpos += xInc;
+ }
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+ {
+ dst1[i] = src1[srcW-1]*128;
+ dst2[i] = src2[srcW-1]*128;
+ }
+}
+
+static void hyscale_fast_c(ScaleContext *c, int16_t *dst, int dstWidth,
+ const uint8_t *src, int srcW, int xInc)
+{
+ int i;
+ unsigned int xpos = 0;
+ for (i = 0; i < dstWidth; i++)
+ {
+ register unsigned int xx = xpos >> 16;
+ register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
+ dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
+ xpos += xInc;
+ }
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+ dst[i] = src[srcW-1]*128;
+}
+
+static void hScale16To19_c(ScaleContext *c, int16_t *_dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ const uint16_t *src = (const uint16_t *) _src;
+ int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
+ int sh = bits - 4;
+
+ if((isScaleAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8)
+ && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
+ sh= 9;
+
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+
+ for (j = 0; j < filterSize; j++)
+ {
+ val += src[srcPos + j] * filter[filterSize * i + j];
+ }
+ dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
+ }
+}
+
+static void hScale16To15_c(ScaleContext *c, int16_t *dst, int dstW,
+ const uint8_t *_src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ const uint16_t *src = (const uint16_t *) _src;
+ int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
+
+ if(sh<15)
+ sh= isScaleAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8
+ ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
+
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+
+ for (j = 0; j < filterSize; j++)
+ {
+ val += src[srcPos + j] * filter[filterSize * i + j];
+ }
+ // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
+ dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
+ }
+}
+
+static void hScale8To15_c(ScaleContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+ for (j = 0; j < filterSize; j++)
+ {
+ val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
+ }
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // the cubic equation does overflow ...
+ }
+}
+
+static void hScale8To19_c(ScaleContext *c, int16_t *_dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ for (i = 0; i < dstW; i++)
+ {
+ int j;
+ int srcPos = filterPos[i];
+ int val = 0;
+ for (j = 0; j < filterSize; j++)
+ {
+ val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
+ }
+ dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ...
+ }
+}
+
+static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ {
+ dstU[i] = (FFMIN(dstU[i], 30775) * 4663 - 9289992) >> 12; // -264
+ dstV[i] = (FFMIN(dstV[i], 30775) * 4663 - 9289992) >> 12; // -264
+ }
+}
+
+static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ {
+ dstU[i] = (dstU[i] * 1799 + 4081085) >> 11; // 1469
+ dstV[i] = (dstV[i] * 1799 + 4081085) >> 11; // 1469
+ }
+}
+
+static void lumRangeToJpeg_c(int16_t *dst, int width)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ dst[i] = (FFMIN(dst[i], 30189) * 19077 - 39057361) >> 14;
+}
+
+static void lumRangeFromJpeg_c(int16_t *dst, int width)
+{
+ int i;
+ for (i = 0; i < width; i++)
+ dst[i] = (dst[i] * 14071 + 33561947) >> 14;
+}
+
+static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
+{
+ int i;
+ int32_t *dstU = (int32_t *) _dstU;
+ int32_t *dstV = (int32_t *) _dstV;
+ for (i = 0; i < width; i++)
+ {
+ dstU[i] = (FFMIN(dstU[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264
+ dstV[i] = (FFMIN(dstV[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264
+ }
+}
+
+static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
+{
+ int i;
+ int32_t *dstU = (int32_t *) _dstU;
+ int32_t *dstV = (int32_t *) _dstV;
+ for (i = 0; i < width; i++)
+ {
+ dstU[i] = (dstU[i] * 1799 + (4081085 << 4)) >> 11; // 1469
+ dstV[i] = (dstV[i] * 1799 + (4081085 << 4)) >> 11; // 1469
+ }
+}
+
+static void lumRangeToJpeg16_c(int16_t *_dst, int width)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ for (i = 0; i < width; i++)
+ dst[i] = (FFMIN(dst[i], 30189 << 4) * 4769 - (39057361 << 2)) >> 12;
+}
+
+static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
+{
+ int i;
+ int32_t *dst = (int32_t *) _dst;
+ for (i = 0; i < width; i++)
+ dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
+}
+
+static av_cold void sws_init_swScale_c(ScaleContext *c)
+{
+ enum PixelFormat srcFormat = c->srcFormat;
+
+ ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
+ &c->yuv2nv12cX, &c->yuv2packed1,
+ &c->yuv2packed2, &c->yuv2packedX);
+
+ ff_sws_init_input_funcs(c);
+
+ if (c->srcBpc == 8)
+ {
+ if (c->dstBpc <= 10)
+ {
+ c->hyScale = c->hcScale = hScale8To15_c;
+ if (c->flags & SWS_FAST_BILINEAR)
+ {
+ c->hyscale_fast = hyscale_fast_c;
+ c->hcscale_fast = hcscale_fast_c;
+ }
+ }
+ else
+ {
+ c->hyScale = c->hcScale = hScale8To19_c;
+ }
+ }
+ else
+ {
+ c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c
+ : hScale16To15_c;
+ }
+
+ if (c->srcRange != c->dstRange && !isScaleAnyRGB(c->dstFormat))
+ {
+ if (c->dstBpc <= 10)
+ {
+ if (c->srcRange)
+ {
+ c->lumConvertRange = lumRangeFromJpeg_c;
+ c->chrConvertRange = chrRangeFromJpeg_c;
+ }
+ else
+ {
+ c->lumConvertRange = lumRangeToJpeg_c;
+ c->chrConvertRange = chrRangeToJpeg_c;
+ }
+ }
+ else
+ {
+ if (c->srcRange)
+ {
+ c->lumConvertRange = lumRangeFromJpeg16_c;
+ c->chrConvertRange = chrRangeFromJpeg16_c;
+ }
+ else
+ {
+ c->lumConvertRange = lumRangeToJpeg16_c;
+ c->chrConvertRange = chrRangeToJpeg16_c;
+ }
+ }
+ }
+
+ if (!(isScaleGray(srcFormat) || isScaleGray(c->dstFormat) ||
+ srcFormat == AV_PIX_FMT_MONOBLACK || srcFormat == AV_PIX_FMT_MONOWHITE))
+ c->needs_hcscale = 1;
+}
+
+int scale_init_context(ScaleContext *c, ScaleFilter *srcFilter, ScaleFilter *dstFilter)
+{
+ ScaleFilter dummyFilter = { NULL, NULL, NULL, NULL };
+ int srcW = c->srcW;
+ int srcH = c->srcH;
+ int dstW = c->dstW;
+ int dstH = c->dstH;
+ int flags, cpu_flags;
+ enum PixelFormat srcFormat = c->srcFormat;
+ enum PixelFormat dstFormat = c->dstFormat;
+
+ cpu_flags = 0;
+ flags = c->flags;
+
+ if(srcFormat != c->srcFormat || dstFormat != c->dstFormat)
+ {
+ hb_log("deprecated pixel format used, make sure you did set range correctly.");
+ c->srcFormat = srcFormat;
+ c->dstFormat = dstFormat;
+ }
+
+ if (srcW < 4 || srcH < 1 || dstW < 8 || dstH < 1)
+ {
+ hb_log("%dx%d -> %dx%d is invalid scaling dimension.",srcW,srcH,dstW,dstH);
+ return -1;
+ }
+
+ if (!dstFilter)
+ dstFilter = &dummyFilter;
+ if (!srcFilter)
+ srcFilter = &dummyFilter;
+
+ c->lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW;
+ c->lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH;
+ c->dstFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[dstFormat]);
+ c->srcFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[srcFormat]);
+ c->vRounder = 4 * 0x0001000100010001ULL;
+
+ scaleGetSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
+ scaleGetSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
+
+ // drop some chroma lines if the user wants it
+ c->vChrDrop = (flags & SWS_SRC_V_CHR_DROP_MASK) >> SWS_SRC_V_CHR_DROP_SHIFT;
+ c->chrSrcVSubSample += c->vChrDrop;
+ c->chrSrcW = -((-srcW) >> c->chrSrcHSubSample);
+ c->chrSrcH = -((-srcH) >> c->chrSrcVSubSample);
+ c->chrDstW = -((-dstW) >> c->chrDstHSubSample);
+ c->chrDstH = -((-dstH) >> c->chrDstVSubSample);
+ c->chrXInc = (((int64_t)c->chrSrcW << 16) + (c->chrDstW >> 1)) / c->chrDstW;
+ c->chrYInc = (((int64_t)c->chrSrcH << 16) + (c->chrDstH >> 1)) / c->chrDstH;
+
+ const int filterAlign = 1;
+
+ if (initScaleFilter(&c->hLumFilter, &c->hLumFilterPos,
+ &c->hLumFilterSize, c->lumXInc,
+ srcW, dstW, filterAlign, 1 << 14,
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
+ cpu_flags, srcFilter->lumH, dstFilter->lumH,
+ c->param) < 0)
+ goto fail;
+
+ if (initScaleFilter(&c->hChrFilter, &c->hChrFilterPos,
+ &c->hChrFilterSize, c->chrXInc,
+ c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
+ cpu_flags, srcFilter->chrH, dstFilter->chrH,
+ c->param) < 0)
+ goto fail;
+
+ if (initScaleFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize,
+ c->lumYInc, srcH, dstH, filterAlign, (1 << 12),
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
+ cpu_flags, srcFilter->lumV, dstFilter->lumV,
+ c->param) < 0)
+ goto fail;
+
+ if (initScaleFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize,
+ c->chrYInc, c->chrSrcH, c->chrDstH,
+ filterAlign, (1 << 12),
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
+ cpu_flags, srcFilter->chrV, dstFilter->chrV,
+ c->param) < 0)
+ goto fail;
+ return 0;
+fail:
+ return -1;
+}
+
+ScaleContext *scale_getContext(int srcW, int srcH, enum PixelFormat srcFormat,
+ int dstW, int dstH, enum PixelFormat dstFormat,
+ int flags, ScaleFilter *srcFilter,
+ ScaleFilter *dstFilter, const double *param)
+{
+ ScaleContext *sc = (ScaleContext*)malloc(sizeof(ScaleContext));
+ sc->flags = flags;
+ sc->srcW = srcW;
+ sc->srcH = srcH;
+ sc->dstW = dstW;
+ sc->dstH = dstH;
+ sc->srcRange = handle_scale_jpeg(&srcFormat);
+ sc->dstRange = handle_scale_jpeg(&dstFormat);
+ sc->srcFormat = srcFormat;
+ sc->dstFormat = dstFormat;
+ sc->hyscale_fast = 0;
+ sc->hcscale_fast = 0;
+
+ if (param)
+ {
+ sc->param[0] = param[0];
+ sc->param[1] = param[1];
+ }
+
+ if (scale_init_context(sc, srcFilter, dstFilter) < 0)
+ {
+ sws_freeContext(sc);
+ return NULL;
+ }
+ return sc;
+}
+
+int scale_opencl(ScaleContext *c, void *cl_inbuf, void *cl_outbuf, int *srcStride, int *dstStride)
+{
+ int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat);
+ av_scale_frame(c,cl_outbuf,cl_inbuf,srcStride,dstStride,&should_dither);
+ return 1;
+}
+
+void scale_init( int width, int height, int dstwidth, int dstheight )
+{
+ int srcW = width;
+ int srcH = height;
+ int dstW = dstwidth;
+ int dstH = dstheight;
+ enum PixelFormat inputfmt = AV_PIX_FMT_YUV420P;
+ enum PixelFormat outputfmt = AV_PIX_FMT_YUV420P;
+ int flags = SWS_BILINEAR;
+ g_scale = scale_getContext(srcW,srcH,inputfmt,dstW,dstH,outputfmt,flags,NULL,NULL,NULL);
+}
+
+void scale_release()
+{
+ sws_freeContext( g_scale );
+}
+#ifdef USE_OPENCL
+int scale_run( cl_mem inbuf, cl_mem outbuf, int linesizey, int linesizeuv, int height )
+{
+ g_scale->cl_src = inbuf;
+ g_scale->cl_dst = outbuf;
+
+ int src_stride[4] = { linesizey, linesizeuv, linesizeuv, 0 };
+ int dst_stride[4] = { g_scale->dstW, g_scale->chrDstW, g_scale->chrDstW, 0 };
+ int ret = -1;
+
+ ret = scale_opencl( g_scale, inbuf, outbuf, src_stride, dst_stride );
+
+ return ret;
+}
+#endif
+#endif
diff --git a/libhb/scale.h b/libhb/scale.h new file mode 100644 index 000000000..46c177efc --- /dev/null +++ b/libhb/scale.h @@ -0,0 +1,324 @@ +/* scale.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+
+ */
+
+#ifndef SCALE_H
+#define SCALE_H
+#ifdef USE_OPENCL
+#include <stdint.h>
+#include "vadxva2.h"
+#include "libavutil/pixfmt.h"
+#include "hbffmpeg.h"
+
+#define YUVRGB_TABLE_HEADROOM 128
+#define MAX_FILTER_SIZE 256
+#define is16BPS(x) \
+ (av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 15)
+
+#define is9_OR_10BPS(x) \
+ (av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 8 || \
+ av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 9)
+
+#if ARCH_X86_64
+# define APCK_PTR2 8
+# define APCK_COEF 16
+# define APCK_SIZE 24
+#else
+# define APCK_PTR2 4
+# define APCK_COEF 8
+# define APCK_SIZE 16
+#endif
+
+typedef void (*yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+typedef void (*yuv2planarX_fn)(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+typedef void (*yuv2interleavedX_fn)(struct ScaleContext *c,
+ const int16_t *chrFilter,
+ int chrFilterSize,
+ const int16_t **chrUSrc,
+ const int16_t **chrVSrc,
+ uint8_t *dest, int dstW);
+
+typedef void (*yuv2packed1_fn)(struct ScaleContext *c, const int16_t *lumSrc,
+ const int16_t *chrUSrc[2],
+ const int16_t *chrVSrc[2],
+ const int16_t *alpSrc, uint8_t *dest,
+ int dstW, int uvalpha, int y);
+
+typedef void (*yuv2packed2_fn)(struct SCaleContext *c, const int16_t *lumSrc[2],
+ const int16_t *chrUSrc[2],
+ const int16_t *chrVSrc[2],
+ const int16_t *alpSrc[2],
+ uint8_t *dest,
+ int dstW, int yalpha, int uvalpha, int y);
+
+typedef void (*yuv2packedX_fn)(struct SCaleContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter,
+ const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest,
+ int dstW, int y);
+
+typedef int (*SwsFunc)(struct ScaleContext *context, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[]);
+
+typedef struct {
+ double *coeff; ///< pointer to the list of coefficients
+ int length; ///< number of coefficients in the vector
+} ScaleVector;
+
+typedef struct {
+ ScaleVector *lumH;
+ ScaleVector *lumV;
+ ScaleVector *chrH;
+ ScaleVector *chrV;
+} ScaleFilter;
+
+typedef struct ScaleContext {
+ SwsFunc swScale;
+ int srcW; ///< Width of source luma/alpha planes.
+ int srcH; ///< Height of source luma/alpha planes.
+ int dstH; ///< Height of destination luma/alpha planes.
+ int chrSrcW; ///< Width of source chroma planes.
+ int chrSrcH; ///< Height of source chroma planes.
+ int chrDstW; ///< Width of destination chroma planes.
+ int chrDstH; ///< Height of destination chroma planes.
+ int lumXInc, chrXInc;
+ int lumYInc, chrYInc;
+ enum PixelFormat dstFormat; ///< Destination pixel format.
+ enum PixelFormat srcFormat; ///< Source pixel format.
+ int dstFormatBpp; ///< Number of bits per pixel of the destination pixel format.
+ int srcFormatBpp; ///< Number of bits per pixel of the source pixel format.
+ int dstBpc, srcBpc;
+ int chrSrcHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in source image.
+ int chrSrcVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in source image.
+ int chrDstHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in destination image.
+ int chrDstVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in destination image.
+ int vChrDrop; ///< Binary logarithm of extra vertical subsampling factor in source image chroma planes specified by user.
+ int sliceDir; ///< Direction that slices are fed to the scaler (1 = top-to-bottom, -1 = bottom-to-top).
+ double param[2]; ///< Input parameters for scaling algorithms that need them.
+
+ uint32_t pal_yuv[256];
+ uint32_t pal_rgb[256];
+
+ int16_t **lumPixBuf; ///< Ring buffer for scaled horizontal luma plane lines to be fed to the vertical scaler.
+ int16_t **chrUPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
+ int16_t **chrVPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
+ int16_t **alpPixBuf; ///< Ring buffer for scaled horizontal alpha plane lines to be fed to the vertical scaler.
+ int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer.
+ int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer.
+ int lastInLumBuf; ///< Last scaled horizontal luma/alpha line from source in the ring buffer.
+ int lastInChrBuf; ///< Last scaled horizontal chroma line from source in the ring buffer.
+ int lumBufIndex; ///< Index in ring buffer of the last scaled horizontal luma/alpha line from source.
+ int chrBufIndex; ///< Index in ring buffer of the last scaled horizontal chroma line from source.
+
+ uint8_t *formatConvBuffer;
+ int16_t *hLumFilter; ///< Array of horizontal filter coefficients for luma/alpha planes.
+ int16_t *hChrFilter; ///< Array of horizontal filter coefficients for chroma planes.
+ int16_t *vLumFilter; ///< Array of vertical filter coefficients for luma/alpha planes.
+ int16_t *vChrFilter; ///< Array of vertical filter coefficients for chroma planes.
+ int32_t *hLumFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for luma/alpha planes.
+ int32_t *hChrFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for chroma planes.
+ int32_t *vLumFilterPos; ///< Array of vertical filter starting positions for each dst[i] for luma/alpha planes.
+ int32_t *vChrFilterPos; ///< Array of vertical filter starting positions for each dst[i] for chroma planes.
+ int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels.
+ int hChrFilterSize; ///< Horizontal filter size for chroma pixels.
+ int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels.
+ int vChrFilterSize; ///< Vertical filter size for chroma pixels.
+
+ int lumMmx2FilterCodeSize; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code size for luma/alpha planes.
+ int chrMmx2FilterCodeSize; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code size for chroma planes.
+ uint8_t *lumMmx2FilterCode; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code for luma/alpha planes.
+ uint8_t *chrMmx2FilterCode; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code for chroma planes.
+
+ int canMMX2BeUsed;
+
+ unsigned char *dest;
+ unsigned char *source;
+
+ int dstY; ///< Last destination vertical line output from last slice.
+ int flags; ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
+ void *yuvTable; ///<s pointer to the yuv->rgb table start so it can be freed()
+ uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
+ uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
+ int table_gV[256 + 2*YUVRGB_TABLE_HEADROOM];
+ uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
+
+ //Colorspace stuff
+ int contrast, brightness, saturation; // for sws_getColorspaceDetails
+ int srcColorspaceTable[4];
+ int dstColorspaceTable[4];
+ int srcRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (source image).
+ int dstRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image).
+ int src0Alpha;
+ int dst0Alpha;
+ int yuv2rgb_y_offset;
+ int yuv2rgb_y_coeff;
+ int yuv2rgb_v2r_coeff;
+ int yuv2rgb_v2g_coeff;
+ int yuv2rgb_u2g_coeff;
+ int yuv2rgb_u2b_coeff;
+
+#define RED_DITHER "0*8"
+#define GREEN_DITHER "1*8"
+#define BLUE_DITHER "2*8"
+#define Y_COEFF "3*8"
+#define VR_COEFF "4*8"
+#define UB_COEFF "5*8"
+#define VG_COEFF "6*8"
+#define UG_COEFF "7*8"
+#define Y_OFFSET "8*8"
+#define U_OFFSET "9*8"
+#define V_OFFSET "10*8"
+#define LUM_MMX_FILTER_OFFSET "11*8"
+#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
+#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
+#define ESP_OFFSET "11*8+4*4*256*2+8"
+#define VROUNDER_OFFSET "11*8+4*4*256*2+16"
+#define U_TEMP "11*8+4*4*256*2+24"
+#define V_TEMP "11*8+4*4*256*2+32"
+#define Y_TEMP "11*8+4*4*256*2+40"
+#define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48"
+#define UV_OFF_PX "11*8+4*4*256*3+48"
+#define UV_OFF_BYTE "11*8+4*4*256*3+56"
+#define DITHER16 "11*8+4*4*256*3+64"
+#define DITHER32 "11*8+4*4*256*3+80"
+
+ DECLARE_ALIGNED(8, uint64_t, redDither);
+ DECLARE_ALIGNED(8, uint64_t, greenDither);
+ DECLARE_ALIGNED(8, uint64_t, blueDither);
+
+ DECLARE_ALIGNED(8, uint64_t, yCoeff);
+ DECLARE_ALIGNED(8, uint64_t, vrCoeff);
+ DECLARE_ALIGNED(8, uint64_t, ubCoeff);
+ DECLARE_ALIGNED(8, uint64_t, vgCoeff);
+ DECLARE_ALIGNED(8, uint64_t, ugCoeff);
+ DECLARE_ALIGNED(8, uint64_t, yOffset);
+ DECLARE_ALIGNED(8, uint64_t, uOffset);
+ DECLARE_ALIGNED(8, uint64_t, vOffset);
+ int32_t lumMmxFilter[4 * MAX_FILTER_SIZE];
+ int32_t chrMmxFilter[4 * MAX_FILTER_SIZE];
+ int dstW; ///< Width of destination luma/alpha planes.
+ DECLARE_ALIGNED(8, uint64_t, esp);
+ DECLARE_ALIGNED(8, uint64_t, vRounder);
+ DECLARE_ALIGNED(8, uint64_t, u_temp);
+ DECLARE_ALIGNED(8, uint64_t, v_temp);
+ DECLARE_ALIGNED(8, uint64_t, y_temp);
+ int32_t alpMmxFilter[4 * MAX_FILTER_SIZE];
+
+ DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
+ DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes
+ DECLARE_ALIGNED(8, uint16_t, dither16)[8];
+ DECLARE_ALIGNED(8, uint32_t, dither32)[8];
+
+ const uint8_t *chrDither8, *lumDither8;
+
+#if HAVE_ALTIVEC
+ vector signed short CY;
+ vector signed short CRV;
+ vector signed short CBU;
+ vector signed short CGU;
+ vector signed short CGV;
+ vector signed short OY;
+ vector unsigned short CSHIFT;
+ vector signed short *vYCoeffsBank, *vCCoeffsBank;
+#endif
+
+#if ARCH_BFIN
+ DECLARE_ALIGNED(4, uint32_t, oy);
+ DECLARE_ALIGNED(4, uint32_t, oc);
+ DECLARE_ALIGNED(4, uint32_t, zero);
+ DECLARE_ALIGNED(4, uint32_t, cy);
+ DECLARE_ALIGNED(4, uint32_t, crv);
+ DECLARE_ALIGNED(4, uint32_t, rmask);
+ DECLARE_ALIGNED(4, uint32_t, cbu);
+ DECLARE_ALIGNED(4, uint32_t, bmask);
+ DECLARE_ALIGNED(4, uint32_t, cgu);
+ DECLARE_ALIGNED(4, uint32_t, cgv);
+ DECLARE_ALIGNED(4, uint32_t, gmask);
+#endif
+
+#if HAVE_VIS
+ DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10];
+#endif
+ int use_mmx_vfilter;
+
+ /* function pointers for swScale() */
+ yuv2planar1_fn yuv2plane1;
+ yuv2planarX_fn yuv2planeX;
+ yuv2interleavedX_fn yuv2nv12cX;
+ yuv2packed1_fn yuv2packed1;
+ yuv2packed2_fn yuv2packed2;
+ yuv2packedX_fn yuv2packedX;
+
+ /// Unscaled conversion of luma plane to YV12 for horizontal scaler.
+ void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
+ int width, uint32_t *pal);
+ /// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
+ void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
+ int width, uint32_t *pal);
+ /// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
+ void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
+ int width, uint32_t *pal);
+
+ void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width);
+ void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4],
+ int width);
+
+ void (*hyscale_fast)(struct SwsContext *c,
+ int16_t *dst, int dstWidth,
+ const uint8_t *src, int srcW, int xInc);
+ void (*hcscale_fast)(struct SwsContext *c,
+ int16_t *dst1, int16_t *dst2, int dstWidth,
+ const uint8_t *src1, const uint8_t *src2,
+ int srcW, int xInc);
+
+ void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+ void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+ void (*lumConvertRange)(int16_t *dst, int width);
+ void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width);
+
+ int needs_hcscale; ///< Set if there are chroma planes to be converted.
+
+ cl_mem cl_hLumFilter;
+ cl_mem cl_hLumFilterPos;
+ cl_mem cl_hChrFilter;
+ cl_mem cl_hChrFilterPos;
+ cl_mem cl_vLumFilter;
+ cl_mem cl_vLumFilterPos;
+ cl_mem cl_vChrFilter;
+ cl_mem cl_vChrFilterPos;
+
+ cl_mem cl_intermediaBuf;
+
+ cl_mem cl_src;
+ cl_mem cl_dst;
+} ScaleContext;
+
+void scale_init(int, int, int, int);
+void scale_release();
+int scale_run(cl_mem inbuf, cl_mem outbuf, int linesizey, int linesizeuv, int height);
+#endif
+#endif
diff --git a/libhb/scale_kernel.c b/libhb/scale_kernel.c new file mode 100644 index 000000000..ddcfed914 --- /dev/null +++ b/libhb/scale_kernel.c @@ -0,0 +1,223 @@ +/* scale_kernel.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/>
+ Li Cao <[email protected]> <http://www.multicorewareinc.com/>
+
+
+ */
+
+#ifdef USE_OPENCL
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include "scale.h"
+#include "openclwrapper.h"
+
+#define OCLCHECK( method, ...) \
+ status = method(__VA_ARGS__); if(status != CL_SUCCESS) { \
+ hb_log(" error %s %d",# method, status); assert(0); return status; }
+
+#define CREATEBUF( out, flags, size, ptr)\
+ out = clCreateBuffer( kenv->context, (flags), (size), ptr, &status );\
+ if( status != CL_SUCCESS ) { hb_log( "clCreateBuffer faild %d", status ); return -1; }
+
+ #define CL_PARAM_NUM 20
+
+/****************************************************************************************************************************/
+/*************************Combine the hscale and yuv2plane into scaling******************************************************/
+/****************************************************************************************************************************/
+static int CreateCLBuffer( ScaleContext *c, KernelEnv *kenv )
+{
+ cl_int status;
+
+ if(!c->hyscale_fast || !c->hcscale_fast)
+ {
+ CREATEBUF(c->cl_hLumFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstW*c->hLumFilterSize*sizeof(cl_short), c->hLumFilter);
+ CREATEBUF(c->cl_hLumFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstW*sizeof(cl_int), c->hLumFilterPos);
+ CREATEBUF(c->cl_hChrFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstW*c->hChrFilterSize*sizeof(cl_short), c->hChrFilter);
+ CREATEBUF(c->cl_hChrFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstW*sizeof(cl_int), c->hChrFilterPos);
+ }
+ if( c->vLumFilterSize > 1 && c->vChrFilterSize > 1 )
+ {
+ CREATEBUF(c->cl_vLumFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstH*c->vLumFilterSize*sizeof(cl_short), c->vLumFilter);
+ CREATEBUF(c->cl_vChrFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstH*c->vChrFilterSize*sizeof(cl_short), c->vChrFilter);
+ }
+ CREATEBUF(c->cl_vLumFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstH*sizeof(cl_int), c->vLumFilterPos);
+ CREATEBUF(c->cl_vChrFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstH*sizeof(cl_int), c->vChrFilterPos);
+
+ return 1;
+}
+
+int av_scale_frame_func( void **userdata, KernelEnv *kenv )
+{
+ ScaleContext *c = (ScaleContext *)userdata[0];
+
+ c->cl_src = (cl_mem)userdata[2];
+ c->cl_dst = (cl_mem)userdata[1];
+
+ /*frame size*/
+ int *tmp = (int *)userdata[3];
+ int srcStride = tmp[0];
+ int srcChrStride = tmp[1];
+ int srcW = c->srcW;
+ int srcH = c->srcH;
+
+ tmp = (int *)userdata[4];
+ int dstStride = tmp[0];
+ int dstChrStride = tmp[1];
+ int dstW = c->dstW;
+ int dstH = c->dstH;
+
+ /* local variable */
+ cl_int status;
+ size_t global_work_size[2];
+
+ int intermediaSize;
+
+ int st = CreateCLBuffer(c,kenv);
+ if( !st )
+ {
+ hb_log( "CreateBuffer[%s] faild %d", "scale_opencl",st );
+ return -1;
+ }
+
+ intermediaSize = dstStride * srcH + dstChrStride * srcH;
+
+ CREATEBUF(c->cl_intermediaBuf, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, intermediaSize*sizeof(cl_short), NULL);
+
+ static int init_chr_status = 0;
+ static cl_kernel chr_kernel;
+
+ if(init_chr_status == 0){
+
+ if(!(c->flags & 1))
+ {
+ chr_kernel = clCreateKernel( kenv->program, "hscale_all_opencl", NULL );
+ //Set the Kernel Argument;
+ OCLCHECK(clSetKernelArg,chr_kernel, 2, sizeof(cl_mem), (void*)&c->cl_hLumFilter);
+ OCLCHECK(clSetKernelArg,chr_kernel, 3, sizeof(cl_mem), (void*)&c->cl_hLumFilterPos);
+ OCLCHECK(clSetKernelArg,chr_kernel, 4, sizeof(int), (void*)&c->hLumFilterSize);
+ OCLCHECK(clSetKernelArg,chr_kernel, 5, sizeof(cl_mem), (void*)&c->cl_hChrFilter);
+ OCLCHECK(clSetKernelArg,chr_kernel, 6, sizeof(cl_mem), (void*)&c->cl_hChrFilterPos);
+ OCLCHECK(clSetKernelArg,chr_kernel, 7, sizeof(int), (void*)&c->hChrFilterSize);
+ }
+
+ /*Set the arguments*/
+ OCLCHECK(clSetKernelArg, chr_kernel, 8, sizeof(dstW), (void*)&dstW);
+ OCLCHECK(clSetKernelArg, chr_kernel, 9, sizeof(srcH), (void*)&srcH);
+ OCLCHECK(clSetKernelArg, chr_kernel, 10, sizeof(srcW), (void*)&srcW);
+ OCLCHECK(clSetKernelArg, chr_kernel, 11, sizeof(srcH), (void*)&srcH);
+ OCLCHECK(clSetKernelArg, chr_kernel, 12, sizeof(dstStride), (void*)&dstStride);
+ OCLCHECK(clSetKernelArg, chr_kernel, 13, sizeof(dstChrStride), (void*)&dstChrStride);
+ OCLCHECK(clSetKernelArg, chr_kernel, 14, sizeof(srcStride), (void*)&srcStride);
+ OCLCHECK(clSetKernelArg, chr_kernel, 15, sizeof(srcChrStride), (void*)&srcChrStride);
+ init_chr_status = 1;
+ }
+
+ kenv->kernel = chr_kernel;
+ OCLCHECK(clSetKernelArg, chr_kernel, 0, sizeof(cl_mem), (void*)&c->cl_intermediaBuf);
+ OCLCHECK(clSetKernelArg, chr_kernel, 1, sizeof(cl_mem), (void*)&c->cl_src);
+ /*Run the Kernel*/
+ global_work_size[0] = c->chrDstW;//dstW >> 1; //must times 256;
+ global_work_size[1] = c->chrSrcH;
+
+ OCLCHECK(clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
+
+ static int init_lum_status = 0;
+ static cl_kernel lum_kernel;
+
+ if( init_lum_status == 0 ){
+ //Vertical:
+ /*Create Kernel*/
+ if( c->vLumFilterSize > 1 && c->vChrFilterSize > 1 )
+ lum_kernel = clCreateKernel( kenv->program, "vscale_all_nodither_opencl", NULL );
+ else
+ lum_kernel = clCreateKernel( kenv->program, "vscale_fast_opencl", NULL );
+
+ if( c->vLumFilterSize > 1 && c->vChrFilterSize > 1 )
+ {
+ OCLCHECK(clSetKernelArg, lum_kernel, 2, sizeof(cl_mem), (void*)&c->cl_vLumFilter);
+ OCLCHECK(clSetKernelArg, lum_kernel, 3, sizeof(int), (void*)&c->vLumFilterSize);
+ OCLCHECK(clSetKernelArg, lum_kernel, 4, sizeof(cl_mem), (void*)&c->cl_vChrFilter);
+ OCLCHECK(clSetKernelArg, lum_kernel, 5, sizeof(int), (void*)&c->vChrFilterSize);
+ OCLCHECK(clSetKernelArg, lum_kernel, 6, sizeof(cl_mem), (void*)&c->cl_vLumFilterPos);
+ OCLCHECK(clSetKernelArg, lum_kernel, 7, sizeof(cl_mem), (void*)&c->cl_vChrFilterPos);
+ OCLCHECK(clSetKernelArg, lum_kernel, 8, sizeof(dstW), (void*)&dstW);
+ OCLCHECK(clSetKernelArg, lum_kernel, 9, sizeof(dstH), (void*)&dstH);
+ OCLCHECK(clSetKernelArg, lum_kernel, 10, sizeof(srcW), (void*)&srcW);
+ OCLCHECK(clSetKernelArg, lum_kernel, 11, sizeof(srcH), (void*)&srcH);
+ OCLCHECK(clSetKernelArg, lum_kernel, 12, sizeof(dstStride), (void*)&dstStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 13, sizeof(dstChrStride), (void*)&dstChrStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 14, sizeof(dstStride), (void*)&dstStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 15, sizeof(dstChrStride), (void*)&dstChrStride);
+ }
+ else
+ {
+ OCLCHECK(clSetKernelArg, lum_kernel, 2, sizeof(cl_mem), (void*)&c->cl_vLumFilterPos);
+ OCLCHECK(clSetKernelArg, lum_kernel, 3, sizeof(cl_mem), (void*)&c->cl_vChrFilterPos);
+ OCLCHECK(clSetKernelArg, lum_kernel, 4, sizeof(dstW), (void*)&dstW);
+ OCLCHECK(clSetKernelArg, lum_kernel, 5, sizeof(dstH), (void*)&dstH);
+ OCLCHECK(clSetKernelArg, lum_kernel, 6, sizeof(srcW), (void*)&srcW);
+ OCLCHECK(clSetKernelArg, lum_kernel, 7, sizeof(srcH), (void*)&srcH);
+ OCLCHECK(clSetKernelArg, lum_kernel, 8, sizeof(dstStride), (void*)&dstStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 9, sizeof(dstChrStride), (void*)&dstChrStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 10, sizeof(dstStride), (void*)&dstStride);
+ OCLCHECK(clSetKernelArg, lum_kernel, 11, sizeof(dstChrStride), (void*)&dstChrStride);
+ }
+ init_lum_status = 1;
+ }
+
+ kenv->kernel = lum_kernel;
+ OCLCHECK(clSetKernelArg, kenv->kernel, 0, sizeof(cl_mem), (void*)&c->cl_dst);
+ OCLCHECK(clSetKernelArg, kenv->kernel, 1, sizeof(cl_mem), (void*)&c->cl_intermediaBuf);
+
+ /*Run the Kernel*/
+ global_work_size[0] = c->chrDstW;
+ global_work_size[1] = c->chrDstH;
+
+ OCLCHECK(clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL,global_work_size, NULL, 0, NULL, NULL);
+
+ clReleaseMemObject( c->cl_intermediaBuf );
+
+ return 1;
+}
+
+void av_scale_frame(ScaleContext *c, void *dst, void *src, int *srcStride, int *dstStride, int *should_dither)
+{
+
+ static int regflg = 0;
+ void *userdata[CL_PARAM_NUM];
+ userdata[0] = (void *)c;
+ userdata[1] = (void *)dst;
+ userdata[2] = (void *)src;
+ userdata[3] = (void *)srcStride;
+ userdata[4] = (void *)dstStride;
+ userdata[5] = (void *)should_dither;
+
+ if( regflg==0 )
+ {
+ int st = hb_register_kernel_wrapper( "scale_opencl", av_scale_frame_func);
+ if( !st )
+ {
+ hb_log( "register kernel[%s] faild %d", "scale_opencl",st );
+ return;
+ }
+ regflg++;
+ }
+
+ if( !hb_run_kernel( "scale_opencl", userdata ))
+ {
+ hb_log("run kernel function[%s] faild", "scale_opencl_func" );
+ return;
+ }
+}
+
+#endif
diff --git a/libhb/scale_kernel.h b/libhb/scale_kernel.h new file mode 100644 index 000000000..ce413382f --- /dev/null +++ b/libhb/scale_kernel.h @@ -0,0 +1,20 @@ +/* scale_kernel.h + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + + */ + +#ifndef _H_SCALE_KERNEL_H +#define _H_SCALE_KERNEL_H +#ifdef USE_OPENCL +void av_scale_frame(ScaleContext *c, void *dst, void *src, int *srcStride, int *dstStride, int *should_dither); +#endif +#endif diff --git a/libhb/stream.c b/libhb/stream.c index 994cb70de..3512b2d93 100644 --- a/libhb/stream.c +++ b/libhb/stream.c @@ -16,6 +16,7 @@ #include "lang.h" #include "a52dec/a52.h" #include "libbluray/bluray.h" +#include "vadxva2.h" #define min(a, b) a < b ? a : b #define HB_MAX_PROBE_SIZE (1*1024*1024) @@ -49,7 +50,7 @@ typedef struct { static const stream2codec_t st2codec[256] = { st(0x00, U, 0, 0, NULL), st(0x01, V, WORK_DECMPEG2, 0, "MPEG1"), - st(0x02, V, WORK_DECMPEG2, 0, "MPEG2"), + st(0x02, V, WORK_DECMPEG2, AV_CODEC_ID_MPEG2VIDEO, "MPEG2"), st(0x03, A, HB_ACODEC_FFMPEG, AV_CODEC_ID_MP2, "MPEG1"), st(0x04, A, HB_ACODEC_FFMPEG, AV_CODEC_ID_MP2, "MPEG2"), st(0x05, N, 0, 0, "ISO 13818-1 private section"), @@ -609,6 +610,10 @@ static int hb_stream_get_type(hb_stream_t *stream) if ( fread(buf, 1, sizeof(buf), stream->file_handle) == sizeof(buf) ) { +#ifdef USE_HWD + if ( hb_gui_use_hwd_flag == 1 ) + return 0; +#endif int psize; if ( ( psize = hb_stream_check_for_ts(buf) ) != 0 ) { @@ -1096,7 +1101,28 @@ hb_title_t * hb_stream_title_scan(hb_stream_t *stream, hb_title_t * title) { hb_log( "transport stream missing PCRs - using video DTS instead" ); } - +#ifdef USE_HWD + hb_va_dxva2_t * dxva2 = NULL; + dxva2 = hb_va_create_dxva2( dxva2, title->video_codec_param ); + if ( dxva2 ) + { + title->hwd_support = 1; + hb_va_close(dxva2); + dxva2 = NULL; + } + else + title->hwd_support = 0; +#else + title->hwd_support = 0; +#endif +#ifdef USE_OPENCL + if ( hb_confirm_gpu_type() == 0 ) + title->opencl_support = 1; + else + title->opencl_support = 0; +#else + title->opencl_support = 0; +#endif // Height, width, rate and aspect ratio information is filled in // when the previews are built return title; @@ -5513,6 +5539,7 @@ static hb_title_t *ffmpeg_title_scan( hb_stream_t *stream, hb_title_t *title ) title->demuxer = HB_NULL_DEMUXER; title->video_codec = 0; int i; + int pix_fmt = -1; for (i = 0; i < ic->nb_streams; ++i ) { if ( ic->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO && @@ -5520,6 +5547,7 @@ static hb_title_t *ffmpeg_title_scan( hb_stream_t *stream, hb_title_t *title ) title->video_codec == 0 ) { AVCodecContext *context = ic->streams[i]->codec; + pix_fmt = context->pix_fmt; if ( context->pix_fmt != AV_PIX_FMT_YUV420P && !sws_isSupportedInput( context->pix_fmt ) ) { @@ -5626,6 +5654,30 @@ static hb_title_t *ffmpeg_title_scan( hb_stream_t *stream, hb_title_t *title ) chapter->seconds = title->seconds; hb_list_add( title->list_chapter, chapter ); } +#ifdef USE_HWD + hb_va_dxva2_t * dxva2 = NULL; + dxva2 = hb_va_create_dxva2( dxva2, title->video_codec_param ); + if (dxva2) + { + title->hwd_support = 1; + hb_va_close(dxva2); + dxva2 = NULL; + } + else + title->hwd_support = 0; + if ( hb_check_hwd_fmt(pix_fmt) == 0) + title->hwd_support = 0; +#else + title->hwd_support = 0; +#endif +#ifdef USE_OPENCL + if (hb_confirm_gpu_type() == 0) + title->opencl_support = 1; + else + title->opencl_support = 0; +#else + title->opencl_support = 0; +#endif return title; } diff --git a/libhb/vadxva2.c b/libhb/vadxva2.c new file mode 100644 index 000000000..4137b63c6 --- /dev/null +++ b/libhb/vadxva2.c @@ -0,0 +1,812 @@ +/* vadxva2.c + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + */ +#include "vadxva2.h" + +#ifdef USE_OPENCL +#if defined(__APPLE__) +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +#include "oclnv12toyuv.h" +#include "scale.h" +#endif + +#ifdef USE_HWD + +static int hb_va_setup( hb_va_dxva2_t *dxva2, void **hw, int width, int height ); +static int hb_va_get( hb_va_dxva2_t *dxva2, AVFrame *frame ); +static int hb_d3d_create_device( hb_va_dxva2_t *dxva2 ); +static void hb_d3d_destroy_device( hb_va_dxva2_t *dxvva2 ); +static int hb_d3d_create_device_manager( hb_va_dxva2_t *dxva2 ); +static void hb_d3d_destroy_device_manager( hb_va_dxva2_t *dxva2 ); +static int hb_dx_create_video_service( hb_va_dxva2_t *dxva2 ); +static void hb_dx_destroy_video_service( hb_va_dxva2_t *dxva2 ); +static int hb_dx_find_video_service_conversion( hb_va_dxva2_t *dxva2, GUID *input, D3DFORMAT *output ); +static int hb_dx_create_video_decoder( hb_va_dxva2_t *dxva2, int codec_id, const hb_title_t* fmt ); +static void hb_dx_create_video_conversion( hb_va_dxva2_t *dxva2 ); +static const hb_d3d_format_t *hb_d3d_find_format( D3DFORMAT format ); +static const hb_dx_mode_t *hb_dx_find_mode( const GUID *guid ); +static void hb_dx_destroy_video_decoder( hb_va_dxva2_t *dxva2 ); +/** + * It destroys a Direct3D device manager + */ +static void hb_d3d_destroy_device_manager( hb_va_dxva2_t *dxva2 ) +{ + if( dxva2->devmng ) + IDirect3DDeviceManager9_Release( dxva2->devmng ); +} +/** + * It releases a Direct3D device and its resources. + */ +static void hb_d3d_destroy_device( hb_va_dxva2_t *dxva2 ) +{ + if( dxva2->d3ddev ) + IDirect3DDevice9_Release( dxva2->d3ddev ); + if( dxva2->d3dobj ) + IDirect3D9_Release( dxva2->d3dobj ); +} +/** + * It destroys a DirectX video service + */ +static void hb_dx_destroy_video_service( hb_va_dxva2_t *dxva2 ) +{ + if( dxva2->device ) + IDirect3DDeviceManager9_CloseDeviceHandle( dxva2->devmng, dxva2->device ); + + if( dxva2->vs ) + IDirectXVideoDecoderService_Release( dxva2->vs ); +} + +static const hb_d3d_format_t *hb_d3d_find_format( D3DFORMAT format ) +{ + unsigned i; + for( i = 0; d3d_formats[i].name; i++ ) + { + if( d3d_formats[i].format == format ) + return &d3d_formats[i]; + } + return NULL; +} + +static void hb_dx_create_video_conversion( hb_va_dxva2_t *dxva2 ) +{ + switch( dxva2->render ) + { + case MAKEFOURCC( 'N', 'V', '1', '2' ): + dxva2->output = MAKEFOURCC( 'Y', 'V', '1', '2' ); + break; + default: + dxva2->output = dxva2->render; + break; + } +} + +void hb_va_release( hb_va_dxva2_t *dxva2, AVFrame *frame ) +{ + LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)frame->data[3]; + unsigned i; + for( i = 0; i < dxva2->surface_count; i++ ) + { + hb_va_surface_t *surface = &dxva2->surface[i]; + if( surface->d3d == d3d ) + surface->refcount--; + } +} + + +void hb_va_close( hb_va_dxva2_t *dxva2 ) +{ + hb_dx_destroy_video_decoder( dxva2 ); + hb_dx_destroy_video_service( dxva2 ); + hb_d3d_destroy_device_manager( dxva2 ); + hb_d3d_destroy_device( dxva2 ); + + if( dxva2->hdxva2_dll ) + FreeLibrary( dxva2->hdxva2_dll ); + if( dxva2->hd3d9_dll ) + FreeLibrary( dxva2->hd3d9_dll ); + +#ifdef USE_OPENCL + if ( dxva2->nv12toyuv_tmp_in ) + free( dxva2->nv12toyuv_tmp_in ); + if ( dxva2->nv12toyuv_tmp_out ) + free( dxva2->nv12toyuv_tmp_out ); +#endif + dxva2->description = NULL; + free( dxva2 ); +} + +/** + * It creates a DXVA2 decoder using the given video format + */ +static int hb_dx_create_video_decoder( hb_va_dxva2_t *dxva2, int codec_id, const hb_title_t* fmt ) +{ + dxva2->width = fmt->width; + dxva2->height = fmt->height; + dxva2->surface_width = (fmt->width + 15) & ~15; + dxva2->surface_height = (fmt->height + 15) & ~15; + switch( codec_id ) + { + case AV_CODEC_ID_H264: + dxva2->surface_count = 16 + 1; + break; + default: + dxva2->surface_count = 2 + 1; + break; + } + LPDIRECT3DSURFACE9 surface_list[VA_DXVA2_MAX_SURFACE_COUNT]; + if( FAILED( IDirectXVideoDecoderService_CreateSurface( dxva2->vs, + dxva2->surface_width, + dxva2->surface_height, + dxva2->surface_count - 1, + dxva2->render, + D3DPOOL_DEFAULT, + 0, + DXVA2_VideoDecoderRenderTarget, + surface_list, NULL ))) + { + hb_log( "dxva2:IDirectXVideoAccelerationService_CreateSurface failed" ); + dxva2->surface_count = 0; + return HB_WORK_ERROR; + } + + unsigned i; + for( i = 0; i<dxva2->surface_count; i++ ) + { + hb_va_surface_t *surface = &dxva2->surface[i]; + surface->d3d = surface_list[i]; + surface->refcount = 0; + surface->order = 0; + } + hb_log( "dxva2:CreateSurface succeed with %d, fmt (%dx%d) surfaces (%dx%d)", dxva2->surface_count, + fmt->width, fmt->height, dxva2->surface_width, dxva2->surface_height ); + DXVA2_VideoDesc dsc; + memset( &dsc, 0, sizeof(dsc)); + dsc.SampleWidth = fmt->width; + dsc.SampleHeight = fmt->height; + dsc.Format = dxva2->render; + + if( fmt->rate> 0 && fmt->rate_base> 0 ) + { + dsc.InputSampleFreq.Numerator = fmt->rate; + dsc.InputSampleFreq.Denominator = fmt->rate_base; + } + else + { + dsc.InputSampleFreq.Numerator = 0; + dsc.InputSampleFreq.Denominator = 0; + } + + dsc.OutputFrameFreq = dsc.InputSampleFreq; + dsc.UABProtectionLevel = FALSE; + dsc.Reserved = 0; + + /* FIXME I am unsure we can let unknown everywhere */ + DXVA2_ExtendedFormat *ext = &dsc.SampleFormat; + ext->SampleFormat = 0; //DXVA2_SampleUnknown; + ext->VideoChromaSubsampling = 0; //DXVA2_VideoChromaSubsampling_Unknown; + ext->NominalRange = 0; //DXVA2_NominalRange_Unknown; + ext->VideoTransferMatrix = 0; //DXVA2_VideoTransferMatrix_Unknown; + ext->VideoLighting = 0; //DXVA2_VideoLighting_Unknown; + ext->VideoPrimaries = 0; //DXVA2_VideoPrimaries_Unknown; + ext->VideoTransferFunction = 0; //DXVA2_VideoTransFunc_Unknown; + + /* List all configurations available for the decoder */ + UINT cfg_count = 0; + DXVA2_ConfigPictureDecode *cfg_list = NULL; + if( FAILED( IDirectXVideoDecoderService_GetDecoderConfigurations( dxva2->vs, &dxva2->input, &dsc, NULL, &cfg_count, &cfg_list ))) + { + hb_log( "dxva2:IDirectXVideoDecoderService_GetDecoderConfigurations failed" ); + return HB_WORK_ERROR; + } + hb_log( "dxva2:we got %d decoder configurations", cfg_count ); + + /* Select the best decoder configuration */ + int cfg_score = 0; + for( i = 0; i < cfg_count; i++ ) + { + const DXVA2_ConfigPictureDecode *cfg = &cfg_list[i]; + hb_log( "dxva2:configuration[%d] ConfigBitstreamRaw %d", i, cfg->ConfigBitstreamRaw ); + int score; + if( cfg->ConfigBitstreamRaw == 1 ) + score = 1; + else if( codec_id == AV_CODEC_ID_H264 && cfg->ConfigBitstreamRaw == 2 ) + score = 2; + else + continue; + if( IsEqualGUID( &cfg->guidConfigBitstreamEncryption, &DXVA_NoEncrypt )) + score += 16; + if( cfg_score < score ) + { + dxva2->cfg = *cfg; + cfg_score = score; + } + } + //my_release(cfg_list); + if( cfg_score <= 0 ) + { + hb_log( "dxva2:Failed to find a supported decoder configuration" ); + return HB_WORK_ERROR; + } + + /* Create the decoder */ + IDirectXVideoDecoder *decoder; + if( FAILED( IDirectXVideoDecoderService_CreateVideoDecoder( dxva2->vs, &dxva2->input, &dsc, &dxva2->cfg, surface_list, dxva2->surface_count, &decoder ))) + { + hb_log( "dxva2:IDirectXVideoDecoderService_CreateVideoDecoder failed" ); + return HB_WORK_ERROR; + } + dxva2->decoder = decoder; + hb_log( "dxva2:IDirectXVideoDecoderService_CreateVideoDecoder succeed" ); + return HB_WORK_OK; +} + +typedef HWND (WINAPI *PROCGETSHELLWND)(); +/** + * It creates a DirectX video service + */ +static int hb_d3d_create_device( hb_va_dxva2_t *dxva2 ) +{ + LPDIRECT3D9 (WINAPI *Create9)( UINT SDKVersion ); + Create9 = (void*)GetProcAddress( dxva2->hd3d9_dll, TEXT( "Direct3DCreate9" )); + if( !Create9 ) + { + hb_log( "dxva2:Cannot locate reference to Direct3DCreate9 ABI in DLL" ); + return HB_WORK_ERROR; + } + LPDIRECT3D9 d3dobj; + d3dobj = Create9( D3D_SDK_VERSION ); + if( !d3dobj ) + { + hb_log( "dxva2:Direct3DCreate9 failed" ); + return HB_WORK_ERROR; + } + dxva2->d3dobj = d3dobj; + D3DADAPTER_IDENTIFIER9 *d3dai = &dxva2->d3dai; + if( FAILED( IDirect3D9_GetAdapterIdentifier( dxva2->d3dobj, D3DADAPTER_DEFAULT, 0, d3dai ))) + { + hb_log( "dxva2:IDirect3D9_GetAdapterIdentifier failed" ); + memset( d3dai, 0, sizeof(*d3dai)); + } + + PROCGETSHELLWND GetShellWindow; + HMODULE hUser32 = GetModuleHandle( "user32" ); + GetShellWindow = (PROCGETSHELLWND) + GetProcAddress( hUser32, "GetShellWindow" ); + + D3DPRESENT_PARAMETERS *d3dpp = &dxva2->d3dpp; + memset( d3dpp, 0, sizeof(*d3dpp)); + d3dpp->Flags = D3DPRESENTFLAG_VIDEO; + d3dpp->Windowed = TRUE; + d3dpp->hDeviceWindow = NULL; + d3dpp->SwapEffect = D3DSWAPEFFECT_DISCARD; + d3dpp->MultiSampleType = D3DMULTISAMPLE_NONE; + d3dpp->PresentationInterval = D3DPRESENT_INTERVAL_DEFAULT; + d3dpp->BackBufferCount = 0; /* FIXME what to put here */ + d3dpp->BackBufferFormat = D3DFMT_X8R8G8B8; /* FIXME what to put here */ + d3dpp->BackBufferWidth = 0; + d3dpp->BackBufferHeight = 0; + d3dpp->EnableAutoDepthStencil = FALSE; + + LPDIRECT3DDEVICE9 d3ddev; + //if (FAILED(IDirect3D9_CreateDevice(d3dobj, D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, GetShellWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING|D3DCREATE_MULTITHREADED, d3dpp, &d3ddev))) + if( FAILED( IDirect3D9_CreateDevice( d3dobj, + D3DADAPTER_DEFAULT, + D3DDEVTYPE_HAL, + GetShellWindow(), + D3DCREATE_HARDWARE_VERTEXPROCESSING|D3DCREATE_MULTITHREADED, + d3dpp, + &d3ddev ))) + { + hb_log( "dxva2:IDirect3D9_CreateDevice failed" ); + return HB_WORK_ERROR; + } + dxva2->d3ddev = d3ddev; + + return HB_WORK_OK; +} +/** + * It creates a Direct3D device manager + */ +static int hb_d3d_create_device_manager( hb_va_dxva2_t *dxva2 ) +{ + HRESULT(WINAPI *CreateDeviceManager9)( UINT *pResetToken, IDirect3DDeviceManager9 ** ); + CreateDeviceManager9 = (void*)GetProcAddress( dxva2->hdxva2_dll, TEXT( "DXVA2CreateDirect3DDeviceManager9" )); + + if( !CreateDeviceManager9 ) + { + hb_log( "dxva2:cannot load function" ); + return HB_WORK_ERROR; + } + + UINT token; + IDirect3DDeviceManager9 *devmng; + if( FAILED( CreateDeviceManager9( &token, &devmng ))) + { + hb_log( "dxva2:OurDirect3DCreateDeviceManager9 failed" ); + return HB_WORK_ERROR; + } + dxva2->token = token; + dxva2->devmng = devmng; + + long hr = IDirect3DDeviceManager9_ResetDevice( devmng, dxva2->d3ddev, token ); + if( FAILED( hr )) + { + hb_log( "dxva2:IDirect3DDeviceManager9_ResetDevice failed: %08x", (unsigned)hr ); + return HB_WORK_ERROR; + } + return HB_WORK_OK; +} +/** + * It creates a DirectX video service + */ +static int hb_dx_create_video_service( hb_va_dxva2_t *dxva2 ) +{ + HRESULT (WINAPI *CreateVideoService)( IDirect3DDevice9 *, REFIID riid, void **ppService ); + CreateVideoService = (void*)GetProcAddress( dxva2->hdxva2_dll, TEXT( "DXVA2CreateVideoService" )); + + if( !CreateVideoService ) + { + hb_log( "dxva2:cannot load function" ); + return HB_WORK_ERROR; + } + + HRESULT hr; + + HANDLE device; + hr = IDirect3DDeviceManager9_OpenDeviceHandle( dxva2->devmng, &device ); + if( FAILED( hr )) + { + hb_log( "dxva2:OpenDeviceHandle failed" ); + return HB_WORK_ERROR; + } + dxva2->device = device; + + IDirectXVideoDecoderService *vs; + hr = IDirect3DDeviceManager9_GetVideoService( dxva2->devmng, device, &IID_IDirectXVideoDecoderService, (void*)&vs ); + if( FAILED( hr )) + { + hb_log( "dxva2:GetVideoService failed" ); + return HB_WORK_ERROR; + } + dxva2->vs = vs; + + return HB_WORK_OK; +} +/** + * Find the best suited decoder mode GUID and render format. + */ +static int hb_dx_find_video_service_conversion( hb_va_dxva2_t *dxva2, GUID *input, D3DFORMAT *output ) +{ + unsigned int input_count = 0; + GUID *input_list = NULL; + if( FAILED( IDirectXVideoDecoderService_GetDecoderDeviceGuids( dxva2->vs, &input_count, &input_list ))) + { + hb_log( "dxva2:IDirectXVideoDecoderService_GetDecoderDeviceGuids failed" ); + return HB_WORK_ERROR; + } + unsigned i, j; + for( i = 0; i < input_count; i++ ) + { + const GUID *g = &input_list[i]; + const hb_dx_mode_t *mode = hb_dx_find_mode( g ); + } + + for( i = 0; dxva2_modes[i].name; i++ ) + { + const hb_dx_mode_t *mode = &dxva2_modes[i]; + if( !mode->codec || mode->codec != dxva2->codec_id ) + continue; + + int is_suported = 0; + const GUID *g; + for( g = &input_list[0]; !is_suported && g < &input_list[input_count]; g++ ) + { + is_suported = IsEqualGUID( mode->guid, g ); + } + if( !is_suported ) + continue; + + unsigned int output_count = 0; + D3DFORMAT *output_list = NULL; + if( FAILED( IDirectXVideoDecoderService_GetDecoderRenderTargets( dxva2->vs, mode->guid, &output_count, &output_list ))) + { + hb_log( "dxva2:IDirectXVideoDecoderService_GetDecoderRenderTargets failed" ); + continue; + } + for( j = 0; j < output_count; j++ ) + { + const D3DFORMAT f = output_list[j]; + const hb_d3d_format_t *format = hb_d3d_find_format( f ); + if( format ) + { + //hb_log( "dxva2:%s is supported for output", format->name ); + } + else + { + hb_log( "dxvar2:%d is supported for output (%4.4s)", f, (const char*)&f ); + } + } + + for( j = 0; d3d_formats[j].name; j++ ) + { + const hb_d3d_format_t *format = &d3d_formats[j]; + int is_suported = 0; + unsigned k; + for( k = 0; !is_suported && k < output_count; k++ ) + { + is_suported = format->format == output_list[k]; + } + if( !is_suported ) + continue; + *input = *mode->guid; + *output = format->format; + return HB_WORK_OK; + } + } + return HB_WORK_ERROR; +} +static const hb_dx_mode_t *hb_dx_find_mode( const GUID *guid ) +{ + unsigned i; + for( i = 0; dxva2_modes[i].name; i++ ) + { + if( IsEqualGUID( dxva2_modes[i].guid, guid )) + return &dxva2_modes[i]; + } + return NULL; +} + + +static void hb_dx_destroy_video_decoder( hb_va_dxva2_t *dxva2 ) +{ + if( dxva2->decoder ) + IDirectXVideoDecoder_Release( dxva2->decoder ); + dxva2->decoder = NULL; + + unsigned i; + for( i = 0; i<dxva2->surface_count; i++ ) + IDirect3DSurface9_Release( dxva2->surface[i].d3d ); + dxva2->surface_count = 0; +} +/** + * setup dxva2 +*/ +static int hb_va_setup( hb_va_dxva2_t *dxva2, void **hw, int width, int height ) +{ + if( dxva2->width == width && dxva2->height == height && dxva2->decoder ) + goto ok; + + hb_dx_destroy_video_decoder( dxva2 ); + *hw = NULL; + dxva2->i_chroma = 0; + + if( width <= 0 || height <= 0 ) + return HB_WORK_ERROR; + + hb_title_t fmt; + memset( &fmt, 0, sizeof(fmt)); + fmt.width = width; + fmt.height = height; + + if( hb_dx_create_video_decoder( dxva2, dxva2->codec_id, &fmt ) == HB_WORK_ERROR ) + return HB_WORK_ERROR; + dxva2->hw.decoder = dxva2->decoder; + dxva2->hw.cfg = &dxva2->cfg; + dxva2->hw.surface_count = dxva2->surface_count; + dxva2->hw.surface = dxva2->hw_surface; + + unsigned i; + for( i = 0; i < dxva2->surface_count; i++ ) + dxva2->hw.surface[i] = dxva2->surface[i].d3d; + + hb_dx_create_video_conversion( dxva2 ); + +ok: + *hw = &dxva2->hw; + const hb_d3d_format_t *output = hb_d3d_find_format( dxva2->output ); + dxva2->i_chroma = output->codec; + return HB_WORK_OK; + +} + +static int hb_va_get( hb_va_dxva2_t *dxva2, AVFrame *frame ) +{ + unsigned i, old; + for( i = 0, old = 0; i < dxva2->surface_count; i++ ) + { + hb_va_surface_t *surface = &dxva2->surface[i]; + if( !surface->refcount ) + break; + if( surface->order < dxva2->surface[old].order ) + old = i; + } + if( i >= dxva2->surface_count ) + i = old; + + hb_va_surface_t *surface = &dxva2->surface[i]; + + surface->refcount = 1; + surface->order = dxva2->surface_order++; + + for( i = 0; i < 4; i++ ) + { + frame->data[i] = NULL; + frame->linesize[i] = 0; + if( i == 0 || i == 3 ) + frame->data[i] = (void*)surface->d3d; + } + return HB_WORK_OK; +} +/** + * nv12 to yuv of c reference + */ +static void hb_copy_from_nv12( uint8_t *dst, uint8_t *src[2], size_t src_pitch[2], unsigned width, unsigned height ) +{ + unsigned int i, j; + uint8_t *dstU, *dstV; + dstU = dst + width*height; + dstV = dstU + width*height/4; + unsigned int heithtUV, widthUV; + heithtUV = height/2; + widthUV = width/2; + + for( i = 0; i < height; i++ ) //Y + { + memcpy( dst + i * width, src[0] + i * src_pitch[0], width ); + } + for( i = 0; i < heithtUV; i++ ) + { + for( j = 0; j < widthUV; j++ ) + { + dstU[i * widthUV + j] = *(src[1] + i * src_pitch[1] + 2 * j); + dstV[i *widthUV + j] = *(src[1] + i * src_pitch[1] + 2 * j + 1); + } + } +} + +#ifdef USE_OPENCL +void hb_init_filter( cl_mem src, int srcwidth, int srcheight, uint8_t* dst, int dstwidth, int dstheight, int *crop ) +{ + T_FilterLink fl = {0}; + int STEP = srcwidth * srcheight * 3 / 2; + int OUTSTEP = dstwidth * dstheight * 3 / 2; + int HEIGHT = srcheight; + int LINESIZEY = srcwidth; + int LINESIZEUV = srcwidth / 2; + + cl_mem cl_outbuf; + + if( !hb_create_buffer( &(cl_outbuf), CL_MEM_WRITE_ONLY, OUTSTEP ) ) + { + hb_log("av_create_buffer cl_outbuf Error"); + return; + } + + fl.cl_outbuf = cl_outbuf; + + scale_run( src, fl.cl_outbuf, LINESIZEY, LINESIZEUV, HEIGHT ); + + hb_read_opencl_buffer( fl.cl_outbuf, dst, OUTSTEP ); + CL_FREE( cl_outbuf ); + + return; +} +#endif +/** + * lock frame data form surface. + * nv12 to yuv with opencl and with C reference + * scale with opencl + */ +int hb_va_extract( hb_va_dxva2_t *dxva2, uint8_t *dst, AVFrame *frame, int job_w, int job_h, int *crop, hb_oclscale_t *os, int use_opencl, int use_decomb, int use_detelecine ) + +{ + LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)frame->data[3]; + D3DLOCKED_RECT lock; + if( FAILED( IDirect3DSurface9_LockRect( d3d, &lock, NULL, D3DLOCK_READONLY ))) + { + hb_log( "dxva2:Failed to lock surface" ); + return HB_WORK_ERROR; + } + + if( dxva2->render == MAKEFOURCC( 'N', 'V', '1', '2' )) + { + uint8_t *plane[2] = + { + lock.pBits, + (uint8_t*)lock.pBits + lock.Pitch * dxva2->surface_height + }; + size_t pitch[2] = + { + lock.Pitch, + lock.Pitch, + }; + + hb_copy_from_nv12( dst, plane, pitch, dxva2->width, dxva2->height ); + } + IDirect3DSurface9_UnlockRect( d3d ); + return HB_WORK_OK; +} + +/** + * create dxva2 service + * load library D3D9.dll + */ +hb_va_dxva2_t * hb_va_create_dxva2( hb_va_dxva2_t *dxva2, int codec_id ) +{ + if( dxva2 ) + { + hb_va_close( dxva2 ); + dxva2 = NULL; + } + + hb_va_dxva2_t *dxva = calloc( 1, sizeof(*dxva) ); + if( !dxva ) return NULL; + dxva->codec_id = codec_id; + + dxva->hd3d9_dll = LoadLibrary( TEXT( "D3D9.DLL" ) ); + if( !dxva->hd3d9_dll ) + { + hb_log( "dxva2:cannot load d3d9.dll" ); + goto error; + } + dxva->hdxva2_dll = LoadLibrary( TEXT( "DXVA2.DLL" ) ); + if( !dxva->hdxva2_dll ) + { + hb_log( "dxva2:cannot load DXVA2.dll" ); + goto error; + } + + if( hb_d3d_create_device( dxva ) == HB_WORK_ERROR ) + { + hb_log( "dxva2:Failed to create Direct3D device" ); + goto error; + } + + if( hb_d3d_create_device_manager( dxva ) == HB_WORK_ERROR ) + { + hb_log( "dxva2:D3dCreateDeviceManager failed" ); + goto error; + } + + + if( hb_dx_create_video_service( dxva ) == HB_WORK_ERROR ) + { + hb_log( "dxva2:DxCreateVideoService failed" ); + goto error; + } + + if( hb_dx_find_video_service_conversion( dxva, &dxva->input, &dxva->render ) == HB_WORK_ERROR ) + { + hb_log( "dxva2:DxFindVideoServiceConversion failed" ); + goto error; + } + + dxva->do_job = HB_WORK_OK; + dxva->description = "DXVA2"; + + return dxva; + +error: + hb_va_close( dxva ); + return NULL; +} + +void hb_va_new_dxva2( hb_va_dxva2_t *dxva2, AVCodecContext *p_context ) +{ + if( p_context->width > 0 && p_context->height > 0 ) + { + if( hb_va_setup( dxva2, &p_context->hwaccel_context, p_context->width, p_context->height ) == HB_WORK_ERROR ) + { + hb_log( "dxva2:hb_va_Setup failed" ); + hb_va_close( dxva2 ); + dxva2 = NULL; + } + } + if( dxva2 ) + { + dxva2->input_pts[0] = 0; + dxva2->input_pts[1] = 0; + if( dxva2->description ) + hb_log( "dxva2:Using %s for hardware decoding", dxva2->description ); + p_context->draw_horiz_band = NULL; + } + +} + +char* hb_get_pix_fmt_name( int pix_fmt ) +{ + static const char *ppsz_name[AV_PIX_FMT_NB] = + { + [AV_PIX_FMT_VDPAU_H264] = "AV_PIX_FMT_VDPAU_H264", + [AV_PIX_FMT_VAAPI_IDCT] = "AV_PIX_FMT_VAAPI_IDCT", + [AV_PIX_FMT_VAAPI_VLD] = "AV_PIX_FMT_VAAPI_VLD", + [AV_PIX_FMT_VAAPI_MOCO] = "AV_PIX_FMT_VAAPI_MOCO", + [AV_PIX_FMT_DXVA2_VLD] = "AV_PIX_FMT_DXVA2_VLD", + [AV_PIX_FMT_YUYV422] = "AV_PIX_FMT_YUYV422", + [AV_PIX_FMT_YUV420P] = "AV_PIX_FMT_YUV420P", + }; + + return ppsz_name[pix_fmt]; +} + +enum PixelFormat hb_ffmpeg_get_format( AVCodecContext *p_context, const enum PixelFormat *pi_fmt ) +{ + int i; + for( i = 0; pi_fmt[i] != AV_PIX_FMT_NONE; i++ ) + { + hb_log( "dxva2:Available decoder output format %d (%s)", pi_fmt[i], hb_get_pix_fmt_name(pi_fmt[i]) ? : "Unknown" ); + if( pi_fmt[i] == AV_PIX_FMT_DXVA2_VLD ) + { + return pi_fmt[i]; + } + } + return avcodec_default_get_format( p_context, pi_fmt ); +} + +int hb_va_get_frame_buf( hb_va_dxva2_t *dxva2, AVCodecContext *p_context, AVFrame *frame ) +{ + frame->type = FF_BUFFER_TYPE_USER; + if( hb_va_get( dxva2, frame ) == HB_WORK_ERROR ) + { + hb_log( "VaGrabSurface failed" ); + return HB_WORK_ERROR; + } + return HB_WORK_OK; + +} + +int hb_check_hwd_fmt( int fmt ) +{ + int result = 1; + switch ( fmt ) + { + case AV_PIX_FMT_YUV420P16LE: + case AV_PIX_FMT_YUV420P16BE: + case AV_PIX_FMT_YUV422P16LE: + case AV_PIX_FMT_YUV422P16BE: + case AV_PIX_FMT_YUV444P16LE: + case AV_PIX_FMT_YUV444P16BE: + case AV_PIX_FMT_YUV420P9BE: + case AV_PIX_FMT_YUV420P9LE: + case AV_PIX_FMT_YUV420P10BE: + case AV_PIX_FMT_YUV420P10LE: + case AV_PIX_FMT_YUV422P10BE: + case AV_PIX_FMT_YUV422P10LE: + case AV_PIX_FMT_YUV444P9BE: + case AV_PIX_FMT_YUV444P9LE: + case AV_PIX_FMT_YUV444P10BE: + case AV_PIX_FMT_YUV444P10LE: + case AV_PIX_FMT_YUV422P9BE: + case AV_PIX_FMT_YUV422P9LE: + case AV_PIX_FMT_GBRP9BE: + case AV_PIX_FMT_GBRP9LE: + case AV_PIX_FMT_GBRP10BE: + case AV_PIX_FMT_GBRP10LE: + case AV_PIX_FMT_GBRP16BE: + case AV_PIX_FMT_GBRP16LE: + case AV_PIX_FMT_YUVA420P9BE: + case AV_PIX_FMT_YUVA420P9LE: + case AV_PIX_FMT_YUVA422P9BE: + case AV_PIX_FMT_YUVA422P9LE: + case AV_PIX_FMT_YUVA444P9BE: + case AV_PIX_FMT_YUVA444P9LE: + case AV_PIX_FMT_YUVA420P10BE: + case AV_PIX_FMT_YUVA420P10LE: + result = 0; + } + return result; +} +#endif diff --git a/libhb/vadxva2.h b/libhb/vadxva2.h new file mode 100644 index 000000000..cd879b974 --- /dev/null +++ b/libhb/vadxva2.h @@ -0,0 +1,213 @@ +/* vadxva2.h + + Copyright (c) 2003-2012 HandBrake Team + This file is part of the HandBrake source code + Homepage: <http://handbrake.fr/>. + It may be used under the terms of the GNU General Public License v2. + For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html + + Authors: Peng Gao <[email protected]> <http://www.multicorewareinc.com/> + Li Cao <[email protected]> <http://www.multicorewareinc.com/> + + */ + +#ifndef VA_DXVA2_H +#define VA_DXVA2_H + +#ifdef USE_HWD +#include "hbffmpeg.h" +#include "d3d9.h" +#include "libavcodec/dxva2.h" +#include "dxva2api.h" +#include "common.h" +#include "openclwrapper.h" + +#define HB_FOURCC( a, b, c, d ) ( ((uint32_t)a) | ( ((uint32_t)b) << 8 ) | ( ((uint32_t)c) << 16 ) | ( ((uint32_t)d) << 24 ) ) +#define MAKEFOURCC( a, b, c, d ) ((DWORD)(BYTE)(a) | ((DWORD)(BYTE)(b) << 8) | ((DWORD)(BYTE)(c) << 16) | ((DWORD)(BYTE)(d) << 24 )) +#define HB_CODEC_YV12 HB_FOURCC( 'Y', 'V', '1', '2' ) +#define HB_CODEC_NV12 HB_FOURCC( 'N', 'V', '1', '2' ) +#define DXVA2_E_NOT_INITIALIZED MAKE_HRESULT( 1, 4, 4096 ) +#define DXVA2_E_NEW_VIDEO_DEVICE MAKE_HRESULT( 1, 4, 4097 ) +#define DXVA2_E_VIDEO_DEVICE_LOCKED MAKE_HRESULT( 1, 4, 4098 ) +#define DXVA2_E_NOT_AVAILABLE MAKE_HRESULT( 1, 4, 4099 ) +#define VA_DXVA2_MAX_SURFACE_COUNT (64) + +static const GUID DXVA_NoEncrypt = { 0x1b81bed0, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID IID_IDirectXVideoDecoderService = {0xfc51a551, 0xd5e7, 0x11d9, {0xaf, 0x55, 0x00, 0x05, 0x4e, 0x43, 0xff, 0x02}}; +static const GUID DXVA2_ModeMPEG2_MoComp = { 0xe6a9f44b, 0x61b0, 0x4563, {0x9e, 0xa4, 0x63, 0xd2, 0xa3, 0xc6, 0xfe, 0x66} }; +static const GUID DXVA2_ModeMPEG2_IDCT = { 0xbf22ad00, 0x03ea, 0x4690, {0x80, 0x77, 0x47, 0x33, 0x46, 0x20, 0x9b, 0x7e} }; +static const GUID DXVA2_ModeMPEG2_VLD = { 0xee27417f, 0x5e28, 0x4e65, {0xbe, 0xea, 0x1d, 0x26, 0xb5, 0x08, 0xad, 0xc9} }; +static const GUID DXVA2_ModeH264_A = { 0x1b81be64, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeH264_B = { 0x1b81be65, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeH264_C = { 0x1b81be66, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeH264_D = { 0x1b81be67, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeH264_E = { 0x1b81be68, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeH264_F = { 0x1b81be69, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVADDI_Intel_ModeH264_A = { 0x604F8E64, 0x4951, 0x4c54, {0x88, 0xFE, 0xAB, 0xD2, 0x5C, 0x15, 0xB3, 0xD6} }; +static const GUID DXVADDI_Intel_ModeH264_C = { 0x604F8E66, 0x4951, 0x4c54, {0x88, 0xFE, 0xAB, 0xD2, 0x5C, 0x15, 0xB3, 0xD6} }; +static const GUID DXVADDI_Intel_ModeH264_E = { 0x604F8E68, 0x4951, 0x4c54, {0x88, 0xFE, 0xAB, 0xD2, 0x5C, 0x15, 0xB3, 0xD6} }; +static const GUID DXVA2_ModeWMV8_A = { 0x1b81be80, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeWMV8_B = { 0x1b81be81, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeWMV9_A = { 0x1b81be90, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeWMV9_B = { 0x1b81be91, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeWMV9_C = { 0x1b81be94, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeVC1_A = { 0x1b81beA0, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeVC1_B = { 0x1b81beA1, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeVC1_C = { 0x1b81beA2, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; +static const GUID DXVA2_ModeVC1_D = { 0x1b81beA3, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} }; + +typedef struct +{ + int width; + int height; + int rate; + int rate_base; + +}hb_dx_format; + +typedef struct +{ + LPDIRECT3DSURFACE9 d3d; + int refcount; + unsigned int order; + +} hb_va_surface_t; + +typedef struct +{ + uint8_t *base; + uint8_t *buffer; + size_t size; + +} hb_copy_cache_t; + +typedef struct +{ + const char *name; + D3DFORMAT format; + uint32_t codec; + +} hb_d3d_format_t; + +typedef struct +{ + const char *name; + const GUID *guid; + int codec; +} hb_dx_mode_t; + +typedef struct +{ + char *description; + int codec_id; + uint32_t i_chroma; + int width; + int height; + HINSTANCE hd3d9_dll; + HINSTANCE hdxva2_dll; + D3DPRESENT_PARAMETERS d3dpp; + LPDIRECT3D9 d3dobj; + D3DADAPTER_IDENTIFIER9 d3dai; + LPDIRECT3DDEVICE9 d3ddev; + UINT token; + IDirect3DDeviceManager9 *devmng; + HANDLE device; + IDirectXVideoDecoderService *vs; + GUID input; + D3DFORMAT render; + DXVA2_ConfigPictureDecode cfg; + IDirectXVideoDecoder *decoder; + D3DFORMAT output; + struct dxva_context hw; + unsigned surface_count; + unsigned surface_order; + int surface_width; + int surface_height; + uint32_t surface_chroma; + hb_va_surface_t surface[VA_DXVA2_MAX_SURFACE_COUNT]; + LPDIRECT3DSURFACE9 hw_surface[VA_DXVA2_MAX_SURFACE_COUNT]; + IDirectXVideoProcessorService *ps; + IDirectXVideoProcessor *vp; + int64_t input_pts[2]; + int64_t input_dts; + int do_job; + + // running nv12toyuv kernel. +#ifdef USE_OPENCL + cl_kernel nv12toyuv; + cl_mem cl_mem_nv12; + cl_mem cl_mem_yuv; + uint8_t * nv12toyuv_tmp_in; + uint8_t * nv12toyuv_tmp_out; +#endif +} hb_va_dxva2_t; + +typedef struct FilterLink_T +{ +#ifdef USE_OPENCL + cl_mem cl_inbuf; + cl_mem cl_outbuf; +#endif + uint8_t *mem_inbuf; + uint8_t *mem_outbuf; + int width; + int height; + int linesizeY; + int linesizeUV; + int inmemdataflag; + int outmemdataflag; + int incldataflag; + int outcldataflag; + int framenum; + int outputSize; +} T_FilterLink; + +static const hb_d3d_format_t d3d_formats[] = +{ + { "YV12", MAKEFOURCC( 'Y', 'V', '1', '2' ), HB_CODEC_YV12 }, + { "NV12", MAKEFOURCC( 'N', 'V', '1', '2' ), HB_CODEC_NV12 }, + { NULL, 0, 0 } +}; + +static const hb_dx_mode_t dxva2_modes[] = +{ + { "DXVA2_ModeMPEG2_VLD", &DXVA2_ModeMPEG2_VLD, AV_CODEC_ID_MPEG2VIDEO }, + { "DXVA2_ModeMPEG2_MoComp", &DXVA2_ModeMPEG2_MoComp, 0 }, + { "DXVA2_ModeMPEG2_IDCT", &DXVA2_ModeMPEG2_IDCT, 0 }, + + { "H.264 variable-length decoder (VLD), FGT", &DXVA2_ModeH264_F, AV_CODEC_ID_H264 }, + { "H.264 VLD, no FGT", &DXVA2_ModeH264_E, AV_CODEC_ID_H264 }, + { "H.264 VLD, no FGT (Intel)", &DXVADDI_Intel_ModeH264_E, AV_CODEC_ID_H264 }, + { "H.264 IDCT, FGT", &DXVA2_ModeH264_D, 0 }, + { "H.264 inverse discrete cosine transform (IDCT), no FGT", &DXVA2_ModeH264_C, 0 }, + { "H.264 inverse discrete cosine transform (IDCT), no FGT (Intel)", &DXVADDI_Intel_ModeH264_C, 0 }, + { "H.264 MoComp, FGT", &DXVA2_ModeH264_B, 0 }, + { "H.264 motion compensation (MoComp), no FGT", &DXVA2_ModeH264_A, 0 }, + { "H.264 motion compensation (MoComp), no FGT (Intel)", &DXVADDI_Intel_ModeH264_A, 0 }, + + { "Windows Media Video 8 MoComp", &DXVA2_ModeWMV8_B, 0 }, + { "Windows Media Video 8 post processing", &DXVA2_ModeWMV8_A, 0 }, + + { "Windows Media Video 9 IDCT", &DXVA2_ModeWMV9_C, 0 }, + { "Windows Media Video 9 MoComp", &DXVA2_ModeWMV9_B, 0 }, + { "Windows Media Video 9 post processing", &DXVA2_ModeWMV9_A, 0 }, + + { "VC-1 VLD", &DXVA2_ModeVC1_D, AV_CODEC_ID_VC1 }, + { "VC-1 VLD", &DXVA2_ModeVC1_D, AV_CODEC_ID_WMV3 }, + { "VC-1 IDCT", &DXVA2_ModeVC1_C, 0 }, + { "VC-1 MoComp", &DXVA2_ModeVC1_B, 0 }, + { "VC-1 post processing", &DXVA2_ModeVC1_A, 0 }, + + { NULL, NULL, 0 } +}; + +int hb_va_get_frame_buf( hb_va_dxva2_t *dxva2, AVCodecContext *p_context, AVFrame *frame ); +int hb_va_extract( hb_va_dxva2_t *dxva2, uint8_t *dst, AVFrame *frame, int job_w, int job_h, int *crop, hb_oclscale_t *os, int use_opencl, int use_decomb, int use_detelecine ); +enum PixelFormat hb_ffmpeg_get_format( AVCodecContext *, const enum PixelFormat * ); +hb_va_dxva2_t *hb_va_create_dxva2( hb_va_dxva2_t *dxva2, int codec_id ); +void hb_va_new_dxva2( hb_va_dxva2_t *dxva2, AVCodecContext *p_context ); +void hb_va_release( hb_va_dxva2_t *dxva2, AVFrame *frame ); +void hb_va_close( hb_va_dxva2_t *dxva2 ); +int hb_check_hwd_fmt( int fmt ); +#endif +#endif diff --git a/libhb/work.c b/libhb/work.c index 7f00ddab3..d44f763be 100644 --- a/libhb/work.c +++ b/libhb/work.c @@ -10,6 +10,7 @@ #include "hb.h" #include "a52dec/a52.h" #include "libavformat/avformat.h" +#include "openclwrapper.h" #ifdef USE_QSV #include "qsv_common.h" @@ -532,8 +533,24 @@ static void do_job(hb_job_t *job) job->list_work = hb_list_init(); +#ifdef USE_OPENCL + /* init opencl environment */ + if (job->use_opencl) + job->use_opencl = !hb_init_opencl_run_env(0, NULL, "-I."); +#else + job->use_opencl = 0; +#endif + hb_log( "starting job" ); + if (job->use_opencl || job->use_hwd) + { + hb_log("Using GPU: Yes."); + } + else + { + hb_log("Using GPU: No."); + } /* Look for the scanned subtitle in the existing subtitle list * select_subtitle implies that we did a scan. */ if( !job->indepth_scan && interjob->select_subtitle ) @@ -792,6 +809,9 @@ static void do_job(hb_job_t *job) init.pix_fmt = AV_PIX_FMT_YUV420P; init.width = title->width; init.height = title->height; +#ifdef USE_OPENCL + init.use_dxva = hb_use_dxva( title ); +#endif init.par_width = job->anamorphic.par_width; init.par_height = job->anamorphic.par_height; memcpy(init.crop, title->crop, sizeof(int[4])); @@ -1093,6 +1113,7 @@ static void do_job(hb_job_t *job) title->video_codec_param = AV_CODEC_ID_MPEG2VIDEO; } #endif + hb_list_add( job->list_work, ( w = hb_get_work( vcodec ) ) ); w->codec_param = title->video_codec_param; w->fifo_in = job->fifo_mpeg2; @@ -1554,7 +1575,6 @@ static inline void copy_chapter( hb_buffer_t * dst, hb_buffer_t * src ) if( src && dst && src->s.start == dst->s.start) { // restore log below to debug chapter mark propagation problems - //hb_log("work %s: Copying Chapter Break @ %"PRId64, w->name, src->s.start); dst->s.new_chap = src->s.new_chap; } } |