/* openclwrapper.c

   Copyright (c) 2003-2012 HandBrake Team
   This file is part of the HandBrake source code
   Homepage: <http://handbrake.fr/>.
   It may be used under the terms of the GNU General Public License v2.
   For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html

   Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
            Li   Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
 */
 
#ifdef USE_OPENCL

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "openclwrapper.h"
#include "openclkernels.h"

//#define USE_EXTERNAL_KERNEL
#ifdef SYS_MINGW
#include <windows.h>
#endif

#if defined(__APPLE__)
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#if defined(_MSC_VER)
#define strcasecmp strcmpi
#endif

#define MAX_KERNEL_STRING_LEN 64
#define MAX_CLFILE_NUM 50
#define MAX_CLKERNEL_NUM 200
#define MAX_CLFILE_PATH 255
#define MAX_KERNEL_NUM  50
#define MAX_KERNEL_NAME_LEN 64

#ifndef INVALID_HANDLE_VALUE
#define INVALID_HANDLE_VALUE NULL
#endif

//#define THREAD_PRIORITY_TIME_CRITICAL 15

/* GPU vendor classification recorded in GPUEnv.vendor. */
enum VENDOR
{
    AMD    = 0,
    Intel  = 1,
    NVIDIA = 2,
    others = 3
};
/*
 * OpenCL state used by the wrapper functions in this file.  A single static
 * instance (gpu_env) is defined below and most functions operate on it.
 */
typedef struct _GPUEnv
{
    // state shared by all modules in the hb library
    cl_platform_id platform;          // platform chosen in hb_init_opencl_env or supplied via hb_init_opencl_attr
    cl_device_type dType;             // device type used when the wrapper creates the context itself
    cl_context context;
    cl_device_id * devices;           // device list of the context; malloc'ed by hb_init_opencl_env
    cl_device_id dev;                 // device supplied via hb_init_opencl_attr; used for builds when isUserCreated is set
    cl_command_queue command_queue;
    cl_kernel kernels[MAX_CLFILE_NUM];   // read by hb_get_kernel_env_and_func; never assigned in the visible code
    cl_program programs[MAX_CLFILE_NUM]; //one program object maps one kernel source file
    char kernelSrcFile[MAX_CLFILE_NUM][256]; // file name recorded per program; each entry is a 256-byte buffer
    int file_count; // number of program slots in use; incremented by hb_compile_kernel_file

    char kernel_names[MAX_CLKERNEL_NUM][MAX_KERNEL_STRING_LEN+1]; // registered kernel names (filled by ADD_KERNEL_CFG)
    cl_kernel_function kernel_functions[MAX_CLKERNEL_NUM];        // host callbacks set by hb_register_kernel_wrapper
    int kernel_count;                 // number of valid entries in kernel_names / kernel_functions
    int isUserCreated; // 1: created , 0:no create and needed to create by opencl wrapper
    enum VENDOR vendor;               // set by hb_init_opencl_env to AMD or others, from the platform vendor string
}GPUEnv;

/*
 * One entry of the gKernels table: a kernel name and an optional pointer to
 * its source string (ADD_KERNEL_CFG stores whatever pointer it is given; the
 * registrations in this file pass NULL).
 */
typedef struct
{
    char kernelName[MAX_KERNEL_NAME_LEN+1]; // NUL-terminated kernel name
    char * kernelStr;                       // kernel source text, or NULL
}hb_kernel_node;

/* File-scope state; static storage, so everything starts out zeroed. */
static GPUEnv gpu_env;                          // the wrapper's OpenCL environment
static int isInited = 0;                        // set to 1 by hb_init_opencl_run_env, cleared by hb_release_opencl_env
static int useBuffers = 0;                      // set to 1 by hb_init_opencl_run_env; reported by hb_use_buffers
static hb_kernel_node gKernels[MAX_KERNEL_NUM]; // kernel table filled by ADD_KERNEL_CFG

/*
 * ADD_KERNEL_CFG( idx, s, p )
 *
 * Registers kernel name `s` at index `idx`: copies the name into both
 * gKernels[idx].kernelName and gpu_env.kernel_names[idx], stores `p` as the
 * kernel source pointer, and increments gpu_env.kernel_count.
 *
 * The copies use strcpy without a length check, so `s` must fit in
 * MAX_KERNEL_NAME_LEN / MAX_KERNEL_STRING_LEN characters, and `idx` must be
 * within both tables.  The macro expands to a brace block; the call sites in
 * this file use it without a trailing semicolon.
 */
#define ADD_KERNEL_CFG( idx, s, p ){\
        strcpy( gKernels[idx].kernelName, s );\
        gKernels[idx].kernelStr = p;\
        strcpy( gpu_env.kernel_names[idx], s );\
        gpu_env.kernel_count++; }


/**
 * hb_confirm_gpu_type
 */
int hb_confirm_gpu_type()
{
    int status = 1;
    unsigned int i, j;
    cl_uint numPlatforms = 0; 
    status = clGetPlatformIDs(0,NULL,&numPlatforms); 
    if(status != 0) 
    { 
        goto end; 
    } 
    if(numPlatforms > 0) 
    { 
        cl_platform_id* platforms = (cl_platform_id* )malloc (numPlatforms * sizeof(cl_platform_id)); 
        status = clGetPlatformIDs (numPlatforms, platforms, NULL); 
        if (status != 0) 
        { 
            goto end; 
        } 
        for (i=0; i < numPlatforms; i++)
        { 
            char pbuff[100];
            cl_uint numDevices;
            status = clGetPlatformInfo( platforms[i], 
                                        CL_PLATFORM_VENDOR, 
                                        sizeof (pbuff), 
                                        pbuff,
                                        NULL); 
            if (status)
                continue;
            status = clGetDeviceIDs( platforms[i], 
                                     CL_DEVICE_TYPE_GPU , 
                                     0 , 
                                     NULL , 
                                     &numDevices); 
            
            cl_device_id *devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
            status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
            for (j = 0; j < numDevices; j++)
            {
                char dbuff[100];
                status = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(dbuff), dbuff, NULL); 
                if (!strcmp(dbuff, "Advanced Micro Devices, Inc.") ||
                    !strcmp(dbuff, "Intel(R) Corporation")         ||
#ifdef __APPLE__
                    !strcmp(dbuff, "AMD")                          ||
                    /* MacBook Pro, AMD ATI Radeon HD 6750M, OS X 10.8.3 */
                    !strcmp(dbuff, "NVIDIA")                       ||
                    /* MacBook Pro, NVIDIA GeForce GT 330M,  OS X 10.7.4 */
#endif
                    !strcmp(dbuff, "NVIDIA Corporation"))
                {
                    return 0;
                }
            }

            if ( status != CL_SUCCESS )
                continue;
            if( numDevices ) 
                break; 
        } 
        free( platforms ); 
    } 
    end:
    return -1;
}

/**
 * hb_regist_opencl_kernel
 */
int hb_regist_opencl_kernel()
{
    //if( !gpu_env.isUserCreated )
    //    memset( &gpu_env, 0, sizeof(gpu_env) );
    //Comment for posterity: When in doubt just zero out a structure full of pointers to allocated resources.

    gpu_env.file_count = 0; //argc;
    gpu_env.kernel_count = 0UL;

    ADD_KERNEL_CFG( 0, "frame_scale", NULL )
    ADD_KERNEL_CFG( 1, "yadif_filter", NULL )

    return 0;
}

/**
 * hb_convert_to_string
 *
 * Reads the whole file into a freshly malloc'ed, NUL-terminated buffer.
 *
 * @param filename - path of the file to read
 * @param source   - receives the buffer on success (the caller frees it);
 *                   set to NULL on failure
 * @param gpu_info - unused
 * @param idx      - unused
 * @return 1 on success, 0 on any failure (open, seek, allocation, short read)
 */
int hb_convert_to_string( const char *filename, char **source, GPUEnv *gpu_info, int idx )
{
    long file_size;
    size_t bytes_read;
    char *buffer = NULL;
    FILE *file;
    int ok = 0;

    (void)gpu_info;
    (void)idx;

    if( filename == NULL || source == NULL )
        return(0);
    *source = NULL;

    /* Read-only access is enough; "rb+" would fail on read-only files. */
    file = fopen( filename, "rb" );
    if( file == NULL )
        return(0);

    if( fseek( file, 0, SEEK_END ) != 0 )
        goto done;
    file_size = ftell( file );
    if( file_size < 0 || fseek( file, 0, SEEK_SET ) != 0 )
        goto done;

    buffer = (char*)malloc( (size_t)file_size + 1 );
    if( buffer == NULL )
        goto done;

    bytes_read = fread( buffer, 1, (size_t)file_size, file );
    if( bytes_read != (size_t)file_size )
        goto done;

    buffer[file_size] = '\0';
    *source = buffer;
    buffer = NULL;      /* ownership handed to the caller */
    ok = 1;

done:
    free( buffer );
    fclose( file );     /* closed on every path, including failures */
    return(ok);
}

/**
 * hb_binary_generated
 *
 * Looks for a cached program binary named "./<base> - <device name>.bin"
 * for the devices of the context, where <base> is cl_file_name up to its
 * first ".cl" (or the whole name when it contains none).  As in the original
 * implementation, the result reflects the last device of the context; file
 * handles opened for earlier devices are closed rather than leaked.
 *
 * @param context      - context whose devices are inspected
 * @param cl_file_name - kernel source file name
 * @param fhandle      - receives the open binary file ("rb") when one is
 *                       found; left untouched otherwise.  The caller closes it.
 * @return 1 when a binary was found and opened, 0 otherwise
 */
int hb_binary_generated( cl_context context, const char * cl_file_name, FILE ** fhandle )
{
    cl_uint i;
    cl_int status;
    cl_uint numDevices = 0;
    cl_device_id *devices;
    const char *ext;
    size_t baseLen;
    char cl_name[128];
    FILE * fd = NULL;

    status = clGetContextInfo( context,
                               CL_CONTEXT_NUM_DEVICES,
                               sizeof(numDevices),
                               &numDevices,
                               NULL );
    if( status != CL_SUCCESS )
    {
        hb_log( "OpenCL: Get context info failed" );
        return 0;
    }

    devices = numDevices ? (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices ) : NULL;
    if( devices == NULL )
    {
        hb_log( "OpenCL: No device found" );
        return 0;
    }

    /* grab the handles to all of the devices in the context. */
    status = clGetContextInfo( context,
                               CL_CONTEXT_DEVICES,
                               sizeof(cl_device_id) * numDevices,
                               devices,
                               NULL );
    if( status != CL_SUCCESS )
    {
        hb_log( "OpenCL: Get context info failed" );
        free( devices );
        return 0;
    }

    /* Base name: everything before ".cl", clamped to the local buffer. */
    ext = strstr( cl_file_name, ".cl" );
    baseLen = ext ? (size_t)(ext - cl_file_name) : strlen( cl_file_name );
    if( baseLen >= sizeof(cl_name) )
        baseLen = sizeof(cl_name) - 1;
    memcpy( cl_name, cl_file_name, baseLen );
    cl_name[baseLen] = '\0';

    /* probe the binary file of each device. */
    for( i = 0; i < numDevices; i++ )
    {
        char fileName[256];
        char deviceName[1024];
        int written;

        if( devices[i] == NULL )
            continue;

        /* Drop the handle of a previous device before probing this one. */
        if( fd != NULL )
        {
            fclose( fd );
            fd = NULL;
        }

        status = clGetDeviceInfo( devices[i],
                                  CL_DEVICE_NAME,
                                  sizeof(deviceName),
                                  deviceName,
                                  NULL );
        if( status != CL_SUCCESS )
            continue;

        written = snprintf( fileName, sizeof(fileName), "./%s - %s.bin",
                            cl_name, deviceName );
        if( written < 0 || (size_t)written >= sizeof(fileName) )
            continue;   /* a truncated name cannot match a real file */

        fd = fopen( fileName, "rb" );
    }

    free( devices );

    if( fd != NULL )
        *fhandle = fd;

    return fd != NULL;
}

/**
 * hb_write_binary_to_file
 *
 * Writes numBytes bytes from birary to fileName, replacing any existing
 * file.
 *
 * @param fileName - destination path
 * @param birary   - data to write (may be NULL only when numBytes is 0)
 * @param numBytes - number of bytes to write
 * @return 1 when every byte was written and the file closed cleanly,
 *         0 otherwise
 */
int hb_write_binary_to_file( const char* fileName, const char* birary, size_t numBytes )
{
    FILE *output;
    int ok = 1;

    if( fileName == NULL || ( birary == NULL && numBytes != 0 ) )
        return 0;

    output = fopen( fileName, "wb" );
    if( output == NULL )
        return 0;

    if( numBytes != 0 &&
        fwrite( birary, sizeof(char), numBytes, output ) != numBytes )
        ok = 0;

    /* fclose flushes buffered data, so a full disk may only show up here. */
    if( fclose( output ) != 0 )
        ok = 0;

    return ok;
}

/**
 * hb_generat_bin_from_kernel_source
 *
 * Dumps the compiled binary of each device of the program into
 * "./<base> - <device name>.bin", where <base> is cl_file_name up to its
 * first ".cl" (or the whole name when it contains none).
 *
 * @param program      - built program whose binaries are written out
 * @param cl_file_name - kernel source file name used to derive the output names
 * @return 1 on success, 0 on any failure; temporary allocations are released
 *         on every path
 */
int hb_generat_bin_from_kernel_source( cl_program program, const char * cl_file_name )
{
    cl_uint i;
    cl_int status;
    cl_uint numDevices = 0;
    size_t *binarySizes = NULL;
    cl_device_id *devices = NULL;
    char **binaries = NULL;
    const char *ext;
    size_t baseLen;
    char cl_name[128];
    int result = 0;

    status = clGetProgramInfo( program,
                               CL_PROGRAM_NUM_DEVICES,
                               sizeof(numDevices),
                               &numDevices,
                               NULL );
    if( status != CL_SUCCESS )
    {
        hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_NUM_DEVICES failed");
        return 0;
    }

    devices = numDevices ? (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices ) : NULL;
    if( devices == NULL )
    {
        hb_log("OpenCL: hb_generat_bin_from_kernel_source: no device found");
        return 0;
    }

    /* grab the handles to all of the devices in the program. */
    status = clGetProgramInfo( program,
                               CL_PROGRAM_DEVICES,
                               sizeof(cl_device_id) * numDevices,
                               devices,
                               NULL );
    if( status != CL_SUCCESS )
    {
        hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_DEVICES failed");
        goto cleanup;
    }

    /* figure out the sizes of each of the binaries. */
    binarySizes = (size_t*)malloc( sizeof(size_t) * numDevices );
    if( binarySizes == NULL )
    {
        hb_log("OpenCL: hb_generat_bin_from_kernel_source: malloc for binaries failed");
        goto cleanup;
    }

    status = clGetProgramInfo( program,
                               CL_PROGRAM_BINARY_SIZES,
                               sizeof(size_t) * numDevices,
                               binarySizes, NULL );
    if( status != CL_SUCCESS )
    {
        hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_BINARY_SIZES failed");
        goto cleanup;
    }

    /* copy over all of the generated binaries.  calloc leaves every slot
     * NULL so the cleanup code can free a partially filled table. */
    binaries = (char**)calloc( numDevices, sizeof(char *) );
    if( binaries == NULL )
    {
        hb_log("OpenCL: hb_generat_bin_from_kernel_source: malloc for binaries failed");
        goto cleanup;
    }

    for( i = 0; i < numDevices; i++ )
    {
        if( binarySizes[i] == 0 )
            continue;

        binaries[i] = (char*)malloc( sizeof(char) * binarySizes[i] );
        if( binaries[i] == NULL )
        {
            hb_log("OpenCL: hb_generat_bin_from_kernel_source: malloc for binaries[%d] failed", (int)i);
            goto cleanup;
        }
    }

    status = clGetProgramInfo( program,
                               CL_PROGRAM_BINARIES,
                               sizeof(char *) * numDevices,
                               binaries,
                               NULL );
    if( status != CL_SUCCESS )
    {
        hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_BINARIES failed");
        goto cleanup;
    }

    /* Base name: everything before ".cl", clamped to the local buffer. */
    ext = strstr( cl_file_name, ".cl" );
    baseLen = ext ? (size_t)(ext - cl_file_name) : strlen( cl_file_name );
    if( baseLen >= sizeof(cl_name) )
        baseLen = sizeof(cl_name) - 1;
    memcpy( cl_name, cl_file_name, baseLen );
    cl_name[baseLen] = '\0';

    /* dump out each binary into its own separate file. */
    for( i = 0; i < numDevices; i++ )
    {
        char fileName[256];
        char deviceName[1024];
        int written;

        if( binarySizes[i] == 0 )
            continue;

        status = clGetDeviceInfo( devices[i],
                                  CL_DEVICE_NAME,
                                  sizeof(deviceName),
                                  deviceName,
                                  NULL );
        if( status != CL_SUCCESS )
        {
            hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetDeviceInfo for CL_DEVICE_NAME failed");
            goto cleanup;
        }

        written = snprintf( fileName, sizeof(fileName), "./%s - %s.bin",
                            cl_name, deviceName );
        if( written < 0 || (size_t)written >= sizeof(fileName) ||
            !hb_write_binary_to_file( fileName, binaries[i], binarySizes[i] ) )
        {
            hb_log("OpenCL: hb_generat_bin_from_kernel_source: unable to write kernel, writing to temporary directory instead.");
            goto cleanup;
        }
    }

    result = 1;

cleanup:
    /* Release all resources and memory, on success and failure alike. */
    if( binaries != NULL )
    {
        for( i = 0; i < numDevices; i++ )
            free( binaries[i] );
        free( binaries );
    }
    free( binarySizes );
    free( devices );
    return result;
}


/**
 * hb_init_opencl_attr
 *
 * Adopts a caller-supplied OpenCL environment (context, platform, device
 * and command queue) into gpu_env and marks it as user created.  Does
 * nothing when an environment has already been adopted.
 *
 * @param env - environment to adopt
 * @return 1 when an environment was already adopted, 0 when env was adopted now
 */
int hb_init_opencl_attr( OpenCLEnv * env )
{
    const int already_adopted = gpu_env.isUserCreated ? 1 : 0;

    if( !already_adopted )
    {
        gpu_env.isUserCreated = 1;
        gpu_env.command_queue = env->command_queue;
        gpu_env.dev           = env->devices;
        gpu_env.platform      = env->platform;
        gpu_env.context       = env->context;
    }

    return already_adopted;
}

/**
 * hb_create_kernel
 *
 * Creates the named kernel from the first compiled program and fills env
 * with the kernel together with the shared context and command queue.
 *
 * @param kernelname - name of the kernel function inside the program
 * @param env        - receives kernel, context and command queue
 * @return 0 on success, 1 when clCreateKernel failed
 */
int hb_create_kernel( char * kernelname, KernelEnv * env )
{
    int err;

    env->kernel = clCreateKernel( gpu_env.programs[0], kernelname, &err );
    env->command_queue = gpu_env.command_queue;
    env->context = gpu_env.context;

    if( err == CL_SUCCESS )
        return 0;
    return 1;
}

/**
 * hb_release_kernel
 *
 * Releases the kernel object held by env.
 *
 * @param env - environment whose kernel is released
 * @return 0 on success, 1 when clReleaseKernel reported an error
 */
int hb_release_kernel( KernelEnv * env )
{
    if( clReleaseKernel( env->kernel ) == CL_SUCCESS )
        return 0;
    return 1;
}

/**
 * hb_init_opencl_env
 *
 * One-time initialisation of the OpenCL environment.  Unless the
 * environment was supplied by the user (gpu_info->isUserCreated), this picks
 * the first platform that exposes a GPU device, creates a context (GPU
 * first, then CPU, then the default device type), fetches the device list of
 * the context and creates a command queue on its first device.  If the
 * queue exposes an AMD thread handle, the thread priority is raised on
 * MinGW builds.
 *
 * Only the first call does any work; later calls return the outcome of that
 * first call, so a failed initialisation is no longer reported as success.
 *
 * @param gpu_info - environment to initialise
 * @return 0 on success, 1 on failure
 */

static int init_once = 0;
int hb_init_opencl_env( GPUEnv *gpu_info )
{
    /* Outcome of the one-time initialisation, replayed to later callers. */
    static int init_result = 1;
    size_t length;
    cl_int status = CL_SUCCESS;
    cl_uint numPlatforms, numDevices;
    cl_platform_id *platforms;
    cl_context_properties cps[3];
    char platformName[100];
    unsigned int i;
    void *handle = INVALID_HANDLE_VALUE;

    if( init_once != 0 )
        return init_result;
    init_once = 1;

    /*
     * Have a look at the available platforms.
     */
    if( !gpu_info->isUserCreated )
    {
        status = clGetPlatformIDs( 0, NULL, &numPlatforms );
        if( status != CL_SUCCESS )
        {
            hb_log( "OpenCL: OpenCL device platform not found." );
            return(1);
        }

        gpu_info->platform = NULL;
        if( 0 < numPlatforms )
        {
            platforms = (cl_platform_id*)malloc(
                numPlatforms * sizeof(cl_platform_id));
            if( platforms == (cl_platform_id*)NULL )
            {
                return(1);
            }
            status = clGetPlatformIDs( numPlatforms, platforms, NULL );

            if( status != CL_SUCCESS )
            {
                hb_log( "OpenCL: Specific opencl platform not found." );
                free( platforms );
                return(1);
            }

            for( i = 0; i < numPlatforms; i++ )
            {
                status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
                                            sizeof(platformName), platformName,
                                            NULL );

                if( status != CL_SUCCESS )
                {
                    continue;
                }
                gpu_info->platform = platforms[i];

                if (!strcmp(platformName, "Advanced Micro Devices, Inc.") ||
                    !strcmp(platformName, "AMD"))
                    gpu_info->vendor = AMD;
                else
                    gpu_info->vendor = others;

                status = clGetDeviceIDs( gpu_info->platform /* platform */,
                                         CL_DEVICE_TYPE_GPU /* device_type */,
                                         0 /* num_entries */,
                                         NULL /* devices */,
                                         &numDevices );

                if( status != CL_SUCCESS )
                {
                    continue;
                }

                /* first platform with at least one GPU wins */
                if( numDevices )
                    break;
            }
            free( platforms );
        }

        if( NULL == gpu_info->platform )
        {
            hb_log( "OpenCL: No OpenCL-compatible GPU found." );
            return(1);
        }

        /* status still holds the result of the last platform query */
        if( status != CL_SUCCESS )
        {
            hb_log( "OpenCL: No OpenCL-compatible GPU found." );
            return(1);
        }

        /*
         * Use available platform.
         */
        cps[0] = CL_CONTEXT_PLATFORM;
        cps[1] = (cl_context_properties)gpu_info->platform;
        cps[2] = 0;
        /* Check for GPU. */
        gpu_info->dType = CL_DEVICE_TYPE_GPU;
        gpu_info->context = clCreateContextFromType(
            cps, gpu_info->dType, NULL, NULL, &status );

        if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )
        {
            gpu_info->dType = CL_DEVICE_TYPE_CPU;
            gpu_info->context = clCreateContextFromType(
                cps, gpu_info->dType, NULL, NULL, &status );
        }

        if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )
        {
            gpu_info->dType = CL_DEVICE_TYPE_DEFAULT;
            gpu_info->context = clCreateContextFromType(
                cps, gpu_info->dType, NULL, NULL, &status );
        }

        if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )
        {
            hb_log( "OpenCL: Unable to create opencl context." );
            return(1);
        }

        /* Detect OpenCL devices. */
        /* First, get the size of device list data */
        status = clGetContextInfo( gpu_info->context, CL_CONTEXT_DEVICES,
                                   0, NULL, &length );
        if((status != CL_SUCCESS) || (length == 0))
        {
            hb_log( "OpenCL: Unable to get the list of devices in context." );
            goto fail_context;
        }

        /* Now allocate memory for device list based on the size we got earlier */
        gpu_info->devices = (cl_device_id*)malloc( length );
        if( gpu_info->devices == (cl_device_id*)NULL )
        {
            goto fail_context;
        }

        /* Now, get the device list data */
        status = clGetContextInfo( gpu_info->context, CL_CONTEXT_DEVICES, length,
                                   gpu_info->devices, NULL );
        if( status != CL_SUCCESS )
        {
            hb_log( "OpenCL: Unable to get the device list data in context." );
            goto fail_devices;
        }

        /* Create OpenCL command queue. */
        gpu_info->command_queue = clCreateCommandQueue( gpu_info->context,
                                                        gpu_info->devices[0],
                                                        0, &status );
        if( status != CL_SUCCESS )
        {
            hb_log( "OpenCL: Unable to create opencl command queue." );
            goto fail_devices;
        }
    }

    if( clGetCommandQueueInfo( gpu_info->command_queue,
                               CL_QUEUE_THREAD_HANDLE_AMD, sizeof(handle),
                               &handle, NULL ) == CL_SUCCESS && handle != INVALID_HANDLE_VALUE )
    {
#ifdef SYS_MINGW 
        SetThreadPriority( handle, THREAD_PRIORITY_TIME_CRITICAL );
#endif
    }

    init_result = 0;
    return 0;

    /* Failure after the context was created: undo what was set up so far. */
fail_devices:
    free( gpu_info->devices );
    gpu_info->devices = NULL;
fail_context:
    clReleaseContext( gpu_info->context );
    gpu_info->context = NULL;
    return(1);
}


/**
 * hb_release_opencl_env
 *
 * Releases the programs, command queue, context and device list held by the
 * global gpu_env, then resets the bookkeeping (program count, the
 * initialisation flags) so the environment can be initialised again later.
 * Does nothing when the run environment was never initialised.
 *
 * @param gpu_info - environment whose isUserCreated flag is cleared
 * @return always 1
 */
int hb_release_opencl_env( GPUEnv *gpu_info )
{
    int i;

    if( !isInited )
        return 1;

    for( i = 0; i<gpu_env.file_count; i++ )
    {
        /* A stray ';' after this condition used to make the release
         * unconditional, even for empty slots. */
        if( gpu_env.programs[i] )
        {
            clReleaseProgram( gpu_env.programs[i] );
            gpu_env.programs[i] = NULL;
        }
    }
    /* All program slots are free again. */
    gpu_env.file_count = 0;

    if( gpu_env.command_queue )
    {
        clReleaseCommandQueue( gpu_env.command_queue );
        gpu_env.command_queue = NULL;
    }

    if( gpu_env.context )
    {
        clReleaseContext( gpu_env.context );
        gpu_env.context = NULL;
    }

    /* Device list malloc'ed by hb_init_opencl_env (NULL when never allocated). */
    free( gpu_env.devices );
    gpu_env.devices = NULL;

    isInited = 0;
    /* Let hb_init_opencl_env build a fresh environment on the next call
     * instead of reporting success for the one just released. */
    init_once = 0;
    gpu_info->isUserCreated = 0;
    return 1;
}


/**
 * hb_register_kernel_wrapper
 *
 * Attaches a host-side wrapper function to the first registered kernel
 * whose name matches kernel_name, ignoring case.
 *
 * @param kernel_name - name of a kernel registered in gpu_env
 * @param function    - wrapper to store for that kernel
 * @return 1 when the kernel was found and the wrapper stored, 0 otherwise
 */
int hb_register_kernel_wrapper( const char *kernel_name, cl_kernel_function function )
{
    int slot = 0;

    while( slot < gpu_env.kernel_count )
    {
        if( strcasecmp( kernel_name, gpu_env.kernel_names[slot] ) != 0 )
        {
            slot++;
            continue;
        }

        gpu_env.kernel_functions[slot] = function;
        return(1);
    }

    return(0);
}

/**
 * hb_cached_of_kerner_prg
 *
 * Tells whether a program for the given kernel source file has already been
 * recorded in the environment.  File names are compared ignoring case.
 *
 * @param gpu_env      - environment whose program table is searched
 * @param cl_file_name - kernel source file name to look for
 * @return 1 when a matching entry with a non-NULL program exists, 0 otherwise
 */
int hb_cached_of_kerner_prg( const GPUEnv *gpu_env, const char * cl_file_name )
{
    int slot;
    int found = 0;

    for( slot = 0; !found && slot < gpu_env->file_count; slot++ )
    {
        found = strcasecmp( gpu_env->kernelSrcFile[slot], cl_file_name ) == 0 &&
                gpu_env->programs[slot] != NULL;
    }

    return(found ? 1 : 0);
}

/**
 * hb_compile_kernel_file
 *
 * Creates and builds the OpenCL program for a kernel file and records it in
 * the next free program slot of gpu_info.  The program is created from a
 * cached binary when hb_binary_generated finds one, otherwise from source
 * (the built-in kernel strings, or the file itself when
 * USE_EXTERNAL_KERNEL is defined).  On a build failure the build log is
 * written to "kernel-build.log".
 *
 * All temporary buffers, the binary file handle and a program that failed
 * to build are released before returning.
 *
 * @param filename     - kernel file name, also used as the cache key
 * @param gpu_info     - environment receiving the program
 * @param indx         - unused
 * @param build_option - options passed to clBuildProgram
 * @return 1 on success (or when the file was already compiled), 0 on failure
 */
int hb_compile_kernel_file( const char *filename, GPUEnv *gpu_info,
                            int indx, const char *build_option )
{
    cl_int status = CL_SUCCESS;
    size_t length = 0;
    char *source_str = NULL;
    const char *source;
    size_t source_size[1];
    char *buildLog = NULL;
    char *binary = NULL;
    size_t *lengths = NULL;
    const unsigned char **binaries = NULL;
    cl_int *binary_status = NULL;
    cl_uint numDevices = 0;
    cl_uint d;
    cl_device_id *devices = NULL;
    cl_device_id build_device;
    cl_program program = NULL;
    FILE * fd = NULL;
    FILE * fd1;
    long binary_len;
    int idx;
    int result = 0;

    (void)indx;

    if( hb_cached_of_kerner_prg( gpu_info, filename ) == 1 )
        return (1);

    /* The slot and its file-name buffer are fixed-size arrays. */
    idx = gpu_info->file_count;
    if( idx < 0 || idx >= MAX_CLFILE_NUM ||
        strlen( filename ) >= sizeof(gpu_info->kernelSrcFile[0]) )
    {
        hb_log( "OpenCL: hb_compile_kernel_file: no free program slot or file name too long." );
        return 0;
    }

#ifdef USE_EXTERNAL_KERNEL
    status = hb_convert_to_string( filename, &source_str, gpu_info, idx );
    if( status == 0 )
        return(0);
#else
    {
        size_t kernel_src_size = strlen(kernel_src_scale) + strlen(kernel_src_yadif_filter);

        source_str = (char*)malloc( kernel_src_size + 1 );
        if( source_str == NULL )
            return(0);
        strcpy( source_str, kernel_src_scale );
        strcat( source_str, kernel_src_yadif_filter );
    }
#endif

    source = source_str;
    source_size[0] = strlen( source );

    if( hb_binary_generated( gpu_info->context, filename, &fd ) == 1 )
    {
        if( fd == NULL )
            goto cleanup;

        status = clGetContextInfo(gpu_info->context,
                                  CL_CONTEXT_NUM_DEVICES,
                                  sizeof(numDevices),
                                  &numDevices,
                                  NULL);
        if( status != CL_SUCCESS || numDevices == 0 )
        {
            hb_log("OpenCL: Unable to get the number of devices in context.");
            goto cleanup;
        }

        /* clCreateProgramWithBinary reads one length and one binary, and
         * writes one status, PER DEVICE, so each array needs numDevices
         * entries. */
        devices       = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );
        lengths       = (size_t*)malloc( sizeof(size_t) * numDevices );
        binaries      = (const unsigned char**)malloc( sizeof(*binaries) * numDevices );
        binary_status = (cl_int*)malloc( sizeof(cl_int) * numDevices );
        if( devices == NULL || lengths == NULL ||
            binaries == NULL || binary_status == NULL )
            goto cleanup;

        /* ftell reports failure as -1, so test it as a signed value. */
        if( fseek( fd, 0, SEEK_END ) < 0 )
            goto cleanup;
        binary_len = ftell( fd );
        if( binary_len <= 0 || fseek( fd, 0, SEEK_SET ) < 0 )
            goto cleanup;
        length = (size_t)binary_len;

        binary = (char*)calloc( length + 2, sizeof(char) );
        if( binary == NULL )
            goto cleanup;

        if( fread( binary, 1, length, fd ) != length )
            goto cleanup;

        /* grab the handles to all of the devices in the context. */
        status = clGetContextInfo(gpu_info->context,
                                  CL_CONTEXT_DEVICES,
                                  sizeof(cl_device_id) * numDevices,
                                  devices,
                                  NULL);
        if( status != CL_SUCCESS )
        {
            hb_log( "OpenCL: Unable to get the device list data in context." );
            goto cleanup;
        }

        /* The same cached binary is offered to every device. */
        for( d = 0; d < numDevices; d++ )
        {
            lengths[d]  = length;
            binaries[d] = (const unsigned char*)binary;
        }

        program = clCreateProgramWithBinary( gpu_info->context,
                                             numDevices,
                                             devices,
                                             lengths,
                                             binaries,
                                             binary_status,
                                             &status );
    }
    else
    {
        /* create a CL program using the kernel source */
        program = clCreateProgramWithSource(
            gpu_info->context, 1, &source, source_size, &status );
    }

    if( (program == (cl_program)NULL) || (status != CL_SUCCESS) )
    {
        hb_log( "OpenCL: Unable to create the program object." );
        goto cleanup;
    }

    /* create a cl program executable for the device in use */
    build_device = gpu_info->isUserCreated ? gpu_info->dev : gpu_info->devices[0];
    status = clBuildProgram( program, 1, &build_device,
                             build_option, NULL, NULL );

    if( status != CL_SUCCESS )
    {
        status = clGetProgramBuildInfo( program, build_device,
                                        CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
        if( status != CL_SUCCESS )
        {
            hb_log( "OpenCL: Unable to get GPU build information." );
            goto cleanup;
        }

        buildLog = length ? (char*)malloc( length ) : NULL;
        if( buildLog == (char*)NULL )
            goto cleanup;

        status = clGetProgramBuildInfo( program, build_device,
                                        CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
        if( status == CL_SUCCESS )
        {
            fd1 = fopen( "kernel-build.log", "w+" );
            if( fd1 != NULL ) {
                fwrite( buildLog, sizeof(char), length, fd1 );
                fclose( fd1 );
            }
        }
        goto cleanup;
    }

    /* Success: publish the program.  Writing a fresh binary cache
     * (hb_generat_bin_from_kernel_source) is intentionally not done here. */
    gpu_info->programs[idx] = program;
    program = NULL;
    strcpy( gpu_info->kernelSrcFile[idx], filename );
    gpu_info->file_count += 1;
    result = 1;

cleanup:
    if( !result )
    {
        if( program != NULL )
            clReleaseProgram( program );
        gpu_info->programs[idx] = NULL;
    }
    if( fd != NULL )
        fclose( fd );
    free( buildLog );
    free( binary_status );
    free( binaries );
    free( lengths );
    free( devices );
    free( binary );
    free( source_str );
    return(result);
}


/**
 * hb_get_kernel_env_and_func
 *
 * Looks up a registered kernel by name (ignoring case) and, when found,
 * fills env with the shared context, command queue, first program, the
 * kernel slot and the AMD flag, and returns the registered wrapper through
 * function.
 *
 * @param kernel_name - name of the kernel to look up
 * @param env         - receives the execution environment on success
 * @param function    - receives the wrapper registered for the kernel
 * @return 1 when the kernel was found, 0 otherwise (outputs untouched)
 */
int hb_get_kernel_env_and_func( const char *kernel_name,
                                KernelEnv *env,
                                cl_kernel_function *function )
{
    int slot;
    int match = -1;

    for( slot = 0; slot < gpu_env.kernel_count; slot++ )
    {
        if( strcasecmp( kernel_name, gpu_env.kernel_names[slot] ) == 0 )
        {
            match = slot;
            break;
        }
    }

    if( match < 0 )
        return(0);

    env->context = gpu_env.context;
    env->command_queue = gpu_env.command_queue;
    env->program = gpu_env.programs[0];
    env->kernel = gpu_env.kernels[match];
    if( gpu_env.vendor == AMD )
        env->isAMD = 1;
    else
        env->isAMD = 0;
    *function = gpu_env.kernel_functions[match];
    return(1);
}

/**
 * hb_run_kernel
 *
 * Looks up the wrapper registered for kernel_name and calls it with
 * userdata and a KernelEnv describing the shared OpenCL state.
 *
 * @param kernel_name - name of a registered kernel
 * @param userdata    - opaque argument array handed to the wrapper
 * @return the wrapper's return value, or 0 when the kernel is unknown or
 *         has no wrapper registered
 */
int hb_run_kernel( const char *kernel_name, void **userdata )
{
    KernelEnv env;
    cl_kernel_function function = NULL;

    if( kernel_name == NULL )
        return(0);

    memset( &env, 0, sizeof(KernelEnv));
    if( hb_get_kernel_env_and_func( kernel_name, &env, &function ) != 1 )
        return(0);

    /* A kernel that was registered by name only has no wrapper to call. */
    if( function == NULL )
        return(0);

    /* Bounded copy; assumes env.kernel_name is a char array, which the
     * original strcpy into the zeroed struct already required. */
    snprintf( env.kernel_name, sizeof(env.kernel_name), "%s", kernel_name );

    return(function( userdata, &env ));
}

/**
 * hb_init_opencl_run_env
 *
 * Brings up the complete OpenCL run environment on first use: registers the
 * built-in kernels, initialises the OpenCL environment and compiles the
 * kernel program.  Later calls are no-ops once initialisation succeeded.
 *
 * @param argc         - must lie in [0, MAX_CLFILE_NUM]
 * @param argv         - unused
 * @param build_option - options passed to the kernel compiler
 * @return 0 on success (or when already initialised), 1 on failure
 */
int hb_init_opencl_run_env( int argc, char **argv, const char *build_option )
{
    int rc;

    if( MAX_CLKERNEL_NUM <= 0 || argc < 0 || argc > MAX_CLFILE_NUM )
        return 1;

    if( isInited )
        return(0);

    hb_regist_opencl_kernel();

    /* devices, context, command queue */
    rc = hb_init_opencl_env( &gpu_env );
    if( rc != 0 )
        return(1);

    /* program, kernel names, kernel count */
    rc = hb_compile_kernel_file("hb-opencl-kernels.cl",
                                &gpu_env, 0, build_option);
    if( rc == 0 || gpu_env.kernel_count == 0 )
        return(1);

    useBuffers = 1;
    isInited = 1;
    return(0);
}

/**
 * hb_release_opencl_run_env
 */
int hb_release_opencl_run_env()
{
    return hb_release_opencl_env( &gpu_env );
}

/**
 * hb_opencl_stats
 *
 * @return the current value of the isInited flag (non-zero once the run
 *         environment has been initialised)
 */
int hb_opencl_stats()
{
    const int initialised = isInited;

    return initialised;
}

/**
 * hb_get_opencl_env
 *
 * Initialises the OpenCL environment (if that has not happened yet) and
 * logs vendor, name and driver version of every device in the context.
 *
 * @return 0 on success, 1 on failure (initialisation, a context query or
 *         an allocation failed)
 */
int hb_get_opencl_env()
{
    cl_uint i;
    cl_int status;
    cl_uint numDevices = 0;
    cl_device_id *devices;

    /*initialize devices, context, comand_queue*/
    status = hb_init_opencl_env( &gpu_env );
    if( status )
        return(1);

    status = clGetContextInfo( gpu_env.context,
                               CL_CONTEXT_NUM_DEVICES,
                               sizeof(numDevices),
                               &numDevices,
                               NULL );
    if( status != CL_SUCCESS || numDevices == 0 )
        return(1);

    devices = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );
    if( devices == NULL )
        return(1);

    /* grab the handles to all of the devices in the context. */
    status = clGetContextInfo( gpu_env.context,
                               CL_CONTEXT_DEVICES,
                               sizeof(cl_device_id) * numDevices,
                               devices,
                               NULL );
    if( status != CL_SUCCESS )
    {
        /* The list was not filled in; do not walk it. */
        free( devices );
        return(1);
    }

    for( i = 0; i < numDevices; i++ )
    {
        /* Start with empty strings so a failed query logs nothing bogus. */
        char deviceVendor[100] = "", deviceName[1024] = "", driverVersion[1024] = "";

        if( devices[i] == NULL )
            continue;

        clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(deviceVendor),
                        deviceVendor, NULL);
        clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(deviceName),
                        deviceName, NULL);
        clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(driverVersion),
                        driverVersion, NULL);
        hb_log("hb_get_opencl_env: GPU #%d, Device Vendor:  %s", (int)i + 1, deviceVendor);
        hb_log("hb_get_opencl_env: GPU #%d, Device Name:    %s", (int)i + 1, deviceName);
        hb_log("hb_get_opencl_env: GPU #%d, Driver Version: %s", (int)i + 1, driverVersion);
    }

    free( devices );
    return(0);
}

/**
 * hb_create_buffer
 *
 * Allocates an OpenCL buffer in the shared context.
 *
 * @param cl_Buf - receives the new buffer object
 * @param flags  - memory flags passed to clCreateBuffer
 * @param size   - buffer size in bytes
 * @return 1 on success, 0 on failure (the error code is logged)
 */
int hb_create_buffer( cl_mem *cl_Buf, int flags, int size )
{
    int err;

    *cl_Buf = clCreateBuffer( gpu_env.context, flags, size, NULL, &err );
    if( err == CL_SUCCESS )
        return 1;

    hb_log( "OpenCL: clCreateBuffer error '%d'", err );
    return 0;
}


/**
 * hb_read_opencl_buffer
 *
 * Performs a blocking read of the first `size` bytes of an OpenCL buffer
 * into host memory, using the shared command queue.
 *
 * @param cl_inBuf - source buffer object
 * @param outbuf   - destination host memory, at least `size` bytes
 * @param size     - number of bytes to read
 * @return 1 on success, 0 on failure (the error code is logged)
 */
int hb_read_opencl_buffer( cl_mem cl_inBuf, unsigned char *outbuf, int size )
{
    int err = clEnqueueReadBuffer( gpu_env.command_queue, cl_inBuf, CL_TRUE,
                                   0, size, outbuf, 0, 0, 0 );

    if( err == CL_SUCCESS )
        return 1;

    hb_log( "OpenCL: av_read_opencl_buffer error '%d'", err );
    return 0;
}

/**
 * hb_cl_create_mapped_buffer
 *
 * Creates a CL_MEM_ALLOC_HOST_PTR buffer in the shared context and maps it
 * for reading and writing (blocking map).
 *
 * @param mem  - receives the buffer object; NULL on failure
 * @param addr - receives the mapped host address; NULL on failure
 * @param size - buffer size in bytes
 * @return 1 on success, 0 on failure.  On failure no buffer is left
 *         allocated.
 */
int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size)
{
    cl_int status;
    int flags = CL_MEM_ALLOC_HOST_PTR;

    *addr = NULL;
    *mem = clCreateBuffer(gpu_env.context, flags, size, NULL, &status);
    if (status != CL_SUCCESS || *mem == NULL)
    {
        /* Do not try to map a buffer that was never created. */
        hb_log("hb_cl_create_mapped_buffer: clCreateBuffer error %d", status);
        *mem = NULL;
        return 0;
    }

    *addr = clEnqueueMapBuffer(gpu_env.command_queue, *mem, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &status);
    if (status != CL_SUCCESS || *addr == NULL)
    {
        hb_log("hb_cl_create_mapped_buffer: clEnqueueMapBuffer error %d", status);
        /* The unmapped buffer is useless to the caller; do not leak it. */
        clReleaseMemObject(*mem);
        *mem = NULL;
        *addr = NULL;
        return 0;
    }

    return 1;
}

/**
 * hb_cl_free_mapped_buffer
 *
 * Unmaps a host mapping obtained for the buffer and waits for the unmap to
 * complete.  The buffer object itself is not released here.
 *
 * @param mem  - buffer object the mapping belongs to
 * @param addr - mapped host address to unmap
 * @return 1 on success, 0 on failure (the error code is logged)
 */
int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr)
{
    cl_event event;
    int status = clEnqueueUnmapMemObject(gpu_env.command_queue, mem, addr, 0, NULL, &event);
    if (status == CL_SUCCESS)
    {
        clWaitForEvents(1, &event);
        /* The event is owned by the caller of the enqueue; release it. */
        clReleaseEvent(event);
    }
    else
        hb_log("hb_free_mapped_buffer: error %d", status);
    return (status == CL_SUCCESS) ? 1 : 0;
}

/* Initialises the OpenCL environment and logs its devices; the outcome of
 * hb_get_opencl_env is deliberately ignored. */
void hb_opencl_init()
{
    (void)hb_get_opencl_env();
}

/* Returns the useBuffers flag, which hb_init_opencl_run_env sets to 1 once
 * the run environment is ready. */
int hb_use_buffers()
{
    const int enabled = useBuffers;

    return enabled;
}

/**
 * hb_copy_buffer
 *
 * Enqueues a device-side copy of cb bytes between two OpenCL buffers on the
 * shared command queue.  The call does not wait for the copy to finish.
 *
 * @param src_buffer - source buffer object
 * @param dst_buffer - destination buffer object
 * @param src_offset - byte offset into the source
 * @param dst_offset - byte offset into the destination
 * @param cb         - number of bytes to copy
 * @return 1 when the copy was enqueued, 0 on failure (the error code is logged)
 */
int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb)
{
    int status = clEnqueueCopyBuffer(gpu_env.command_queue,
        src_buffer,
        dst_buffer,
        src_offset, dst_offset, cb,
        0, 0, 0);
    if( status != CL_SUCCESS )
    {
        /* Log through hb_log like the rest of this file, and name the
         * function that actually failed. */
        hb_log( "OpenCL: hb_copy_buffer error '%d'", status );
        return 0;
    }
    return 1;
}

/**
 * hb_read_opencl_frame_buffer
 *
 * Reads a planar frame out of an OpenCL buffer and splits it into its three
 * planes.  The buffer is read as (linesize0 + linesize1) * height bytes; the
 * Y plane takes linesize0 * height bytes and is followed by the U and V
 * planes of chrH = ceil(height / 2) rows each.
 *
 * @param cl_inBuf  - source buffer object
 * @param Ybuf      - destination of the Y plane
 * @param Ubuf      - destination of the U plane
 * @param Vbuf      - destination of the V plane
 * @param linesize0 - bytes per row of the Y plane
 * @param linesize1 - bytes per row of the U plane
 * @param linesize2 - bytes per row of the V plane
 * @param height    - number of luma rows
 * @return 1 on success, 0 when the allocation or the buffer read failed
 *         (the planes are left untouched in that case)
 */
int hb_read_opencl_frame_buffer(cl_mem cl_inBuf,unsigned char *Ybuf,unsigned char *Ubuf,unsigned char *Vbuf,int linesize0,int linesize1,int linesize2,int height)
{
    int chrH = -(-height >> 1);
    int lumaSize = linesize0 * height;
    int readSize = (linesize0 + linesize1) * height;
    int planeSize = lumaSize + (linesize1 + linesize2) * chrH;
    int allocSize = lumaSize + linesize1 * chrH * 2;
    int ok;
    unsigned char *temp;

    /* The staging buffer must cover both what is read from the device and
     * what is copied out below (these differ for odd heights or when
     * linesize2 != linesize1). */
    if (allocSize < readSize)
        allocSize = readSize;
    if (allocSize < planeSize)
        allocSize = planeSize;
    if (allocSize <= 0)
        return 0;

    temp = (unsigned char *)av_malloc(sizeof(uint8_t) * allocSize);
    if (temp == NULL)
        return 0;
    /* Bytes beyond the device read are copied out as zeros, not garbage. */
    memset(temp, 0, allocSize);

    ok = hb_read_opencl_buffer(cl_inBuf, temp, readSize);
    if (ok)
    {
        memcpy(Ybuf, temp, lumaSize);
        memcpy(Ubuf, temp + lumaSize, linesize1 * chrH);
        memcpy(Vbuf, temp + lumaSize + linesize1 * chrH, linesize2 * chrH);
    }
    av_free(temp);

    return ok ? 1 : 0;
}

/**
 * hb_write_opencl_frame_buffer
 *
 * Copies a planar frame into an OpenCL buffer through a blocking write
 * mapping.  The planes are stored back to back starting `offset` bytes into
 * the buffer: linesize0 * height bytes of Y, then linesize1 * height / 2
 * bytes of U, then linesize2 * height / 2 bytes of V.
 *
 * @param cl_inBuf  - destination buffer object
 * @param Ybuf      - source of the Y plane
 * @param Ubuf      - source of the U plane
 * @param Vbuf      - source of the V plane
 * @param linesize0 - bytes per row of the Y plane
 * @param linesize1 - bytes per row of the U plane
 * @param linesize2 - bytes per row of the V plane
 * @param height    - number of luma rows
 * @param offset    - byte offset of the frame inside the buffer
 * @return 1 on success, 0 when mapping or unmapping the buffer failed
 */
int hb_write_opencl_frame_buffer(cl_mem cl_inBuf,unsigned char *Ybuf,unsigned char *Ubuf,unsigned char *Vbuf,int linesize0,int linesize1,int linesize2,int height,int offset)
{
    cl_int status = CL_SUCCESS;
    uint8_t *temp;
    void *mapped = clEnqueueMapBuffer( gpu_env.command_queue, cl_inBuf,  CL_TRUE,CL_MAP_WRITE, 0, sizeof(uint8_t) * (linesize0  + linesize1)*height + offset, 0, NULL, NULL, &status );

    /* A failed mapping yields NULL; writing through it would crash. */
    if( mapped == NULL || status != CL_SUCCESS )
    {
        hb_log( "OpenCL: hb_write_opencl_frame_buffer: clEnqueueMapBuffer error '%d'", status );
        return 0;
    }

    temp = (uint8_t *)mapped;
    temp += offset;
    memcpy(temp,Ybuf,sizeof(uint8_t) * linesize0 * height);
    memcpy(temp + sizeof(uint8_t) * linesize0 * height,Ubuf,sizeof(uint8_t) * linesize1 * height/2);
    memcpy(temp + sizeof(uint8_t) * (linesize0 * height + linesize1 * height/2),Vbuf,sizeof(uint8_t) * linesize2 * height/2);

    status = clEnqueueUnmapMemObject(gpu_env.command_queue, cl_inBuf, mapped, 0, NULL, NULL );
    if( status != CL_SUCCESS )
    {
        hb_log( "OpenCL: hb_write_opencl_frame_buffer: clEnqueueUnmapMemObject error '%d'", status );
        return 0;
    }
    return 1;
}

/* Returns the command queue stored in the global OpenCL environment. */
cl_command_queue hb_get_command_queue()
{
    cl_command_queue shared_queue = gpu_env.command_queue;

    return shared_queue;
}

/* Returns the context stored in the global OpenCL environment. */
cl_context hb_get_context()
{
    cl_context shared_context = gpu_env.context;

    return shared_context;
}
#endif