diff options
-rw-r--r-- | contrib/x264/A01-clang-avx.patch | 594 |
1 files changed, 594 insertions, 0 deletions
diff --git a/contrib/x264/A01-clang-avx.patch b/contrib/x264/A01-clang-avx.patch new file mode 100644 index 000000000..2fd23f3b4 --- /dev/null +++ b/contrib/x264/A01-clang-avx.patch @@ -0,0 +1,594 @@ +From 7737e6ad4acf1058aeb0f9802e2a3ca1e0a30d29 Mon Sep 17 00:00:00 2001 +From: Henrik Gramner <[email protected]> +Date: Sat, 2 Jun 2018 20:35:10 +0200 +Subject: [PATCH 1/1] Fix clang stack alignment issues + +Clang emits aligned AVX stores for things like zeroing stack-allocated +variables when using -mavx even with -fno-tree-vectorize set which can +result in crashes if this occurs before we've realigned the stack. + +Previously we only ensured that the stack was realigned before calling +assembly functions that accesses stack-allocated buffers but this is +not sufficient. Fix the issue by changing the stack realignment to +instead occur immediately in all CLI, API and thread entry points. +--- + common/base.c | 60 +++++++++++++++++++++++++++++++------- + common/threadpool.c | 9 ++++-- + common/x86/cpu-a.asm | 80 ++++++++++++++++++++++++++++----------------------- + encoder/api.c | 29 +++++++++++-------- + encoder/encoder.c | 8 +++--- + encoder/lookahead.c | 15 ++++++---- + encoder/ratecontrol.c | 2 +- + tools/checkasm.c | 6 +++- + x264.c | 7 ++++- + 9 files changed, 144 insertions(+), 72 deletions(-) + +diff --git a/common/base.c b/common/base.c +index a07d9c6b..3befe73d 100644 +--- a/common/base.c ++++ b/common/base.c +@@ -196,7 +196,7 @@ error: + /**************************************************************************** + * x264_picture_init: + ****************************************************************************/ +-void x264_picture_init( x264_picture_t *pic ) ++static void picture_init( x264_picture_t *pic ) + { + memset( pic, 0, sizeof( x264_picture_t ) ); + pic->i_type = X264_TYPE_AUTO; +@@ -204,10 +204,15 @@ void x264_picture_init( x264_picture_t *pic ) + pic->i_pic_struct = PIC_STRUCT_AUTO; + } + ++void x264_picture_init( x264_picture_t *pic ) ++{ ++ x264_stack_align( picture_init, pic ); ++} ++ + /**************************************************************************** + * x264_picture_alloc: + ****************************************************************************/ +-int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height ) ++static int picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height ) + { + typedef struct + { +@@ -237,7 +242,7 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh + int csp = i_csp & X264_CSP_MASK; + if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX || csp == X264_CSP_V210 ) + return -1; +- x264_picture_init( pic ); ++ picture_init( pic ); + pic->img.i_csp = i_csp; + pic->img.i_plane = csp_tab[csp].planes; + int depth_factor = i_csp & X264_CSP_HIGH_DEPTH ? 2 : 1; +@@ -259,10 +264,15 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh + return 0; + } + ++int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height ) ++{ ++ return x264_stack_align( picture_alloc, pic, i_csp, i_width, i_height ); ++} ++ + /**************************************************************************** + * x264_picture_clean: + ****************************************************************************/ +-void x264_picture_clean( x264_picture_t *pic ) ++static void picture_clean( x264_picture_t *pic ) + { + x264_free( pic->img.plane[0] ); + +@@ -270,10 +280,15 @@ void x264_picture_clean( x264_picture_t *pic ) + memset( pic, 0, sizeof( x264_picture_t ) ); + } + ++void x264_picture_clean( x264_picture_t *pic ) ++{ ++ x264_stack_align( picture_clean, pic ); ++} ++ + /**************************************************************************** + * x264_param_default: + ****************************************************************************/ +-void x264_param_default( x264_param_t *param ) ++static void param_default( x264_param_t *param ) + { + /* */ + memset( param, 0, sizeof( x264_param_t ) ); +@@ -416,6 +431,11 @@ void x264_param_default( x264_param_t *param ) + param->psz_clbin_file = NULL; + } + ++void x264_param_default( x264_param_t *param ) ++{ ++ x264_stack_align( param_default, param ); ++} ++ + static int param_apply_preset( x264_param_t *param, const char *preset ) + { + char *end; +@@ -643,9 +663,9 @@ static int param_apply_tune( x264_param_t *param, const char *tune ) + return 0; + } + +-int x264_param_default_preset( x264_param_t *param, const char *preset, const char *tune ) ++static int param_default_preset( x264_param_t *param, const char *preset, const char *tune ) + { +- x264_param_default( param ); ++ param_default( param ); + + if( preset && param_apply_preset( param, preset ) < 0 ) + return -1; +@@ -654,7 +674,12 @@ int x264_param_default_preset( x264_param_t *param, const char *preset, const ch + return 0; + } + +-void x264_param_apply_fastfirstpass( x264_param_t *param ) ++int x264_param_default_preset( x264_param_t *param, const char *preset, const char *tune ) ++{ ++ return x264_stack_align( param_default_preset, param, preset, tune ); ++} ++ ++static void param_apply_fastfirstpass( x264_param_t *param ) + { + /* Set faster options in case of turbo firstpass. */ + if( param->rc.b_stat_write && !param->rc.b_stat_read ) +@@ -669,6 +694,11 @@ void x264_param_apply_fastfirstpass( x264_param_t *param ) + } + } + ++void x264_param_apply_fastfirstpass( x264_param_t *param ) ++{ ++ x264_stack_align( param_apply_fastfirstpass, param ); ++} ++ + static int profile_string_to_int( const char *str ) + { + if( !strcasecmp( str, "baseline" ) ) +@@ -686,7 +716,7 @@ static int profile_string_to_int( const char *str ) + return -1; + } + +-int x264_param_apply_profile( x264_param_t *param, const char *profile ) ++static int param_apply_profile( x264_param_t *param, const char *profile ) + { + if( !profile ) + return 0; +@@ -748,6 +778,11 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile ) + return 0; + } + ++int x264_param_apply_profile( x264_param_t *param, const char *profile ) ++{ ++ return x264_stack_align( param_apply_profile, param, profile ); ++} ++ + static int parse_enum( const char *arg, const char * const *names, int *dst ) + { + for( int i = 0; names[i]; i++ ) +@@ -809,7 +844,7 @@ static double atof_internal( const char *str, int *b_error ) + #define atoi(str) atoi_internal( str, &b_error ) + #define atof(str) atof_internal( str, &b_error ) + +-int x264_param_parse( x264_param_t *p, const char *name, const char *value ) ++static int param_parse( x264_param_t *p, const char *name, const char *value ) + { + char *name_buf = NULL; + int b_error = 0; +@@ -1308,6 +1343,11 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) + return b_error ? errortype : 0; + } + ++int x264_param_parse( x264_param_t *param, const char *name, const char *value ) ++{ ++ return x264_stack_align( param_parse, param, name, value ); ++} ++ + /**************************************************************************** + * x264_param2string: + ****************************************************************************/ +diff --git a/common/threadpool.c b/common/threadpool.c +index 5a71feb1..7f98f778 100644 +--- a/common/threadpool.c ++++ b/common/threadpool.c +@@ -47,7 +47,7 @@ struct x264_threadpool_t + x264_sync_frame_list_t done; /* list of jobs that have finished processing */ + }; + +-static void *threadpool_thread( x264_threadpool_t *pool ) ++static void *threadpool_thread_internal( x264_threadpool_t *pool ) + { + if( pool->init_func ) + pool->init_func( pool->init_arg ); +@@ -66,12 +66,17 @@ static void *threadpool_thread( x264_threadpool_t *pool ) + x264_pthread_mutex_unlock( &pool->run.mutex ); + if( !job ) + continue; +- job->ret = (void*)x264_stack_align( job->func, job->arg ); /* execute the function */ ++ job->ret = job->func( job->arg ); + x264_sync_frame_list_push( &pool->done, (void*)job ); + } + return NULL; + } + ++static void *threadpool_thread( x264_threadpool_t *pool ) ++{ ++ return (void*)x264_stack_align( threadpool_thread_internal, pool ); ++} ++ + int x264_threadpool_init( x264_threadpool_t **p_pool, int threads, + void (*init_func)(void *), void *init_arg ) + { +diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm +index ad42c26d..d94f7d54 100644 +--- a/common/x86/cpu-a.asm ++++ b/common/x86/cpu-a.asm +@@ -64,23 +64,42 @@ cglobal cpu_xgetbv + %endif + ret + ++;----------------------------------------------------------------------------- ++; void cpu_emms( void ) ++;----------------------------------------------------------------------------- ++cglobal cpu_emms ++ emms ++ ret ++ ++;----------------------------------------------------------------------------- ++; void cpu_sfence( void ) ++;----------------------------------------------------------------------------- ++cglobal cpu_sfence ++ sfence ++ ret ++ + %if ARCH_X86_64 + + ;----------------------------------------------------------------------------- +-; void stack_align( void (*func)(void*), void *arg ); ++; intptr_t stack_align( void (*func)(void*), ... ); (up to 5 args) + ;----------------------------------------------------------------------------- + cglobal stack_align +- push rbp +- mov rbp, rsp ++ mov rax, r0mp ++ mov r0, r1mp ++ mov r1, r2mp ++ mov r2, r3mp ++ mov r3, r4mp ++ mov r4, r5mp ++ push rbp ++ mov rbp, rsp ++%if WIN64 ++ sub rsp, 40 ; shadow space + r4 ++%endif ++ and rsp, ~(STACK_ALIGNMENT-1) + %if WIN64 +- sub rsp, 32 ; shadow space ++ mov [rsp+32], r4 + %endif +- and rsp, ~(STACK_ALIGNMENT-1) +- mov rax, r0 +- mov r0, r1 +- mov r1, r2 +- mov r2, r3 +- call rax ++ call rax + leave + ret + +@@ -113,33 +132,22 @@ cglobal cpu_cpuid_test + ret + + cglobal stack_align +- push ebp +- mov ebp, esp +- sub esp, 12 +- and esp, ~(STACK_ALIGNMENT-1) +- mov ecx, [ebp+8] +- mov edx, [ebp+12] +- mov [esp], edx +- mov edx, [ebp+16] +- mov [esp+4], edx +- mov edx, [ebp+20] +- mov [esp+8], edx +- call ecx ++ push ebp ++ mov ebp, esp ++ sub esp, 20 ++ and esp, ~(STACK_ALIGNMENT-1) ++ mov r0, [ebp+12] ++ mov r1, [ebp+16] ++ mov r2, [ebp+20] ++ mov [esp+ 0], r0 ++ mov [esp+ 4], r1 ++ mov [esp+ 8], r2 ++ mov r0, [ebp+24] ++ mov r1, [ebp+28] ++ mov [esp+12], r0 ++ mov [esp+16], r1 ++ call [ebp+ 8] + leave + ret + + %endif +- +-;----------------------------------------------------------------------------- +-; void cpu_emms( void ) +-;----------------------------------------------------------------------------- +-cglobal cpu_emms +- emms +- ret +- +-;----------------------------------------------------------------------------- +-; void cpu_sfence( void ) +-;----------------------------------------------------------------------------- +-cglobal cpu_sfence +- sfence +- ret +diff --git a/encoder/api.c b/encoder/api.c +index e247f3e4..b97612b7 100644 +--- a/encoder/api.c ++++ b/encoder/api.c +@@ -73,7 +73,7 @@ typedef struct x264_api_t + int (*encoder_invalidate_reference)( x264_t *, int64_t pts ); + } x264_api_t; + +-x264_t *x264_encoder_open( x264_param_t *param ) ++static x264_api_t *encoder_open( x264_param_t *param ) + { + x264_api_t *api = calloc( 1, sizeof( x264_api_t ) ); + if( !api ) +@@ -118,15 +118,20 @@ x264_t *x264_encoder_open( x264_param_t *param ) + return NULL; + } + ++ return api; ++} ++ ++x264_t *x264_encoder_open( x264_param_t *param ) ++{ + /* x264_t is opaque */ +- return (x264_t *)api; ++ return (x264_t *)x264_stack_align( encoder_open, param ); + } + + void x264_encoder_close( x264_t *h ) + { + x264_api_t *api = (x264_api_t *)h; + +- api->encoder_close( api->x264 ); ++ x264_stack_align( api->encoder_close, api->x264 ); + free( api ); + } + +@@ -134,61 +139,61 @@ void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal ) + { + x264_api_t *api = (x264_api_t *)h; + +- api->nal_encode( api->x264, dst, nal ); ++ x264_stack_align( api->nal_encode, api->x264, dst, nal ); + } + + int x264_encoder_reconfig( x264_t *h, x264_param_t *param) + { + x264_api_t *api = (x264_api_t *)h; + +- return api->encoder_reconfig( api->x264, param ); ++ return x264_stack_align( api->encoder_reconfig, api->x264, param ); + } + + void x264_encoder_parameters( x264_t *h, x264_param_t *param ) + { + x264_api_t *api = (x264_api_t *)h; + +- api->encoder_parameters( api->x264, param ); ++ x264_stack_align( api->encoder_parameters, api->x264, param ); + } + + int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal ) + { + x264_api_t *api = (x264_api_t *)h; + +- return api->encoder_headers( api->x264, pp_nal, pi_nal ); ++ return x264_stack_align( api->encoder_headers, api->x264, pp_nal, pi_nal ); + } + + int x264_encoder_encode( x264_t *h, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out ) + { + x264_api_t *api = (x264_api_t *)h; + +- return api->encoder_encode( api->x264, pp_nal, pi_nal, pic_in, pic_out ); ++ return x264_stack_align( api->encoder_encode, api->x264, pp_nal, pi_nal, pic_in, pic_out ); + } + + int x264_encoder_delayed_frames( x264_t *h ) + { + x264_api_t *api = (x264_api_t *)h; + +- return api->encoder_delayed_frames( api->x264 ); ++ return x264_stack_align( api->encoder_delayed_frames, api->x264 ); + } + + int x264_encoder_maximum_delayed_frames( x264_t *h ) + { + x264_api_t *api = (x264_api_t *)h; + +- return api->encoder_maximum_delayed_frames( api->x264 ); ++ return x264_stack_align( api->encoder_maximum_delayed_frames, api->x264 ); + } + + void x264_encoder_intra_refresh( x264_t *h ) + { + x264_api_t *api = (x264_api_t *)h; + +- api->encoder_intra_refresh( api->x264 ); ++ x264_stack_align( api->encoder_intra_refresh, api->x264 ); + } + + int x264_encoder_invalidate_reference( x264_t *h, int64_t pts ) + { + x264_api_t *api = (x264_api_t *)h; + +- return api->encoder_invalidate_reference( api->x264, pts ); ++ return x264_stack_align( api->encoder_invalidate_reference, api->x264, pts ); + } +diff --git a/encoder/encoder.c b/encoder/encoder.c +index 243a87a5..286b112b 100644 +--- a/encoder/encoder.c ++++ b/encoder/encoder.c +@@ -1564,7 +1564,7 @@ x264_t *x264_encoder_open( x264_param_t *param ) + if( h->param.b_cabac ) + x264_cabac_init( h ); + else +- x264_stack_align( x264_cavlc_init, h ); ++ x264_cavlc_init( h ); + + mbcmp_init( h ); + chroma_dsp_init( h ); +@@ -3087,7 +3087,7 @@ static void *slices_write( x264_t *h ) + } + } + h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb ); +- if( x264_stack_align( slice_write, h ) ) ++ if( slice_write( h ) ) + goto fail; + h->sh.i_first_mb = h->sh.i_last_mb + 1; + // if i_first_mb is not the last mb in a row then go to the next mb in MBAFF order +@@ -3122,7 +3122,7 @@ static int threaded_slices_write( x264_t *h ) + t->sh.i_last_mb = t->i_threadslice_end * h->mb.i_mb_width - 1; + } + +- x264_stack_align( x264_analyse_weight_frame, h, h->mb.i_mb_height*16 + 16 ); ++ x264_analyse_weight_frame( h, h->mb.i_mb_height*16 + 16 ); + + x264_threads_distribute_ratecontrol( h ); + +@@ -3300,7 +3300,7 @@ int x264_encoder_encode( x264_t *h, + return -1; + } + else +- x264_stack_align( x264_adaptive_quant_frame, h, fenc, pic_in->prop.quant_offsets ); ++ x264_adaptive_quant_frame( h, fenc, pic_in->prop.quant_offsets ); + + if( pic_in->prop.quant_offsets_free ) + pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets ); +diff --git a/encoder/lookahead.c b/encoder/lookahead.c +index da8e6c2e..5c948cfb 100644 +--- a/encoder/lookahead.c ++++ b/encoder/lookahead.c +@@ -67,7 +67,7 @@ static void lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb ) + #if HAVE_THREAD + static void lookahead_slicetype_decide( x264_t *h ) + { +- x264_stack_align( x264_slicetype_decide, h ); ++ x264_slicetype_decide( h ); + + lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); + int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; +@@ -82,12 +82,12 @@ static void lookahead_slicetype_decide( x264_t *h ) + + /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ + if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) +- x264_stack_align( x264_slicetype_analyse, h, shift_frames ); ++ x264_slicetype_analyse( h, shift_frames ); + + x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); + } + +-static void *lookahead_thread( x264_t *h ) ++static void *lookahead_thread_internal( x264_t *h ) + { + while( !h->lookahead->b_exit_thread ) + { +@@ -121,6 +121,11 @@ static void *lookahead_thread( x264_t *h ) + x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); + return NULL; + } ++ ++static void *lookahead_thread( x264_t *h ) ++{ ++ return (void*)x264_stack_align( lookahead_thread_internal, h ); ++} + #endif + + int x264_lookahead_init( x264_t *h, int i_slicetype_length ) +@@ -230,14 +235,14 @@ void x264_lookahead_get_frames( x264_t *h ) + if( h->frames.current[0] || !h->lookahead->next.i_size ) + return; + +- x264_stack_align( x264_slicetype_decide, h ); ++ x264_slicetype_decide( h ); + lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); + int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; + lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); + + /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ + if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) +- x264_stack_align( x264_slicetype_analyse, h, shift_frames ); ++ x264_slicetype_analyse( h, shift_frames ); + + lookahead_encoder_shift( h ); + } +diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c +index 85548f0b..b7f0ee07 100644 +--- a/encoder/ratecontrol.c ++++ b/encoder/ratecontrol.c +@@ -574,7 +574,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offs + rc->mbtree.qpbuf_pos--; + } + else +- x264_stack_align( x264_adaptive_quant_frame, h, frame, quant_offsets ); ++ x264_adaptive_quant_frame( h, frame, quant_offsets ); + return 0; + fail: + x264_log( h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n" ); +diff --git a/tools/checkasm.c b/tools/checkasm.c +index 440e1d23..5f1e275f 100644 +--- a/tools/checkasm.c ++++ b/tools/checkasm.c +@@ -2913,7 +2913,7 @@ static int check_all_flags( void ) + return ret; + } + +-int main(int argc, char *argv[]) ++static int main_internal( int argc, char **argv ) + { + #ifdef _WIN32 + /* Disable the Windows Error Reporting dialog */ +@@ -2973,3 +2973,7 @@ int main(int argc, char *argv[]) + return 0; + } + ++int main( int argc, char **argv ) ++{ ++ return x264_stack_align( main_internal, argc, argv ); ++} +diff --git a/x264.c b/x264.c +index b02ba49a..83bc9660 100644 +--- a/x264.c ++++ b/x264.c +@@ -351,7 +351,7 @@ static void print_version_info( void ) + #endif + } + +-int main( int argc, char **argv ) ++static int main_internal( int argc, char **argv ) + { + x264_param_t param; + cli_opt_t opt = {0}; +@@ -403,6 +403,11 @@ int main( int argc, char **argv ) + return ret; + } + ++int main( int argc, char **argv ) ++{ ++ return x264_stack_align( main_internal, argc, argv ); ++} ++ + static char const *strtable_lookup( const char * const table[], int idx ) + { + int i = 0; while( table[i] ) i++; +-- +2.11.0 + |