From 5dc8e67078be8b8c42a809311debd275ac7d64a7 Mon Sep 17 00:00:00 2001 From: José Fonseca Date: Tue, 30 Sep 2008 01:12:52 +0900 Subject: tgsi: SSE2 optimized exp2, log2 and pow implementations. Special care must be taken when calling compiler generated SSE2 functions from the runtime generated SSE2: saving the xmm registers, and notify gcc the stack is not 16byte aligned. It would be more efficient to keep the stack pointer 16byte aligned, but too hairy, and not consistent in all x86 architectures. This has been tested in linux x86 and windows x86 userspace. Not tested on x86-64 because it is broken for other reasons (even without this change). --- src/gallium/auxiliary/tgsi/tgsi_sse2.c | 287 ++++++++++++++++++++++++--------- 1 file changed, 211 insertions(+), 76 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 4681b29f52b..4fdad3a5c78 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -28,6 +28,7 @@ #include "pipe/p_debug.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" +#include "util/u_sse.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi_exec.h" @@ -480,10 +481,31 @@ emit_coef_dady( * Function call helpers. */ +/** + * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be + * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee + * that the stack pointer is 16 byte aligned, as expected. + */ static void -emit_push_gp( - struct x86_function *func ) +emit_func_call_dst( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst, + void (PIPE_CDECL *code)() ) { + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + unsigned i, n, xmm; + unsigned xmm_mask; + + /* Bitmask of the xmm registers to save */ + xmm_mask = (1 << xmm_save) - 1; + xmm_mask &= ~(1 << xmm_dst); + + sse_movaps( + func, + get_temp( TEMP_R0, 0 ), + make_xmm( xmm_dst ) ); + x86_push( func, x86_make_reg( file_REG32, reg_AX) ); @@ -493,12 +515,49 @@ emit_push_gp( x86_push( func, x86_make_reg( file_REG32, reg_DX) ); -} + + for(i = 0, n = 0; i < 8; ++i) + if(xmm_mask & (1 << i)) + ++n; + + x86_sub_imm( + func, + x86_make_reg( file_REG32, reg_SP ), + n*16); + + for(i = 0, n = 0; i < 8; ++i) + if(xmm_mask & (1 << i)) { + sse_movups( + func, + x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ), + make_xmm( xmm ) ); + ++n; + } + + x86_lea( + func, + ecx, + get_temp( TEMP_R0, 0 ) ); + + x86_push( func, ecx ); + x86_mov_reg_imm( func, ecx, (unsigned long) code ); + x86_call( func, ecx ); + x86_pop(func, ecx ); + + for(i = 0, n = 0; i < 8; ++i) + if(xmm_mask & (1 << i)) { + sse_movups( + func, + make_xmm( xmm ), + x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) ); + ++n; + } + + x86_add_imm( + func, + x86_make_reg( file_REG32, reg_SP ), + n*16); -static void -x86_pop_gp( - struct x86_function *func ) -{ /* Restore GP registers in a reverse order. */ x86_pop( @@ -510,39 +569,6 @@ x86_pop_gp( x86_pop( func, x86_make_reg( file_REG32, reg_AX) ); -} - -static void -emit_func_call_dst( - struct x86_function *func, - unsigned xmm_dst, - void (PIPE_CDECL *code)() ) -{ - sse_movaps( - func, - get_temp( TEMP_R0, 0 ), - make_xmm( xmm_dst ) ); - - emit_push_gp( - func ); - - { - struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - - x86_lea( - func, - ecx, - get_temp( TEMP_R0, 0 ) ); - - x86_push( func, ecx ); - x86_mov_reg_imm( func, ecx, (unsigned long) code ); - x86_call( func, ecx ); - x86_pop(func, ecx ); - } - - - x86_pop_gp( - func ); sse_movaps( func, @@ -553,6 +579,7 @@ emit_func_call_dst( static void emit_func_call_dst_src( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst, unsigned xmm_src, void (PIPE_CDECL *code)() ) @@ -564,10 +591,111 @@ emit_func_call_dst_src( emit_func_call_dst( func, + xmm_save, xmm_dst, code ); } +/* + * Fast SSE2 implementation of special math functions. + */ + +#define POLY0(x, c0) _mm_set1_ps(c0) +#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) +#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) +#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) + +#define EXP_POLY_DEGREE 3 +#define LOG_POLY_DEGREE 5 + +/** + * See http://www.devmaster.net/forums/showthread.php?p=43580 + */ +static INLINE __m128 +exp2f4(__m128 x) +{ + __m128i ipart; + __m128 fpart, expipart, expfpart; + + x = _mm_min_ps(x, _mm_set1_ps( 129.00000f)); + x = _mm_max_ps(x, _mm_set1_ps(-126.99999f)); + + /* ipart = int(x - 0.5) */ + ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); + + /* fpart = x - ipart */ + fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); + + /* expipart = (float) (1 << ipart) */ + expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); + + /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */ +#if EXP_POLY_DEGREE == 5 + expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); +#elif EXP_POLY_DEGREE == 4 + expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f); +#elif EXP_POLY_DEGREE == 3 + expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f); +#elif EXP_POLY_DEGREE == 2 + expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f); +#else +#error +#endif + + return _mm_mul_ps(expipart, expfpart); +} + +/** + * See http://www.devmaster.net/forums/showthread.php?p=43580 + */ +static INLINE __m128 +log2f4(__m128 x) +{ + __m128i expmask = _mm_set1_epi32(0x7f800000); + __m128i mantmask = _mm_set1_epi32(0x007fffff); + __m128 one = _mm_set1_ps(1.0f); + + __m128i i = _mm_castps_si128(x); + + /* exp = (float) exponent(x) */ + __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127))); + + /* mant = (float) mantissa(x) */ + __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one); + + __m128 logmant; + + /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ + * These coefficients can be generate with + * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html + */ +#if LOG_POLY_DEGREE == 6 + logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f); +#elif LOG_POLY_DEGREE == 5 + logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +#elif LOG_POLY_DEGREE == 4 + logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +#elif LOG_POLY_DEGREE == 3 + logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +#else +#error +#endif + + /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ + logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one)); + + return _mm_add_ps(logmant, exp); +} + +static INLINE __m128 +powf4(__m128 x, __m128 y) +{ + return exp2f4(_mm_mul_ps(log2f4(x), y)); +} + + /** * Low-level instruction translators. */ @@ -610,38 +738,35 @@ cos4f( static void emit_cos( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst ) { emit_func_call_dst( func, + xmm_save, xmm_dst, cos4f ); } static void PIPE_CDECL +#if defined(PIPE_CC_GCC) +__attribute__((force_align_arg_pointer)) +#endif ex24f( float *store ) { -#if FAST_MATH - store[0] = util_fast_exp2( store[0] ); - store[1] = util_fast_exp2( store[1] ); - store[2] = util_fast_exp2( store[2] ); - store[3] = util_fast_exp2( store[3] ); -#else - store[0] = powf( 2.0f, store[0] ); - store[1] = powf( 2.0f, store[1] ); - store[2] = powf( 2.0f, store[2] ); - store[3] = powf( 2.0f, store[3] ); -#endif + _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) )); } static void emit_ex2( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst ) { emit_func_call_dst( func, + xmm_save, xmm_dst, ex24f ); } @@ -670,10 +795,12 @@ flr4f( static void emit_flr( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst ) { emit_func_call_dst( func, + xmm_save, xmm_dst, flr4f ); } @@ -691,31 +818,35 @@ frc4f( static void emit_frc( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst ) { emit_func_call_dst( func, + xmm_save, xmm_dst, frc4f ); } static void PIPE_CDECL +#if defined(PIPE_CC_GCC) +__attribute__((force_align_arg_pointer)) +#endif lg24f( float *store ) { - store[0] = util_fast_log2( store[0] ); - store[1] = util_fast_log2( store[1] ); - store[2] = util_fast_log2( store[2] ); - store[3] = util_fast_log2( store[3] ); + _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) )); } static void emit_lg2( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst ) { emit_func_call_dst( func, + xmm_save, xmm_dst, lg24f ); } @@ -757,14 +888,14 @@ emit_neg( } static void PIPE_CDECL +#if defined(PIPE_CC_GCC) +__attribute__((force_align_arg_pointer)) +#endif pow4f( float *store ) { -#if FAST_MATH - store[0] = util_fast_pow( store[0], store[4] ); - store[1] = util_fast_pow( store[1], store[5] ); - store[2] = util_fast_pow( store[2], store[6] ); - store[3] = util_fast_pow( store[3], store[7] ); +#if 1 + _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) )); #else store[0] = powf( store[0], store[4] ); store[1] = powf( store[1], store[5] ); @@ -776,11 +907,13 @@ pow4f( static void emit_pow( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst, unsigned xmm_src ) { emit_func_call_dst_src( func, + xmm_save, xmm_dst, xmm_src, pow4f ); @@ -873,10 +1006,12 @@ sin4f( static void emit_sin (struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst) { emit_func_call_dst( func, + xmm_save, xmm_dst, sin4f ); } @@ -1296,7 +1431,7 @@ emit_instruction( get_temp( TGSI_EXEC_TEMP_MINUS_128_I, TGSI_EXEC_TEMP_MINUS_128_C ) ); - emit_pow( func, 1, 2 ); + emit_pow( func, 3, 1, 2 ); FETCH( func, *inst, 0, 0, CHAN_X ); sse_xorps( func, @@ -1342,11 +1477,11 @@ emit_instruction( if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { emit_MOV( func, 1, 0 ); - emit_flr( func, 1 ); + emit_flr( func, 2, 1 ); /* dst.x = ex2(floor(src.x)) */ if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) { emit_MOV( func, 2, 1 ); - emit_ex2( func, 2 ); + emit_ex2( func, 3, 2 ); STORE( func, *inst, 2, 0, CHAN_X ); } /* dst.y = src.x - floor(src.x) */ @@ -1358,7 +1493,7 @@ emit_instruction( } /* dst.z = ex2(src.x) */ if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { - emit_ex2( func, 0 ); + emit_ex2( func, 3, 0 ); STORE( func, *inst, 0, 0, CHAN_Z ); } } @@ -1376,21 +1511,21 @@ emit_instruction( FETCH( func, *inst, 0, 0, CHAN_X ); emit_abs( func, 0 ); emit_MOV( func, 1, 0 ); - emit_lg2( func, 1 ); + emit_lg2( func, 2, 1 ); /* dst.z = lg2(abs(src.x)) */ if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { STORE( func, *inst, 1, 0, CHAN_Z ); } if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_flr( func, 1 ); + emit_flr( func, 2, 1 ); /* dst.x = floor(lg2(abs(src.x))) */ if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) { STORE( func, *inst, 1, 0, CHAN_X ); } /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */ if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_ex2( func, 1 ); + emit_ex2( func, 2, 1 ); emit_rcp( func, 1, 1 ); emit_mul( func, 0, 1 ); STORE( func, *inst, 0, 0, CHAN_Y ); @@ -1580,7 +1715,7 @@ emit_instruction( /* TGSI_OPCODE_FRC */ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( func, *inst, 0, 0, chan_index ); - emit_frc( func, 0 ); + emit_frc( func, 0, 0 ); STORE( func, *inst, 0, 0, chan_index ); } break; @@ -1593,7 +1728,7 @@ emit_instruction( /* TGSI_OPCODE_FLR */ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( func, *inst, 0, 0, chan_index ); - emit_flr( func, 0 ); + emit_flr( func, 0, 0 ); STORE( func, *inst, 0, 0, chan_index ); } break; @@ -1605,7 +1740,7 @@ emit_instruction( case TGSI_OPCODE_EXPBASE2: /* TGSI_OPCODE_EX2 */ FETCH( func, *inst, 0, 0, CHAN_X ); - emit_ex2( func, 0 ); + emit_ex2( func, 0, 0 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1614,7 +1749,7 @@ emit_instruction( case TGSI_OPCODE_LOGBASE2: /* TGSI_OPCODE_LG2 */ FETCH( func, *inst, 0, 0, CHAN_X ); - emit_lg2( func, 0 ); + emit_lg2( func, 0, 0 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1624,7 +1759,7 @@ emit_instruction( /* TGSI_OPCODE_POW */ FETCH( func, *inst, 0, 0, CHAN_X ); FETCH( func, *inst, 1, 1, CHAN_X ); - emit_pow( func, 0, 1 ); + emit_pow( func, 0, 0, 1 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1715,7 +1850,7 @@ emit_instruction( case TGSI_OPCODE_COS: FETCH( func, *inst, 0, 0, CHAN_X ); - emit_cos( func, 0 ); + emit_cos( func, 0, 0 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1774,7 +1909,7 @@ emit_instruction( case TGSI_OPCODE_SIN: FETCH( func, *inst, 0, 0, CHAN_X ); - emit_sin( func, 0 ); + emit_sin( func, 0, 0 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1868,12 +2003,12 @@ emit_instruction( case TGSI_OPCODE_SCS: IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { FETCH( func, *inst, 0, 0, CHAN_X ); - emit_cos( func, 0 ); + emit_cos( func, 0, 0 ); STORE( func, *inst, 0, 0, CHAN_X ); } IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { FETCH( func, *inst, 0, 0, CHAN_X ); - emit_sin( func, 0 ); + emit_sin( func, 0, 0 ); STORE( func, *inst, 0, 0, CHAN_Y ); } IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { -- cgit v1.2.3