diff options
-rw-r--r-- | src/gallium/auxiliary/draw/draw_vs_aos.c | 66
-rw-r--r-- | src/gallium/auxiliary/tgsi/tgsi_exec.c | 30
-rw-r--r-- | src/gallium/auxiliary/tgsi/tgsi_sse2.c | 19
3 files changed, 113 insertions, 2 deletions
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 441877d46f0..41bdd012d56 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -31,6 +31,7 @@ #include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" +#include "util/u_math.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_exec.h" @@ -43,6 +44,7 @@ #ifdef PIPE_ARCH_X86 #define DISASSEM 0 +#define FAST_MATH 1 static const char *files[] = { @@ -1380,14 +1382,28 @@ static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_inst return TRUE; } + + /* A wrapper for powf(). * Makes sure it is cdecl and operates on floats. */ static float PIPE_CDECL _powerf( float x, float y ) { +#if FAST_MATH + return util_fast_pow(x, y); +#else return powf( x, y ); +#endif } +#if FAST_MATH +static float PIPE_CDECL _exp2(float x) +{ + return util_fast_exp2(x); +} +#endif + + /* Really not sufficient -- need to check for conditions that could * generate inf/nan values, which will slow things down hugely. */ @@ -1442,6 +1458,48 @@ static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_inst } +#if FAST_MATH +static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + uint i; + + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } + + /* Push caller-save (ie scratch) regs. 
+ */ + x86_cdecl_caller_push_regs( cp->func ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) ); + + x87_fld_src( cp, &op->FullSrcRegisters[0], 0 ); + x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); + + /* tmp_EAX has been pushed & will be restored below */ + x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 ); + x86_call( cp->func, cp->tmp_EAX ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) ); + + x86_cdecl_caller_pop_regs( cp->func ); + + /* Note retval on x87 stack: + */ + cp->func->x87_stack++; + + x87_fstp_dest4( cp, &op->FullDstRegisters[0] ); + + return TRUE; +} +#endif + + static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); @@ -1662,7 +1720,9 @@ emit_instruction( struct aos_compilation *cp, return emit_RND(cp, inst); case TGSI_OPCODE_EXPBASE2: -#if 0 +#if FAST_MATH + return emit_EXPBASE2(cp, inst); +#elif 0 /* this seems to fail for "larger" exponents. * See glean tvertProg1's EX2 test. 
*/ @@ -1827,6 +1887,8 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, struct aos_compilation cp; unsigned fixup, label; + util_init_math(); + tgsi_parse_init( &parse, varient->base.vs->state.tokens ); memset(&cp, 0, sizeof(cp)); @@ -2135,4 +2197,4 @@ struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, -#endif +#endif /* PIPE_ARCH_X86 */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index 88a34a69613..e28b56c842f 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -57,6 +57,9 @@ #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi_exec.h" +#include "util/u_math.h" + +#define FAST_MATH 1 #define TILE_TOP_LEFT 0 #define TILE_TOP_RIGHT 1 @@ -145,6 +148,8 @@ tgsi_exec_machine_bind_shader( tgsi_dump(tokens, 0); #endif + util_init_math(); + mach->Tokens = tokens; mach->Samplers = samplers; @@ -448,10 +453,17 @@ micro_exp2( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src) { +#if FAST_MATH + dst->f[0] = util_fast_exp2( src->f[0] ); + dst->f[1] = util_fast_exp2( src->f[1] ); + dst->f[2] = util_fast_exp2( src->f[2] ); + dst->f[3] = util_fast_exp2( src->f[3] ); +#else dst->f[0] = powf( 2.0f, src->f[0] ); dst->f[1] = powf( 2.0f, src->f[1] ); dst->f[2] = powf( 2.0f, src->f[2] ); dst->f[3] = powf( 2.0f, src->f[3] ); +#endif } static void @@ -528,10 +540,17 @@ micro_lg2( union tgsi_exec_channel *dst, const union tgsi_exec_channel *src ) { +#if FAST_MATH + dst->f[0] = util_fast_log2( src->f[0] ); + dst->f[1] = util_fast_log2( src->f[1] ); + dst->f[2] = util_fast_log2( src->f[2] ); + dst->f[3] = util_fast_log2( src->f[3] ); +#else dst->f[0] = logf( src->f[0] ) * 1.442695f; dst->f[1] = logf( src->f[1] ) * 1.442695f; dst->f[2] = logf( src->f[2] ) * 1.442695f; dst->f[3] = logf( src->f[3] ) * 1.442695f; +#endif } static void @@ -796,10 +815,17 @@ micro_pow( const union tgsi_exec_channel *src0, 
const union tgsi_exec_channel *src1 ) { +#if FAST_MATH + dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] ); + dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] ); + dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] ); + dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] ); +#else dst->f[0] = powf( src0->f[0], src1->f[0] ); dst->f[1] = powf( src0->f[1], src1->f[1] ); dst->f[2] = powf( src0->f[2], src1->f[2] ); dst->f[3] = powf( src0->f[3], src1->f[3] ); +#endif } static void @@ -2024,7 +2050,11 @@ exec_instruction( /* TGSI_OPCODE_EX2 */ FETCH(&r[0], 0, CHAN_X); +#if FAST_MATH + micro_exp2( &r[0], &r[0] ); +#else micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] ); +#endif FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 485e5a0e6f5..e3906070237 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -27,6 +27,7 @@ #include "pipe/p_util.h" #include "pipe/p_shader_tokens.h" +#include "util/u_math.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi_exec.h" @@ -42,6 +43,8 @@ */ #define HIGH_PRECISION 1 +#define FAST_MATH 1 + #define FOR_EACH_CHANNEL( CHAN )\ for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) @@ -623,10 +626,17 @@ ex24f( { const unsigned X = 0; +#if FAST_MATH + store[X + 0] = util_fast_exp2( store[X + 0] ); + store[X + 1] = util_fast_exp2( store[X + 1] ); + store[X + 2] = util_fast_exp2( store[X + 2] ); + store[X + 3] = util_fast_exp2( store[X + 3] ); +#else store[X + 0] = powf( 2.0f, store[X + 0] ); store[X + 1] = powf( 2.0f, store[X + 1] ); store[X + 2] = powf( 2.0f, store[X + 2] ); store[X + 3] = powf( 2.0f, store[X + 3] ); +#endif } static void @@ -762,10 +772,17 @@ pow4f( { const unsigned X = 0; +#if FAST_MATH + store[X + 0] = util_fast_pow( store[X + 0], store[X + 4] ); + store[X + 1] = util_fast_pow( store[X + 1], store[X + 5] ); + store[X + 2] = 
util_fast_pow( store[X + 2], store[X + 6] ); + store[X + 3] = util_fast_pow( store[X + 3], store[X + 7] ); +#else store[X + 0] = powf( store[X + 0], store[X + 4] ); store[X + 1] = powf( store[X + 1], store[X + 5] ); store[X + 2] = powf( store[X + 2], store[X + 6] ); store[X + 3] = powf( store[X + 3], store[X + 7] ); +#endif } static void @@ -2235,6 +2252,8 @@ tgsi_emit_sse2( unsigned ok = 1; uint num_immediates = 0; + util_init_math(); + func->csr = func->store; tgsi_parse_init( &parse, tokens ); |