3 files changed, 113 insertions, 2 deletions
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 441877d46f0..41bdd012d56 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -31,6 +31,7 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_exec.h"
@@ -43,6 +44,7 @@
 
 #ifdef PIPE_ARCH_X86
 #define DISASSEM 0
+#define FAST_MATH 1
 
 static const char *files[] =
 {
@@ -1380,14 +1382,28 @@ static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_inst
    return TRUE;
 }
 
+
+
 /* A wrapper for powf().
  * Makes sure it is cdecl and operates on floats.
  */
 static float PIPE_CDECL _powerf( float x, float y )
 {
+#if FAST_MATH /* FAST_MATH is #defined to 1 near the top of this file */
+   return util_fast_pow(x, y); /* NOTE(review): no longer plain powf() on this path; presumably an approximation from util/u_math.h -- confirm precision is acceptable */
+#else /* !FAST_MATH: C library powf() */
    return powf( x, y );
+#endif /* FAST_MATH */
 }
 
+#if FAST_MATH /* cdecl entry point around util_fast_exp2(); emit_EXPBASE2() loads its address and calls it from generated code */
+static float PIPE_CDECL _exp2(float x)
+{
+   return util_fast_exp2(x);
+}
+#endif /* FAST_MATH */
+
+
 /* Really not sufficient -- need to check for conditions that could
  * generate inf/nan values, which will slow things down hugely.
  */
@@ -1442,6 +1458,48 @@ static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_inst
 }
 
 
+#if FAST_MATH /* EXPBASE2 implemented as a cdecl call to _exp2() from the generated code */
+static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   uint i;
+
+   /* Spill every dirty XMM reg and release all eight of them before
+    * emitting the call, so nothing is left cached in XMM across it.
+    */
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].dirty) 
+         spill(cp, i);
+      aos_release_xmm_reg(cp, i);
+   }
+
+   /* Save the cdecl caller-save (ie scratch) regs; they are popped
+    * again after the call. */
+   x86_cdecl_caller_push_regs( cp->func );
+   /* Reserve one 4-byte stack slot for the float argument. */
+   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );
+   /* Move the source operand through the x87 stack into that slot. */
+   x87_fld_src( cp, &op->FullSrcRegisters[0], 0 );
+   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
+
+   /* tmp_EAX has been pushed & will be restored below */
+   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
+   x86_call( cp->func, cp->tmp_EAX );
+   /* Give the argument slot back. */
+   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );
+
+   x86_cdecl_caller_pop_regs( cp->func );
+
+   /* The float return value is left on the x87 stack by the callee,
+    * so bump the tracked x87 depth to account for it. */
+   cp->func->x87_stack++;
+
+   x87_fstp_dest4( cp, &op->FullDstRegisters[0] ); /* presumably pops the result into the dest channels -- confirm in x87_fstp_dest4 */
+
+   return TRUE;
+}
+#endif /* FAST_MATH */
+
+
 static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
 {
    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
@@ -1662,7 +1720,9 @@ emit_instruction( struct aos_compilation *cp,
       return emit_RND(cp, inst);
 
    case TGSI_OPCODE_EXPBASE2:
-#if 0
+#if FAST_MATH
+      return emit_EXPBASE2(cp, inst);
+#elif 0
       /* this seems to fail for "larger" exponents.
        * See glean tvertProg1's EX2 test.
        */
@@ -1827,6 +1887,8 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
    struct aos_compilation cp;
    unsigned fixup, label;
 
+   util_init_math();
+
    tgsi_parse_init( &parse, varient->base.vs->state.tokens );
 
    memset(&cp, 0, sizeof(cp));
@@ -2135,4 +2197,4 @@ struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
 
 
 
-#endif
+#endif /* PIPE_ARCH_X86 */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 88a34a69613..e28b56c842f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -57,6 +57,9 @@
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
+#include "util/u_math.h"
+
+#define FAST_MATH 1
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -145,6 +148,8 @@ tgsi_exec_machine_bind_shader(
    tgsi_dump(tokens, 0);
 #endif
 
+   util_init_math();
+
    mach->Tokens = tokens;
    mach->Samplers = samplers;
 
@@ -448,10 +453,17 @@ micro_exp2(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src)
 {
+#if FAST_MATH
+   dst->f[0] = util_fast_exp2( src->f[0] );
+   dst->f[1] = util_fast_exp2( src->f[1] );
+   dst->f[2] = util_fast_exp2( src->f[2] );
+   dst->f[3] = util_fast_exp2( src->f[3] );
+#else
    dst->f[0] = powf( 2.0f, src->f[0] );
    dst->f[1] = powf( 2.0f, src->f[1] );
    dst->f[2] = powf( 2.0f, src->f[2] );
    dst->f[3] = powf( 2.0f, src->f[3] );
+#endif
 }
 
 static void
@@ -528,10 +540,17 @@ micro_lg2(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
+#if FAST_MATH
+   dst->f[0] = util_fast_log2( src->f[0] );
+   dst->f[1] = util_fast_log2( src->f[1] );
+   dst->f[2] = util_fast_log2( src->f[2] );
+   dst->f[3] = util_fast_log2( src->f[3] );
+#else
    dst->f[0] = logf( src->f[0] ) * 1.442695f;
    dst->f[1] = logf( src->f[1] ) * 1.442695f;
    dst->f[2] = logf( src->f[2] ) * 1.442695f;
    dst->f[3] = logf( src->f[3] ) * 1.442695f;
+#endif
 }
 
 static void
@@ -796,10 +815,17 @@ micro_pow(
    const union tgsi_exec_channel *src0,
    const union tgsi_exec_channel *src1 )
 {
+#if FAST_MATH
+   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
+   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
+   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
+   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
+#else
    dst->f[0] = powf( src0->f[0], src1->f[0] );
    dst->f[1] = powf( src0->f[1], src1->f[1] );
    dst->f[2] = powf( src0->f[2], src1->f[2] );
    dst->f[3] = powf( src0->f[3], src1->f[3] );
+#endif
 }
 
 static void
@@ -2024,7 +2050,11 @@ exec_instruction(
     /* TGSI_OPCODE_EX2 */
       FETCH(&r[0], 0, CHAN_X);
 
+#if FAST_MATH
+      micro_exp2( &r[0], &r[0] );
+#else
       micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
+#endif
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 485e5a0e6f5..e3906070237 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -27,6 +27,7 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
@@ -42,6 +43,8 @@
  */
 #define HIGH_PRECISION 1
 
+#define FAST_MATH 1
+
 
 #define FOR_EACH_CHANNEL( CHAN )\
    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
@@ -623,10 +626,17 @@ ex24f(
 {
    const unsigned X = 0;
 
+#if FAST_MATH
+   store[X + 0] = util_fast_exp2( store[X + 0] );
+   store[X + 1] = util_fast_exp2( store[X + 1] );
+   store[X + 2] = util_fast_exp2( store[X + 2] );
+   store[X + 3] = util_fast_exp2( store[X + 3] );
+#else
    store[X + 0] = powf( 2.0f, store[X + 0] );
    store[X + 1] = powf( 2.0f, store[X + 1] );
    store[X + 2] = powf( 2.0f, store[X + 2] );
    store[X + 3] = powf( 2.0f, store[X + 3] );
+#endif
 }
 
 static void
@@ -762,10 +772,17 @@ pow4f(
 {
    const unsigned X = 0;
 
+#if FAST_MATH
+   store[X + 0] = util_fast_pow( store[X + 0], store[X + 4] );
+   store[X + 1] = util_fast_pow( store[X + 1], store[X + 5] );
+   store[X + 2] = util_fast_pow( store[X + 2], store[X + 6] );
+   store[X + 3] = util_fast_pow( store[X + 3], store[X + 7] );
+#else
    store[X + 0] = powf( store[X + 0], store[X + 4] );
    store[X + 1] = powf( store[X + 1], store[X + 5] );
    store[X + 2] = powf( store[X + 2], store[X + 6] );
    store[X + 3] = powf( store[X + 3], store[X + 7] );
+#endif
 }
 
 static void
@@ -2235,6 +2252,8 @@ tgsi_emit_sse2(
    unsigned ok = 1;
    uint num_immediates = 0;
 
+   util_init_math();
+
    func->csr = func->store;
 
    tgsi_parse_init( &parse, tokens );