Diffstat (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c')
-rw-r--r-- | src/gallium/auxiliary/draw/draw_vs_aos.c | 2252
1 file changed, 2252 insertions, 0 deletions
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c new file mode 100644 index 00000000000..0c693a4a65c --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -0,0 +1,2252 @@ +/* + * Mesa 3-D graphics library + * Version: 6.3 + * + * Copyright (C) 1999-2004 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code + * using the rtasm runtime assembler. Based on the old + * t_vb_arb_program_sse.c + */ + + +#include "util/u_memory.h" +#include "util/u_math.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_debug.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" + +#include "draw_vs.h" +#include "draw_vs_aos.h" + +#include "rtasm/rtasm_x86sse.h" + +#ifdef PIPE_ARCH_X86 +#define DISASSEM 0 +#define FAST_MATH 1 + +static const char *files[] = +{ + "NULL", + "CONST", + "IN", + "OUT", + "TEMP", + "SAMP", + "ADDR", + "IMM", + "INTERNAL", +}; + +static INLINE boolean eq( struct x86_reg a, + struct x86_reg b ) +{ + return (a.file == b.file && + a.idx == b.idx && + a.mod == b.mod && + a.disp == b.disp); +} + +struct x86_reg aos_get_x86( struct aos_compilation *cp, + unsigned which_reg, /* quick hack */ + unsigned value ) +{ + struct x86_reg reg; + + if (which_reg == 0) + reg = cp->temp_EBP; + else + reg = cp->tmp_EAX; + + if (cp->x86_reg[which_reg] != value) { + unsigned offset; + + switch (value) { + case X86_IMMEDIATES: + assert(which_reg == 0); + offset = Offset(struct aos_machine, immediates); + break; + case X86_CONSTANTS: + assert(which_reg == 1); + offset = Offset(struct aos_machine, constants); + break; + case X86_BUFFERS: + assert(which_reg == 0); + offset = Offset(struct aos_machine, buffer); + break; + default: + assert(0); + offset = 0; + } + + + x86_mov(cp->func, reg, + x86_make_disp(cp->machine_EDX, offset)); + + cp->x86_reg[which_reg] = value; + } + + return reg; +} + + +static struct x86_reg get_reg_ptr(struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + struct x86_reg ptr = cp->machine_EDX; + + switch (file) { + case TGSI_FILE_INPUT: + assert(idx < MAX_INPUTS); + return x86_make_disp(ptr, Offset(struct aos_machine, input[idx])); + + case TGSI_FILE_OUTPUT: + return x86_make_disp(ptr, Offset(struct aos_machine, output[idx])); + + case TGSI_FILE_TEMPORARY: + assert(idx < MAX_TEMPS); + return x86_make_disp(ptr, Offset(struct aos_machine, 
temp[idx])); + + case AOS_FILE_INTERNAL: + assert(idx < MAX_INTERNALS); + return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx])); + + case TGSI_FILE_IMMEDIATE: + assert(idx < MAX_IMMEDIATES); /* just a sanity check */ + return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float)); + + case TGSI_FILE_CONSTANT: + assert(idx < MAX_CONSTANTS); /* just a sanity check */ + return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float)); + + default: + ERROR(cp, "unknown reg file"); + return x86_make_reg(0,0); + } +} + + + +#define X87_CW_EXCEPTION_INV_OP (1<<0) +#define X87_CW_EXCEPTION_DENORM_OP (1<<1) +#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2) +#define X87_CW_EXCEPTION_OVERFLOW (1<<3) +#define X87_CW_EXCEPTION_UNDERFLOW (1<<4) +#define X87_CW_EXCEPTION_PRECISION (1<<5) +#define X87_CW_PRECISION_SINGLE (0<<8) +#define X87_CW_PRECISION_RESERVED (1<<8) +#define X87_CW_PRECISION_DOUBLE (2<<8) +#define X87_CW_PRECISION_DOUBLE_EXT (3<<8) +#define X87_CW_PRECISION_MASK (3<<8) +#define X87_CW_ROUND_NEAREST (0<<10) +#define X87_CW_ROUND_DOWN (1<<10) +#define X87_CW_ROUND_UP (2<<10) +#define X87_CW_ROUND_ZERO (3<<10) +#define X87_CW_ROUND_MASK (3<<10) +#define X87_CW_INFINITY (1<<12) + + + + +static void spill( struct aos_compilation *cp, unsigned idx ) +{ + if (!cp->xmm[idx].dirty || + (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */ + cp->xmm[idx].file != TGSI_FILE_OUTPUT && + cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) { + ERROR(cp, "invalid spill"); + return; + } + else { + struct x86_reg oldval = get_reg_ptr(cp, + cp->xmm[idx].file, + cp->xmm[idx].idx); + + if (0) debug_printf("\nspill %s[%d]", + files[cp->xmm[idx].file], + cp->xmm[idx].idx); + + assert(cp->xmm[idx].dirty); + sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx)); + cp->xmm[idx].dirty = 0; + } +} + + +void aos_spill_all( struct aos_compilation *cp ) +{ + unsigned i; + + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } +} + + +static struct x86_reg get_xmm_writable( struct aos_compilation *cp, + struct x86_reg reg ) +{ + if (reg.file != file_XMM || + cp->xmm[reg.idx].file != TGSI_FILE_NULL) + { + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movaps(cp->func, tmp, reg); + reg = tmp; + } + + cp->xmm[reg.idx].last_used = cp->insn_counter; + return reg; +} + +static struct x86_reg get_xmm( struct aos_compilation *cp, + struct x86_reg reg ) +{ + if (reg.file != file_XMM) + { + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movaps(cp->func, tmp, reg); + reg = tmp; + } + + cp->xmm[reg.idx].last_used = cp->insn_counter; + return reg; +} + + +/* Allocate an empty xmm register, either as a temporary or later to + * "adopt" as a shader reg. + */ +struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) +{ + unsigned i; + unsigned oldest = 0; + boolean found = FALSE; + + for (i = 0; i < 8; i++) + if (cp->xmm[i].last_used != cp->insn_counter && + cp->xmm[i].file == TGSI_FILE_NULL) { + oldest = i; + found = TRUE; + } + + if (!found) { + for (i = 0; i < 8; i++) + if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) + oldest = i; + } + + /* Need to write out the old value? 
+ */ + if (cp->xmm[oldest].dirty) + spill(cp, oldest); + + assert(cp->xmm[oldest].last_used != cp->insn_counter); + + cp->xmm[oldest].file = TGSI_FILE_NULL; + cp->xmm[oldest].idx = 0; + cp->xmm[oldest].dirty = 0; + cp->xmm[oldest].last_used = cp->insn_counter; + return x86_make_reg(file_XMM, oldest); +} + +void aos_release_xmm_reg( struct aos_compilation *cp, + unsigned idx ) +{ + cp->xmm[idx].file = TGSI_FILE_NULL; + cp->xmm[idx].idx = 0; + cp->xmm[idx].dirty = 0; + cp->xmm[idx].last_used = 0; +} + + + + +/* Mark an xmm reg as holding the current copy of a shader reg. + */ +void aos_adopt_xmm_reg( struct aos_compilation *cp, + struct x86_reg reg, + unsigned file, + unsigned idx, + unsigned dirty ) +{ + unsigned i; + + if (reg.file != file_XMM) { + assert(0); + return; + } + + + /* If any xmm reg thinks it holds this shader reg, break the + * illusion. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx) + { + /* If an xmm reg is already holding this shader reg, take into account its + * dirty flag... + */ + dirty |= cp->xmm[i].dirty; + aos_release_xmm_reg(cp, i); + } + } + + cp->xmm[reg.idx].file = file; + cp->xmm[reg.idx].idx = idx; + cp->xmm[reg.idx].dirty = dirty; + cp->xmm[reg.idx].last_used = cp->insn_counter; +} + + +/* Return a pointer to the in-memory copy of the reg, making sure it is uptodate. + */ +static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + unsigned i; + + /* Ensure the in-memory copy of this reg is up-to-date + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx && + cp->xmm[i].dirty) { + spill(cp, i); + } + } + + return get_reg_ptr( cp, file, idx ); +} + + +/* As above, but return a pointer. Note - this pointer may alias + * those returned by get_arg_ptr(). + */ +static struct x86_reg get_dst_ptr( struct aos_compilation *cp, + const struct tgsi_full_dst_register *dst ) +{ + unsigned file = dst->DstRegister.File; + unsigned idx = dst->DstRegister.Index; + unsigned i; + + + /* Ensure in-memory copy of this reg is up-to-date and invalidate + * any xmm copies. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx) + { + if (cp->xmm[i].dirty) + spill(cp, i); + + aos_release_xmm_reg(cp, i); + } + } + + return get_reg_ptr( cp, file, idx ); +} + + + + + +/* Return an XMM reg if the argument is resident, otherwise return a + * base+offset pointer to the saved value. 
+ */ +struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + unsigned i; + + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx) + { + cp->xmm[i].last_used = cp->insn_counter; + return x86_make_reg(file_XMM, i); + } + } + + /* If not found in the XMM register file, return an indirect + * reference to the in-memory copy: + */ + return get_reg_ptr( cp, file, idx ); +} + + + +static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + struct x86_reg reg = get_xmm( cp, + aos_get_shader_reg( cp, file, idx ) ); + + aos_adopt_xmm_reg( cp, + reg, + file, + idx, + FALSE ); + + return reg; +} + + + +struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, + unsigned imm ) +{ + return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm ); +} + + +struct x86_reg aos_get_internal( struct aos_compilation *cp, + unsigned imm ) +{ + return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm ); +} + + + + + +/* Emulate pshufd insn in regular SSE, if necessary: + */ +static void emit_pshufd( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg arg0, + ubyte shuf ) +{ + if (cp->have_sse2) { + sse2_pshufd(cp->func, dst, arg0, shuf); + } + else { + if (!eq(dst, arg0)) + sse_movaps(cp->func, dst, arg0); + + sse_shufps(cp->func, dst, dst, shuf); + } +} + +/* load masks (pack into negs??) + * pshufd - shuffle according to writemask + * and - result, mask + * nand - dest, mask + * or - dest, result + */ +static boolean mask_write( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg result, + unsigned mask ) +{ + struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + emit_pshufd(cp, tmp, imm_swz, + SHUF((mask & 1) ? 2 : 3, + (mask & 2) ? 2 : 3, + (mask & 4) ? 2 : 3, + (mask & 8) ? 2 : 3)); + + sse_andps(cp->func, dst, tmp); + sse_andnps(cp->func, tmp, result); + sse_orps(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + return TRUE; +} + + + + +/* Helper for writemask: + */ +static boolean emit_shuf_copy2( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg arg0, + struct x86_reg arg1, + ubyte shuf ) +{ + struct x86_reg tmp = aos_get_xmm_reg(cp); + + emit_pshufd(cp, dst, arg1, shuf); + emit_pshufd(cp, tmp, arg0, shuf); + sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W)); + emit_pshufd(cp, dst, dst, shuf); + + aos_release_xmm_reg(cp, tmp.idx); + return TRUE; +} + + + +#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6)) + + +/* Locate a source register and perform any required (simple) swizzle. + * + * Just fail on complex swizzles at this point. 
+ */ +static struct x86_reg fetch_src( struct aos_compilation *cp, + const struct tgsi_full_src_register *src ) +{ + struct x86_reg arg0 = aos_get_shader_reg(cp, + src->SrcRegister.File, + src->SrcRegister.Index); + unsigned i; + ubyte swz = 0; + unsigned negs = 0; + unsigned abs = 0; + + for (i = 0; i < 4; i++) { + unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i ); + unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i ); + + switch (swizzle) { + case TGSI_EXTSWIZZLE_ZERO: + case TGSI_EXTSWIZZLE_ONE: + ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2"); + break; + + default: + swz |= (swizzle & 0x3) << (i * 2); + break; + } + + switch (neg) { + case TGSI_UTIL_SIGN_TOGGLE: + negs |= (1<<i); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + + case TGSI_UTIL_SIGN_CLEAR: + abs |= (1<<i); + break; + + default: + ERROR(cp, "unsupported sign-mode"); + break; + } + } + + if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) { + struct x86_reg dst = aos_get_xmm_reg(cp); + + if (swz != SSE_SWIZZLE_NOOP) + emit_pshufd(cp, dst, arg0, swz); + else + sse_movaps(cp->func, dst, arg0); + + if (negs && negs != 0xf) { + struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + /* Load 1,-1,0,0 + * Use neg as arg to pshufd + * Multiply + */ + emit_pshufd(cp, tmp, imm_swz, + SHUF((negs & 1) ? 1 : 0, + (negs & 2) ? 1 : 0, + (negs & 4) ? 1 : 0, + (negs & 8) ? 1 : 0)); + sse_mulps(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + } + else if (negs) { + struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS); + sse_mulps(cp->func, dst, imm_negs); + } + + + if (abs && abs != 0xf) { + ERROR(cp, "unsupported partial abs"); + } + else if (abs) { + struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movaps(cp->func, tmp, dst); + sse_mulps(cp->func, tmp, neg); + sse_maxps(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + } + + return dst; + } + + return arg0; +} + +static void x87_fld_src( struct aos_compilation *cp, + const struct tgsi_full_src_register *src, + unsigned channel ) +{ + struct x86_reg arg0 = aos_get_shader_reg_ptr(cp, + src->SrcRegister.File, + src->SrcRegister.Index); + + unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel ); + unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel ); + + switch (swizzle) { + case TGSI_EXTSWIZZLE_ZERO: + x87_fldz( cp->func ); + break; + + case TGSI_EXTSWIZZLE_ONE: + x87_fld1( cp->func ); + break; + + default: + x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) ); + break; + } + + + switch (neg) { + case TGSI_UTIL_SIGN_TOGGLE: + /* Flip the sign: + */ + x87_fchs( cp->func ); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + + case TGSI_UTIL_SIGN_CLEAR: + x87_fabs( cp->func ); + break; + + case TGSI_UTIL_SIGN_SET: + x87_fabs( cp->func ); + x87_fchs( cp->func ); + break; + + default: + ERROR(cp, "unsupported sign-mode"); + break; + } +} + + + + + + +/* Used to implement write masking. This and most of the other instructions + * here would be easier to implement if there had been a translation + * to a 2 argument format (dst/arg0, arg1) at the shader level before + * attempting to translate to x86/sse code. 
+ */ +static void store_dest( struct aos_compilation *cp, + const struct tgsi_full_dst_register *reg, + struct x86_reg result ) +{ + struct x86_reg dst; + + switch (reg->DstRegister.WriteMask) { + case 0: + return; + + case TGSI_WRITEMASK_XYZW: + aos_adopt_xmm_reg(cp, + get_xmm_writable(cp, result), + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE); + return; + default: + break; + } + + dst = aos_get_shader_reg_xmm(cp, + reg->DstRegister.File, + reg->DstRegister.Index); + + switch (reg->DstRegister.WriteMask) { + case TGSI_WRITEMASK_X: + sse_movss(cp->func, dst, get_xmm(cp, result)); + break; + + case TGSI_WRITEMASK_ZW: + sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W)); + break; + + case TGSI_WRITEMASK_XY: + result = get_xmm_writable(cp, result); + sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W)); + dst = result; + break; + + case TGSI_WRITEMASK_YZW: + result = get_xmm_writable(cp, result); + sse_movss(cp->func, result, dst); + dst = result; + break; + + default: + mask_write(cp, dst, result, reg->DstRegister.WriteMask); + break; + } + + aos_adopt_xmm_reg(cp, + dst, + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE); + +} + +static void inject_scalar( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg result, + ubyte swizzle ) +{ + sse_shufps(cp->func, dst, dst, swizzle); + sse_movss(cp->func, dst, result); + sse_shufps(cp->func, dst, dst, swizzle); +} + + +static void store_scalar_dest( struct aos_compilation *cp, + const struct tgsi_full_dst_register *reg, + struct x86_reg result ) +{ + unsigned writemask = reg->DstRegister.WriteMask; + struct x86_reg dst; + + if (writemask != TGSI_WRITEMASK_X && + writemask != TGSI_WRITEMASK_Y && + writemask != TGSI_WRITEMASK_Z && + writemask != TGSI_WRITEMASK_W && + writemask != 0) + { + result = get_xmm_writable(cp, result); /* already true, right? 
*/ + sse_shufps(cp->func, result, result, SHUF(X,X,X,X)); + store_dest(cp, reg, result); + return; + } + + result = get_xmm(cp, result); + dst = aos_get_shader_reg_xmm(cp, + reg->DstRegister.File, + reg->DstRegister.Index); + + + + switch (reg->DstRegister.WriteMask) { + case TGSI_WRITEMASK_X: + sse_movss(cp->func, dst, result); + break; + + case TGSI_WRITEMASK_Y: + inject_scalar(cp, dst, result, SHUF(Y, X, Z, W)); + break; + + case TGSI_WRITEMASK_Z: + inject_scalar(cp, dst, result, SHUF(Z, Y, X, W)); + break; + + case TGSI_WRITEMASK_W: + inject_scalar(cp, dst, result, SHUF(W, Y, Z, X)); + break; + + default: + break; + } + + aos_adopt_xmm_reg(cp, + dst, + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE); +} + + + +static void x87_fst_or_nop( struct x86_function *func, + unsigned writemask, + unsigned channel, + struct x86_reg ptr ) +{ + assert(ptr.file == file_REG32); + if (writemask & (1<<channel)) + x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) ); +} + +static void x87_fstp_or_pop( struct x86_function *func, + unsigned writemask, + unsigned channel, + struct x86_reg ptr ) +{ + assert(ptr.file == file_REG32); + if (writemask & (1<<channel)) + x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) ); + else + x87_fstp( func, x86_make_reg( file_x87, 0 )); +} + + + +/* + */ +static void x87_fstp_dest4( struct aos_compilation *cp, + const struct tgsi_full_dst_register *dst ) +{ + struct x86_reg ptr = get_dst_ptr(cp, dst); + unsigned writemask = dst->DstRegister.WriteMask; + + x87_fst_or_nop(cp->func, writemask, 0, ptr); + x87_fst_or_nop(cp->func, writemask, 1, ptr); + x87_fst_or_nop(cp->func, writemask, 2, ptr); + x87_fstp_or_pop(cp->func, writemask, 3, ptr); +} + +/* Save current x87 state and put it into single precision mode. 
+ */ +static void save_fpu_state( struct aos_compilation *cp ) +{ + x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_restore))); +} + +static void restore_fpu_state( struct aos_compilation *cp ) +{ + x87_fnclex(cp->func); + x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_restore))); +} + +static void set_fpu_round_neg_inf( struct aos_compilation *cp ) +{ + if (cp->fpucntl != FPU_RND_NEG) { + cp->fpucntl = FPU_RND_NEG; + x87_fnclex(cp->func); + x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_rnd_neg_inf))); + } +} + +static void set_fpu_round_nearest( struct aos_compilation *cp ) +{ + if (cp->fpucntl != FPU_RND_NEAREST) { + cp->fpucntl = FPU_RND_NEAREST; + x87_fnclex(cp->func); + x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_rnd_nearest))); + } +} + +#if 0 +static void x87_emit_ex2( struct aos_compilation *cp ) +{ + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + int stack = cp->func->x87_stack; + +// set_fpu_round_neg_inf( cp ); + + x87_fld(cp->func, st0); /* a a */ + x87_fprndint( cp->func ); /* int(a) a*/ + x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */ + x87_fxch(cp->func, st1); /* frc(a) int(a) */ + x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */ + x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */ + x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */ + x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */ + /* 2^a int(a) */ + x87_fstp(cp->func, st1); /* 2^a */ + + assert( stack == cp->func->x87_stack); + +} +#endif + +#if 0 +static void PIPE_CDECL print_reg( const char *msg, + const float *reg ) +{ + debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]); +} +#endif + +#if 0 +static void emit_print( struct aos_compilation *cp, + const char *message, /* must point to a static string! */ + unsigned file, + unsigned idx ) +{ + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx ); + unsigned i; + + /* There shouldn't be anything on the x87 stack. Can add this + * capacity later if need be. + */ + assert(cp->func->x87_stack == 0); + + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. We're obviously not concerned about performance on this + * debug path, so here goes: + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + + aos_release_xmm_reg(cp, i); + } + + /* Push caller-save (ie scratch) regs. + */ + x86_cdecl_caller_push_regs( cp->func ); + + + /* Push the arguments: + */ + x86_lea( cp->func, ecx, arg ); + x86_push( cp->func, ecx ); + x86_push_imm32( cp->func, (int)message ); + + /* Call the helper. Could call debug_printf directly, but + * print_reg is a nice place to put a breakpoint if need be. + */ + x86_mov_reg_imm( cp->func, ecx, (int)print_reg ); + x86_call( cp->func, ecx ); + x86_pop( cp->func, ecx ); + x86_pop( cp->func, ecx ); + + /* Pop caller-save regs + */ + x86_cdecl_caller_pop_regs( cp->func ); + + /* Done... + */ +} +#endif + +/** + * The traditional instructions. All operate on internal registers + * and ignore write masks and swizzling issues. 
+ */ + +static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movaps(cp->func, tmp, arg0); + sse_mulps(cp->func, tmp, neg); + sse_maxps(cp->func, tmp, arg0); + + store_dest(cp, &op->FullDstRegisters[0], tmp); + return TRUE; +} + +static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_addps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + x87_fcos(cp->func); + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + +/* The dotproduct instructions don't really do that well in sse: + * XXX: produces wrong results -- disabled. + */ +static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_mulps(cp->func, dst, arg1); + /* Now the hard bit: sum the first 3 values: + */ + sse_movhlps(cp->func, tmp, dst); + sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_mulps(cp->func, dst, arg1); + + /* Now the hard bit: sum the values: + */ + sse_movhlps(cp->func, tmp, dst); + sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_mulps(cp->func, dst, arg1); + + /* Now the hard bit: sum the values (from DP3): + */ + sse_movhlps(cp->func, tmp, dst); + sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? 
*/ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); + sse_addss(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + +/* dst[0] = 1.0 * 1.0F; */ +/* dst[1] = arg0[1] * arg1[1]; */ +/* dst[2] = arg0[2] * 1.0; */ +/* dst[3] = 1.0 * arg1[3]; */ + + emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y)); + emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W)); + sse_mulps(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld1(cp->func); /* 1 */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 1 */ + x87_fyl2x(cp->func); /* log2(a0) */ + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + +#if 0 +static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + x87_emit_ex2(cp); + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} +#endif + + +static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_neg_inf( cp ); + + /* Load all sources first to avoid aliasing + */ + for (i = 3; i >= 0; i--) { + if (writemask & (1<<i)) { + x87_fld_src(cp, &op->FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<<i)) { + x87_fprndint( cp->func ); + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + +static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_nearest( cp ); + + /* Load all sources first to avoid aliasing + */ + for (i = 3; i >= 0; i--) { + if (writemask & (1<<i)) { + x87_fld_src(cp, &op->FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<<i)) { + x87_fprndint( cp->func ); + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + +static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_neg_inf( cp ); + + /* suck all the source values onto the stack before writing out any + * dst, which may alias... 
+ */ + for (i = 3; i >= 0; i--) { + if (writemask & (1<<i)) { + x87_fld_src(cp, &op->FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<<i)) { + x87_fld(cp->func, st0); /* a a */ + x87_fprndint( cp->func ); /* flr(a) a */ + x87_fsubp(cp->func, st1); /* frc(a) */ + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + + + + + +static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + unsigned lit_count = cp->lit_count++; + struct x86_reg result, arg0; + unsigned i; + +#if 1 + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } +#endif + + if (writemask != TGSI_WRITEMASK_XYZW) + result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0])); + else + result = get_dst_ptr(cp, &op->FullDstRegisters[0]); + + + arg0 = fetch_src( cp, &op->FullSrcRegisters[0] ); + if (arg0.file == file_XMM) { + struct x86_reg tmp = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, tmp[1])); + sse_movaps( cp->func, tmp, arg0 ); + arg0 = tmp; + } + + + + /* Push caller-save (ie scratch) regs. + */ + x86_cdecl_caller_push_regs( cp->func ); + + /* Push the arguments: + */ + x86_push_imm32( cp->func, lit_count ); + + x86_lea( cp->func, ecx, arg0 ); + x86_push( cp->func, ecx ); + + x86_lea( cp->func, ecx, result ); + x86_push( cp->func, ecx ); + + x86_push( cp->func, cp->machine_EDX ); + + if (lit_count < MAX_LIT_INFO) { + x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX, + Offset(struct aos_machine, lit_info) + + lit_count * sizeof(struct lit_info) + + Offset(struct lit_info, func))); + } + else { + x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit ); + } + + x86_call( cp->func, ecx ); + + x86_pop( cp->func, ecx ); /* fixme... */ + x86_pop( cp->func, ecx ); + x86_pop( cp->func, ecx ); + x86_pop( cp->func, ecx ); + + x86_cdecl_caller_pop_regs( cp->func ); + + if (writemask != TGSI_WRITEMASK_XYZW) { + store_dest( cp, + &op->FullDstRegisters[0], + get_xmm_writable( cp, result ) ); + } + + return TRUE; +} + +#if 0 +static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + + if (writemask & TGSI_WRITEMASK_YZ) { + struct x86_reg st1 = x86_make_reg(file_x87, 1); + struct x86_reg st2 = x86_make_reg(file_x87, 2); + + /* a1' = a1 <= 0 ? 1 : a1; + */ + x87_fldz(cp->func); /* 1 0 */ +#if 1 + x87_fld1(cp->func); /* 1 0 */ +#else + /* Correct but slow due to fp exceptions generated in fyl2x - fix me. 
+ */ + x87_fldz(cp->func); /* 1 0 */ +#endif + x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0 */ + x87_fcomi(cp->func, st2); /* a1 1 0 */ + x87_fcmovb(cp->func, st1); /* a1' 1 0 */ + x87_fstp(cp->func, st1); /* a1' 0 */ + x87_fstp(cp->func, st1); /* a1' */ + + x87_fld_src(cp, &op->FullSrcRegisters[0], 3); /* a3 a1' */ + x87_fxch(cp->func, st1); /* a1' a3 */ + + + /* Compute pow(a1, a3) + */ + x87_fyl2x(cp->func); /* a3*log2(a1) */ + x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */ + + + /* a0' = max2(a0, 0): + */ + x87_fldz(cp->func); /* 0 r2 */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 0 r2 */ + x87_fcomi(cp->func, st1); + x87_fcmovb(cp->func, st1); /* a0' 0 r2 */ + + x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */ + + x87_fcomi(cp->func, st1); /* a0' 0 r2 */ + x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */ + + x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */ + x87_fpop(cp->func); /* r2 */ + x87_fpop(cp->func); + } + + if (writemask & TGSI_WRITEMASK_XW) { + x87_fld1(cp->func); + x87_fst_or_nop(cp->func, writemask, 0, dst); + x87_fstp_or_pop(cp->func, writemask, 3, dst); + } + + return TRUE; +} +#endif + + + +static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_maxps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_minps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + /* potentially nothing to do */ + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_mulps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]); + + /* If we can't clobber old contents of arg0, get a temporary & copy + * it there, then clobber it... + */ + arg0 = get_xmm_writable(cp, arg0); + + sse_mulps(cp->func, arg0, arg1); + sse_addps(cp->func, arg0, arg2); + store_dest(cp, &op->FullDstRegisters[0], arg0); + return TRUE; +} + + + +/* A wrapper for powf(). + * Makes sure it is cdecl and operates on floats. 
+ */ +static float PIPE_CDECL _powerf( float x, float y ) +{ +#if FAST_MATH + return util_fast_pow(x, y); +#else + return powf( x, y ); +#endif +} + +#if FAST_MATH +static float PIPE_CDECL _exp2(float x) +{ + return util_fast_exp2(x); +} +#endif + + +/* Really not sufficient -- need to check for conditions that could + * generate inf/nan values, which will slow things down hugely. + */ +static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +#if 0 + x87_fld_src(cp, &op->FullSrcRegisters[1], 0); /* a1.x */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0.x a1.x */ + x87_fyl2x(cp->func); /* a1*log2(a0) */ + + x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */ + + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +#else + uint i; + + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } + + /* Push caller-save (ie scratch) regs. + */ + x86_cdecl_caller_push_regs( cp->func ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) ); + + x87_fld_src( cp, &op->FullSrcRegisters[1], 0 ); + x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) ); + x87_fld_src( cp, &op->FullSrcRegisters[0], 0 ); + x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); + + /* tmp_EAX has been pushed & will be restored below */ + x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf ); + x86_call( cp->func, cp->tmp_EAX ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) ); + + x86_cdecl_caller_pop_regs( cp->func ); + + /* Note retval on x87 stack: + */ + cp->func->x87_stack++; + + x87_fstp_dest4( cp, &op->FullDstRegisters[0] ); +#endif + return TRUE; +} + + +#if FAST_MATH +static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + uint i; + + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } + + /* Push caller-save (ie scratch) regs. + */ + x86_cdecl_caller_push_regs( cp->func ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) ); + + x87_fld_src( cp, &op->FullSrcRegisters[0], 0 ); + x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); + + /* tmp_EAX has been pushed & will be restored below */ + x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 ); + x86_call( cp->func, cp->tmp_EAX ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) ); + + x86_cdecl_caller_pop_regs( cp->func ); + + /* Note retval on x87 stack: + */ + cp->func->x87_stack++; + + x87_fstp_dest4( cp, &op->FullDstRegisters[0] ); + + return TRUE; +} +#endif + + +static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + if (cp->have_sse2) { + sse2_rcpss(cp->func, dst, arg0); + /* extend precision here... 
+ */ + } + else { + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + sse_movss(cp->func, dst, ones); + sse_divss(cp->func, dst, arg0); + } + + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +/* Although rsqrtps() and rcpps() are low precision on some/all SSE + * implementations, it is possible to improve its precision at + * fairly low cost, using a newton/raphson step, as below: + * + * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) + * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] + * or: + * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)] + * + * + * See: http://softwarecommunity.intel.com/articles/eng/1818.htm + */ +static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + + if (0) { + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg r = aos_get_xmm_reg(cp); + sse_rsqrtss(cp->func, r, arg0); + store_scalar_dest(cp, &op->FullDstRegisters[0], r); + return TRUE; + } + else { + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg r = aos_get_xmm_reg(cp); + + struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ ); + struct x86_reg one_point_five = x86_make_disp( neg_half, 4 ); + struct x86_reg src = get_xmm_writable( cp, arg0 ); + + sse_rsqrtss( cp->func, r, src ); /* rsqrtss(a) */ + sse_mulss( cp->func, src, neg_half ); /* -.5 * a */ + sse_mulss( cp->func, src, r ); /* -.5 * a * r */ + sse_mulss( cp->func, src, r ); /* -.5 * a * r * r */ + sse_addss( cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */ + sse_mulss( cp->func, r, src ); /* r * (1.5 - .5 * a * r * r) */ + + store_scalar_dest(cp, &op->FullDstRegisters[0], r); + return TRUE; + } +} + + +static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); + sse_andps(cp->func, dst, ones); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + x87_fsin(cp->func); + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + + +static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_cmpps(cp->func, dst, arg1, cc_LessThan); + sse_andps(cp->func, dst, ones); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_subps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg tmp0 = 
aos_get_xmm_reg(cp); + + sse2_cvttps2dq(cp->func, tmp0, arg0); + sse2_cvtdq2ps(cp->func, tmp0, tmp0); + + store_dest(cp, &op->FullDstRegisters[0], tmp0); + return TRUE; +} + +static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg tmp0 = aos_get_xmm_reg(cp); + struct x86_reg tmp1 = aos_get_xmm_reg(cp); + + emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); + sse_mulps(cp->func, tmp1, arg0); + emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W)); + sse_mulps(cp->func, tmp0, arg1); + sse_subps(cp->func, tmp1, tmp0); + sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); + +/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ +/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */ +/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */ +/* dst[3] is undef */ + + + aos_release_xmm_reg(cp, tmp0.idx); + store_dest(cp, &op->FullDstRegisters[0], tmp1); + return TRUE; +} + + + +static boolean +emit_instruction( struct aos_compilation *cp, + struct tgsi_full_instruction *inst ) +{ + x87_assert_stack_empty(cp->func); + + switch( inst->Instruction.Opcode ) { + case TGSI_OPCODE_MOV: + return emit_MOV( cp, inst ); + + case TGSI_OPCODE_LIT: + return emit_LIT(cp, inst); + + case TGSI_OPCODE_RCP: + return emit_RCP(cp, inst); + + case TGSI_OPCODE_RSQ: + return emit_RSQ(cp, inst); + + case TGSI_OPCODE_EXP: + /*return emit_EXP(cp, inst);*/ + return FALSE; + + case TGSI_OPCODE_LOG: + /*return emit_LOG(cp, inst);*/ + return FALSE; + + case TGSI_OPCODE_MUL: + return emit_MUL(cp, inst); + + case TGSI_OPCODE_ADD: + return emit_ADD(cp, inst); + + case TGSI_OPCODE_DP3: + return emit_DP3(cp, inst); + + case TGSI_OPCODE_DP4: + return emit_DP4(cp, inst); + + case TGSI_OPCODE_DST: + return emit_DST(cp, inst); + + case TGSI_OPCODE_MIN: + return emit_MIN(cp, inst); + + case TGSI_OPCODE_MAX: + return emit_MAX(cp, inst); + + case TGSI_OPCODE_SLT: + return emit_SLT(cp, inst); + + case TGSI_OPCODE_SGE: + return emit_SGE(cp, inst); + + case TGSI_OPCODE_MAD: + return emit_MAD(cp, inst); + + case TGSI_OPCODE_SUB: + return emit_SUB(cp, inst); + + case TGSI_OPCODE_LERP: +// return emit_LERP(cp, inst); + return FALSE; + + case TGSI_OPCODE_FRAC: + return emit_FRC(cp, inst); + + case TGSI_OPCODE_CLAMP: +// return emit_CLAMP(cp, inst); + return FALSE; + + case TGSI_OPCODE_FLOOR: + return emit_FLR(cp, inst); + + case TGSI_OPCODE_ROUND: + return emit_RND(cp, inst); + + case TGSI_OPCODE_EXPBASE2: +#if FAST_MATH + return emit_EXPBASE2(cp, inst); +#elif 0 + /* this seems to fail for "larger" exponents. + * See glean tvertProg1's EX2 test. 
+ */ + return emit_EX2(cp, inst); +#else + return FALSE; +#endif + + case TGSI_OPCODE_LOGBASE2: + return emit_LG2(cp, inst); + + case TGSI_OPCODE_POWER: + return emit_POW(cp, inst); + + case TGSI_OPCODE_CROSSPRODUCT: + return emit_XPD(cp, inst); + + case TGSI_OPCODE_ABS: + return emit_ABS(cp, inst); + + case TGSI_OPCODE_DPH: + return emit_DPH(cp, inst); + + case TGSI_OPCODE_COS: + return emit_COS(cp, inst); + + case TGSI_OPCODE_SIN: + return emit_SIN(cp, inst); + + case TGSI_OPCODE_TRUNC: + return emit_TRUNC(cp, inst); + + case TGSI_OPCODE_END: + return TRUE; + + default: + return FALSE; + } +} + + +static boolean emit_viewport( struct aos_compilation *cp ) +{ + struct x86_reg pos = aos_get_shader_reg_xmm(cp, + TGSI_FILE_OUTPUT, + cp->vaos->draw->vs.position_output ); + + struct x86_reg scale = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, scale)); + + struct x86_reg translate = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, translate)); + + sse_mulps(cp->func, pos, scale); + sse_addps(cp->func, pos, translate); + + aos_adopt_xmm_reg( cp, + pos, + TGSI_FILE_OUTPUT, + cp->vaos->draw->vs.position_output, + TRUE ); + return TRUE; +} + + +/* This is useful to be able to see the results on softpipe. Doesn't + * do proper clipping, just assumes the backend can do it during + * rasterization -- for debug only... + */ +static boolean emit_rhw_viewport( struct aos_compilation *cp ) +{ + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg pos = aos_get_shader_reg_xmm(cp, + TGSI_FILE_OUTPUT, + cp->vaos->draw->vs.position_output); + + struct x86_reg scale = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, scale)); + + struct x86_reg translate = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, translate)); + + + + emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W)); + sse2_rcpss(cp->func, tmp, tmp); + sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X)); + + sse_mulps(cp->func, pos, scale); + sse_mulps(cp->func, pos, tmp); + sse_addps(cp->func, pos, translate); + + /* Set pos[3] = w + */ + mask_write(cp, pos, tmp, TGSI_WRITEMASK_W); + + aos_adopt_xmm_reg( cp, + pos, + TGSI_FILE_OUTPUT, + cp->vaos->draw->vs.position_output, + TRUE ); + return TRUE; +} + + +#if 0 +static boolean note_immediate( struct aos_compilation *cp, + struct tgsi_full_immediate *imm ) +{ + unsigned pos = cp->num_immediates++; + unsigned j; + + for (j = 0; j < imm->Immediate.NrTokens - 1; j++) { + cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float; + } + + return TRUE; +} +#endif + + + + +static void find_last_write_outputs( struct aos_compilation *cp ) +{ + struct tgsi_parse_context parse; + unsigned this_instruction = 0; + unsigned i; + + tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens ); + + while (!tgsi_parse_end_of_tokens( &parse )) { + + tgsi_parse_token( &parse ); + + if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) + continue; + + for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) { + if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File == + TGSI_FILE_OUTPUT) + { + unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index; + cp->output_last_write[idx] = this_instruction; + } + } + + this_instruction++; + } + + tgsi_parse_free( &parse ); +} + + +#define ARG_MACHINE 1 +#define ARG_START_ELTS 2 +#define ARG_COUNT 3 +#define ARG_OUTBUF 4 + + +static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, + boolean linear ) +{ + struct tgsi_parse_context parse; + struct 
aos_compilation cp; + unsigned fixup, label; + + util_init_math(); + + tgsi_parse_init( &parse, varient->base.vs->state.tokens ); + + memset(&cp, 0, sizeof(cp)); + + cp.insn_counter = 1; + cp.vaos = varient; + cp.have_sse2 = 1; + cp.func = &varient->func[ linear ? 0 : 1 ]; + + cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX); + cp.idx_EBX = x86_make_reg(file_REG32, reg_BX); + cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX); + cp.machine_EDX = x86_make_reg(file_REG32, reg_DX); + cp.count_ESI = x86_make_reg(file_REG32, reg_SI); + cp.temp_EBP = x86_make_reg(file_REG32, reg_BP); + cp.stack_ESP = x86_make_reg( file_REG32, reg_SP ); + + x86_init_func(cp.func); + + find_last_write_outputs(&cp); + + x86_push(cp.func, cp.idx_EBX); + x86_push(cp.func, cp.count_ESI); + x86_push(cp.func, cp.temp_EBP); + + + /* Load arguments into regs: + */ + x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE)); + x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS)); + x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT)); + x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF)); + + + /* Compare count to zero and possibly bail. + */ + x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX); + x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX); + fixup = x86_jcc_forward(cp.func, cc_E); + + + save_fpu_state( &cp ); + set_fpu_round_nearest( &cp ); + + aos_init_inputs( &cp, linear ); + + cp.x86_reg[0] = 0; + cp.x86_reg[1] = 0; + + /* Note address for loop jump + */ + label = x86_get_label(cp.func); + { + /* Fetch inputs... TODO: fetch lazily... + */ + if (!aos_fetch_inputs( &cp, linear )) + goto fail; + + /* Emit the shader: + */ + while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error ) + { + tgsi_parse_token( &parse ); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: +#if 0 + if (!note_immediate( &cp, &parse.FullToken.FullImmediate )) + goto fail; +#endif + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (DISASSEM) + tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter ); + + if (!emit_instruction( &cp, &parse.FullToken.FullInstruction )) + goto fail; + break; + } + + x87_assert_stack_empty(cp.func); + cp.insn_counter++; + + if (DISASSEM) + debug_printf("\n"); + } + + + { + unsigned i; + for (i = 0; i < 8; i++) { + if (cp.xmm[i].file != TGSI_FILE_OUTPUT) { + cp.xmm[i].file = TGSI_FILE_NULL; + cp.xmm[i].dirty = 0; + } + } + } + + if (cp.error) + goto fail; + + if (cp.vaos->base.key.clip) { + /* not really handling clipping, just do the rhw so we can + * see the results... + */ + emit_rhw_viewport(&cp); + } + else if (cp.vaos->base.key.viewport) { + emit_viewport(&cp); + } + + /* Emit output... TODO: do this eagerly after the last write to a + * given output. + */ + if (!aos_emit_outputs( &cp )) + goto fail; + + + /* Next vertex: + */ + x86_lea(cp.func, + cp.outbuf_ECX, + x86_make_disp(cp.outbuf_ECX, + cp.vaos->base.key.output_stride)); + + /* Incr index + */ + aos_incr_inputs( &cp, linear ); + } + /* decr count, loop if not zero + */ + x86_dec(cp.func, cp.count_ESI); + x86_jcc(cp.func, cc_NZ, label); + + restore_fpu_state(&cp); + + /* Land forward jump here: + */ + x86_fixup_fwd_jump(cp.func, fixup); + + /* Exit mmx state? 
+ */ + if (cp.func->need_emms) + mmx_emms(cp.func); + + x86_pop(cp.func, cp.temp_EBP); + x86_pop(cp.func, cp.count_ESI); + x86_pop(cp.func, cp.idx_EBX); + + x87_assert_stack_empty(cp.func); + x86_ret(cp.func); + + tgsi_parse_free( &parse ); + return !cp.error; + + fail: + tgsi_parse_free( &parse ); + return FALSE; +} + + + +static void vaos_set_buffer( struct draw_vs_varient *varient, + unsigned buf, + const void *ptr, + unsigned stride ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + if (buf < vaos->nr_vb) { + vaos->buffer[buf].base_ptr = (char *)ptr; + vaos->buffer[buf].stride = stride; + } + + if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride); +} + + + +static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient, + const unsigned *elts, + unsigned count, + void *output_buffer ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + struct aos_machine *machine = vaos->draw->vs.aos_machine; + + if (0) debug_printf("%s %d\n", __FUNCTION__, count); + + machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; + machine->constants = vaos->draw->vs.aligned_constants; + machine->immediates = vaos->base.vs->immediates; + machine->buffer = vaos->buffer; + + vaos->gen_run_elts( machine, + elts, + count, + output_buffer ); +} + +static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient, + unsigned start, + unsigned count, + void *output_buffer ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + struct aos_machine *machine = vaos->draw->vs.aos_machine; + + if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, + vaos->base.key.const_vbuffers); + + machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; + machine->constants = vaos->draw->vs.aligned_constants; + machine->immediates = vaos->base.vs->immediates; + machine->buffer = vaos->buffer; + + vaos->gen_run_linear( machine, + start, + count, + output_buffer ); + + /* Sanity spot checks to make sure we didn't trash our constants */ + assert(machine->internal[IMM_ONES][0] == 1.0f); + assert(machine->internal[IMM_IDENTITY][0] == 0.0f); + assert(machine->internal[IMM_NEGS][0] == -1.0f); +} + + + +static void vaos_destroy( struct draw_vs_varient *varient ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + FREE( vaos->buffer ); + + x86_release_func( &vaos->func[0] ); + x86_release_func( &vaos->func[1] ); + + FREE(vaos); +} + + + +static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + unsigned i; + struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); + + if (!vaos) + goto fail; + + vaos->base.key = *key; + vaos->base.vs = vs; + vaos->base.set_buffer = vaos_set_buffer; + vaos->base.destroy = vaos_destroy; + vaos->base.run_linear = vaos_run_linear; + vaos->base.run_elts = vaos_run_elts; + + vaos->draw = vs->draw; + + for (i = 0; i < key->nr_inputs; i++) + vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 ); + + vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) ); + if (!vaos->buffer) + goto fail; + + if (0) + debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers); + +#if 0 + tgsi_dump(vs->state.tokens, 0); +#endif + + if (!build_vertex_program( vaos, TRUE )) + goto fail; + + if (!build_vertex_program( vaos, FALSE )) + goto fail; + + vaos->gen_run_linear 
= (vaos_run_linear_func)x86_get_func(&vaos->func[0]); + if (!vaos->gen_run_linear) + goto fail; + + vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]); + if (!vaos->gen_run_elts) + goto fail; + + return &vaos->base; + + fail: + if (vaos && vaos->buffer) + FREE(vaos->buffer); + + if (vaos) + x86_release_func( &vaos->func[0] ); + + if (vaos) + x86_release_func( &vaos->func[1] ); + + FREE(vaos); + + return NULL; +} + + +struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + struct draw_vs_varient *varient = varient_aos_sse( vs, key ); + + if (varient == NULL) { + varient = draw_vs_varient_generic( vs, key ); + } + + return varient; +} + + + +#endif /* PIPE_ARCH_X86 */
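
The mask_write() helper in the diff above builds a per-component select out of andps/andnps/orps, using a constant shuffled out of IMM_SWZ as the lane mask. Below is a lane-by-lane C model of that blend, assuming the shuffled constant yields 0x00000000 in the slot picked for components in the writemask and 0xffffffff otherwise (the actual IMM_SWZ layout is defined elsewhere in the aos machine setup, so that part is an assumption):

#include <stdint.h>

/* dst = writemask ? result : dst, one 32-bit lane at a time, mirroring
 * sse_andps(dst, tmp); sse_andnps(tmp, result); sse_orps(dst, tmp).
 */
static void mask_write_model(uint32_t dst[4], const uint32_t result[4],
                             unsigned writemask)
{
    for (int i = 0; i < 4; i++) {
        /* keep == all-ones preserves the old dst lane */
        uint32_t keep = (writemask & (1u << i)) ? 0x00000000u : 0xffffffffu;
        dst[i] = (dst[i] & keep) | (result[i] & ~keep);
    }
}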
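
The disabled x87_emit_ex2() helper relies on the identity 2^a = 2^frc(a) * 2^int(a), because f2xm1 is only defined for arguments in [-1, 1]. A scalar C sketch of the same decomposition, assuming round-to-nearest is in effect (as set_fpu_round_nearest() arranges) so the fractional part stays within that domain; the libm calls stand in for the individual x87 opcodes:

#include <math.h>

/* 2^a decomposed the way x87_emit_ex2 does it: fprndint/fsubr split a
 * into int(a) and frc(a), f2xm1 gives 2^frc(a) - 1, fld1/faddp restore
 * the 1, and fscale applies the 2^int(a) factor.
 */
static float exp2_x87_style(float a)
{
    float i = rintf(a);          /* x87_fprndint: int(a)                 */
    float f = a - i;             /* x87_fsubr:    frc(a), in [-0.5, 0.5] */
    float p = exp2f(f) - 1.0f;   /* x87_f2xm1:    2^frc(a) - 1           */
    p += 1.0f;                   /* x87_fld1 + x87_faddp: 2^frc(a)       */
    return ldexpf(p, (int)i);    /* x87_fscale:   scale by 2^int(a)      */
}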
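
Finally, the Newton/Raphson refinement emitted by emit_RSQ() follows the formula quoted in its comment, x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)], with -0.5 and 1.5 loaded from the IMM_RSQ internal constant. A minimal scalar C sketch of that single step, in the same operation order as the emitted sse_mulss/sse_addss sequence; rsqrt_approx() is a hypothetical stand-in for the low-precision rsqrtss estimate:

#include <math.h>

/* Hypothetical stand-in for the coarse (~12-bit) rsqrtss estimate;
 * exact here, but the refinement below works for any rough guess.
 */
static float rsqrt_approx(float a)
{
    return 1.0f / sqrtf(a);
}

/* One Newton/Raphson step: r' = r * (1.5 - 0.5 * a * r * r). */
static float rsqrt_refined(float a)
{
    float r   = rsqrt_approx(a);   /* sse_rsqrtss(r, src)            */
    float src = a * -0.5f;         /* sse_mulss(src, neg_half)       */
    src *= r;                      /* sse_mulss(src, r)              */
    src *= r;                      /* sse_mulss(src, r)              */
    src += 1.5f;                   /* sse_addss(src, one_point_five) */
    return r * src;                /* sse_mulss(r, src)              */
}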