diff options
author | José Fonseca <[email protected]> | 2011-11-08 00:10:47 +0000 |
---|---|---|
committer | José Fonseca <[email protected]> | 2011-11-08 22:57:34 +0000 |
commit | 4eb3225b38ce12cb34ab3d90804c9683bd7b4ed3 (patch) | |
tree | 857d6c1740eb32fc86744f7afd81322862f6150c /src/gallium/auxiliary/draw | |
parent | 207a016ecaabbccf865a5b8e026b95a4276adc15 (diff) |
Remove tgsi_sse2.
tgsi_exec is simple. llvm is fast. tgsi_sse2 ends up being neither.
Diffstat (limited to 'src/gallium/auxiliary/draw')
-rw-r--r-- | src/gallium/auxiliary/draw/draw_private.h | 4 | ||||
-rw-r--r-- | src/gallium/auxiliary/draw/draw_vs.c | 27 | ||||
-rw-r--r-- | src/gallium/auxiliary/draw/draw_vs.h | 20 | ||||
-rw-r--r-- | src/gallium/auxiliary/draw/draw_vs_aos.c | 2267 | ||||
-rw-r--r-- | src/gallium/auxiliary/draw/draw_vs_aos.h | 255 | ||||
-rw-r--r-- | src/gallium/auxiliary/draw/draw_vs_aos_io.c | 460 | ||||
-rw-r--r-- | src/gallium/auxiliary/draw/draw_vs_aos_machine.c | 328 | ||||
-rw-r--r-- | src/gallium/auxiliary/draw/draw_vs_ppc.c | 7 | ||||
-rw-r--r-- | src/gallium/auxiliary/draw/draw_vs_sse.c | 225 |
9 files changed, 2 insertions, 3591 deletions
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index b84d2b77179..3521a035e2f 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -237,10 +237,6 @@ struct draw_context uint num_samplers; struct tgsi_sampler **samplers; - /* Here's another one: - */ - struct aos_machine *aos_machine; - const void *aligned_constants[PIPE_MAX_CONSTANT_BUFFERS]; diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c index 1763dbc199f..957bbe57a82 100644 --- a/src/gallium/auxiliary/draw/draw_vs.c +++ b/src/gallium/auxiliary/draw/draw_vs.c @@ -81,14 +81,12 @@ draw_vs_set_constants(struct draw_context *draw, } draw->vs.aligned_constants[slot] = constants; - draw_vs_aos_machine_constants(draw->vs.aos_machine, slot, constants); } void draw_vs_set_viewport( struct draw_context *draw, const struct pipe_viewport_state *viewport ) { - draw_vs_aos_machine_viewport( draw->vs.aos_machine, viewport ); } @@ -103,22 +101,8 @@ draw_create_vertex_shader(struct draw_context *draw, tgsi_dump(shader->tokens, 0); } - if (!draw->pt.middle.llvm) { -#if 0 -/* these paths don't support vertex clamping - * TODO: either add it, or remove them completely - * use LLVM instead if you want performance - * use exec instead if you want debugging/more correctness - */ -#if defined(PIPE_ARCH_X86) - vs = draw_create_vs_sse( draw, shader ); -#elif defined(PIPE_ARCH_PPC) - vs = draw_create_vs_ppc( draw, shader ); -#endif -#endif - } #if HAVE_LLVM - else { + if (draw->pt.middle.llvm) { vs = draw_create_vs_llvm(draw, shader); } #endif @@ -199,12 +183,6 @@ draw_vs_init( struct draw_context *draw ) if (!draw->vs.fetch_cache) return FALSE; - draw->vs.aos_machine = draw_vs_aos_machine(); -#ifdef PIPE_ARCH_X86 - if (!draw->vs.aos_machine) - return FALSE; -#endif - return TRUE; } @@ -219,9 +197,6 @@ draw_vs_destroy( struct draw_context *draw ) if (draw->vs.emit_cache) translate_cache_destroy(draw->vs.emit_cache); - if (draw->vs.aos_machine) - draw_vs_aos_machine_destroy(draw->vs.aos_machine); - for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { if (draw->vs.aligned_constant_storage[i]) { align_free((void *)draw->vs.aligned_constant_storage[i]); diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index e6d187e9774..49229f8164b 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -159,10 +159,6 @@ draw_create_vs_exec(struct draw_context *draw, const struct pipe_shader_state *templ); struct draw_vertex_shader * -draw_create_vs_sse(struct draw_context *draw, - const struct pipe_shader_state *templ); - -struct draw_vertex_shader * draw_create_vs_ppc(struct draw_context *draw, const struct pipe_shader_state *templ); @@ -170,10 +166,6 @@ draw_create_vs_ppc(struct draw_context *draw, struct draw_vs_variant_key; struct draw_vertex_shader; -struct draw_vs_variant * -draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs, - const struct draw_vs_variant_key *key ); - #if HAVE_LLVM struct draw_vertex_shader * draw_create_vs_llvm(struct draw_context *draw, @@ -214,18 +206,6 @@ static INLINE int draw_vs_variant_key_compare( const struct draw_vs_variant_key } -struct aos_machine *draw_vs_aos_machine( void ); -void draw_vs_aos_machine_destroy( struct aos_machine *machine ); - -void -draw_vs_aos_machine_constants(struct aos_machine *machine, - unsigned slot, - const void *constants); - -void draw_vs_aos_machine_viewport( struct aos_machine *machine, - const struct pipe_viewport_state *viewport ); - - #define MAX_TGSI_VERTICES 4 diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c deleted file mode 100644 index 7b90dba0cd5..00000000000 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ /dev/null @@ -1,2267 +0,0 @@ -/* - * Mesa 3-D graphics library - * Version: 6.3 - * - * Copyright (C) 1999-2004 Brian Paul All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code - * using the rtasm runtime assembler. Based on the old - * t_vb_arb_program_sse.c - */ - - -#include "util/u_memory.h" -#include "util/u_math.h" -#include "pipe/p_shader_tokens.h" -#include "util/u_debug.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_util.h" -#include "tgsi/tgsi_exec.h" -#include "tgsi/tgsi_dump.h" - -#include "draw_vs.h" -#include "draw_vs_aos.h" - -#include "rtasm/rtasm_x86sse.h" - -#ifdef PIPE_ARCH_X86 -#define DISASSEM 0 -#define FAST_MATH 1 - -static const char *files[] = -{ - "NULL", - "CONST", - "IN", - "OUT", - "TEMP", - "SAMP", - "ADDR", - "IMM", - "INTERNAL", -}; - -static INLINE boolean eq( struct x86_reg a, - struct x86_reg b ) -{ - return (a.file == b.file && - a.idx == b.idx && - a.mod == b.mod && - a.disp == b.disp); -} - -struct x86_reg aos_get_x86( struct aos_compilation *cp, - unsigned which_reg, /* quick hack */ - unsigned value ) -{ - struct x86_reg reg; - - if (which_reg == 0) - reg = cp->temp_EBP; - else - reg = cp->tmp_EAX; - - if (cp->x86_reg[which_reg] != value) { - unsigned offset; - - switch (value) { - case X86_IMMEDIATES: - assert(which_reg == 0); - offset = Offset(struct aos_machine, immediates); - break; - case X86_CONSTANTS: - assert(which_reg == 1); - offset = Offset(struct aos_machine, constants); - break; - case X86_BUFFERS: - assert(which_reg == 0); - offset = Offset(struct aos_machine, buffer); - break; - default: - assert(0); - offset = 0; - } - - - x86_mov(cp->func, reg, - x86_make_disp(cp->machine_EDX, offset)); - - cp->x86_reg[which_reg] = value; - } - - return reg; -} - - -static struct x86_reg get_reg_ptr(struct aos_compilation *cp, - unsigned file, - unsigned idx ) -{ - struct x86_reg ptr = cp->machine_EDX; - - switch (file) { - case TGSI_FILE_INPUT: - assert(idx < MAX_INPUTS); - return x86_make_disp(ptr, Offset(struct aos_machine, input[idx])); - - case TGSI_FILE_OUTPUT: - return x86_make_disp(ptr, Offset(struct aos_machine, output[idx])); - - case TGSI_FILE_TEMPORARY: - assert(idx < MAX_TEMPS); - return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx])); - - case AOS_FILE_INTERNAL: - assert(idx < MAX_INTERNALS); - return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx])); - - case TGSI_FILE_IMMEDIATE: - assert(idx < MAX_IMMEDIATES); /* just a sanity check */ - return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float)); - - case TGSI_FILE_CONSTANT: - assert(idx < MAX_CONSTANTS); /* just a sanity check */ - return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float)); - - default: - AOS_ERROR(cp, "unknown reg file"); - return x86_make_reg(0,0); - } -} - - - -#define X87_CW_EXCEPTION_INV_OP (1<<0) -#define X87_CW_EXCEPTION_DENORM_OP (1<<1) -#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2) -#define X87_CW_EXCEPTION_OVERFLOW (1<<3) -#define X87_CW_EXCEPTION_UNDERFLOW (1<<4) -#define X87_CW_EXCEPTION_PRECISION (1<<5) -#define X87_CW_PRECISION_SINGLE (0<<8) -#define X87_CW_PRECISION_RESERVED (1<<8) -#define X87_CW_PRECISION_DOUBLE (2<<8) -#define X87_CW_PRECISION_DOUBLE_EXT (3<<8) -#define X87_CW_PRECISION_MASK (3<<8) -#define X87_CW_ROUND_NEAREST (0<<10) -#define X87_CW_ROUND_DOWN (1<<10) -#define X87_CW_ROUND_UP (2<<10) -#define X87_CW_ROUND_ZERO (3<<10) -#define X87_CW_ROUND_MASK (3<<10) -#define X87_CW_INFINITY (1<<12) - - - - -static void spill( struct aos_compilation *cp, unsigned idx ) -{ - if (!cp->xmm[idx].dirty || - (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */ - cp->xmm[idx].file != TGSI_FILE_OUTPUT && - cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) { - AOS_ERROR(cp, "invalid spill"); - return; - } - else { - struct x86_reg oldval = get_reg_ptr(cp, - cp->xmm[idx].file, - cp->xmm[idx].idx); - - if (0) debug_printf("\nspill %s[%d]", - files[cp->xmm[idx].file], - cp->xmm[idx].idx); - - assert(cp->xmm[idx].dirty); - sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx)); - cp->xmm[idx].dirty = 0; - } -} - - -void aos_spill_all( struct aos_compilation *cp ) -{ - unsigned i; - - for (i = 0; i < 8; i++) { - if (cp->xmm[i].dirty) - spill(cp, i); - aos_release_xmm_reg(cp, i); - } -} - - -static struct x86_reg get_xmm_writable( struct aos_compilation *cp, - struct x86_reg reg ) -{ - if (reg.file != file_XMM || - cp->xmm[reg.idx].file != TGSI_FILE_NULL) - { - struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movaps(cp->func, tmp, reg); - reg = tmp; - } - - cp->xmm[reg.idx].last_used = cp->insn_counter; - return reg; -} - -static struct x86_reg get_xmm( struct aos_compilation *cp, - struct x86_reg reg ) -{ - if (reg.file != file_XMM) - { - struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movaps(cp->func, tmp, reg); - reg = tmp; - } - - cp->xmm[reg.idx].last_used = cp->insn_counter; - return reg; -} - - -/* Allocate an empty xmm register, either as a temporary or later to - * "adopt" as a shader reg. - */ -struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) -{ - unsigned i; - unsigned oldest = 0; - boolean found = FALSE; - - for (i = 0; i < 8; i++) - if (cp->xmm[i].last_used != cp->insn_counter && - cp->xmm[i].file == TGSI_FILE_NULL) { - oldest = i; - found = TRUE; - } - - if (!found) { - for (i = 0; i < 8; i++) - if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) - oldest = i; - } - - /* Need to write out the old value? - */ - if (cp->xmm[oldest].dirty) - spill(cp, oldest); - - assert(cp->xmm[oldest].last_used != cp->insn_counter); - - cp->xmm[oldest].file = TGSI_FILE_NULL; - cp->xmm[oldest].idx = 0; - cp->xmm[oldest].dirty = 0; - cp->xmm[oldest].last_used = cp->insn_counter; - return x86_make_reg(file_XMM, oldest); -} - -void aos_release_xmm_reg( struct aos_compilation *cp, - unsigned idx ) -{ - cp->xmm[idx].file = TGSI_FILE_NULL; - cp->xmm[idx].idx = 0; - cp->xmm[idx].dirty = 0; - cp->xmm[idx].last_used = 0; -} - - -static void aos_soft_release_xmm( struct aos_compilation *cp, - struct x86_reg reg ) -{ - if (reg.file == file_XMM) { - assert(cp->xmm[reg.idx].last_used == cp->insn_counter); - cp->xmm[reg.idx].last_used = cp->insn_counter - 1; - } -} - - - -/* Mark an xmm reg as holding the current copy of a shader reg. - */ -void aos_adopt_xmm_reg( struct aos_compilation *cp, - struct x86_reg reg, - unsigned file, - unsigned idx, - unsigned dirty ) -{ - unsigned i; - - if (reg.file != file_XMM) { - assert(0); - return; - } - - - /* If any xmm reg thinks it holds this shader reg, break the - * illusion. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].file == file && - cp->xmm[i].idx == idx) - { - /* If an xmm reg is already holding this shader reg, take into account its - * dirty flag... - */ - dirty |= cp->xmm[i].dirty; - aos_release_xmm_reg(cp, i); - } - } - - cp->xmm[reg.idx].file = file; - cp->xmm[reg.idx].idx = idx; - cp->xmm[reg.idx].dirty = dirty; - cp->xmm[reg.idx].last_used = cp->insn_counter; -} - - -/* Return a pointer to the in-memory copy of the reg, making sure it is uptodate. - */ -static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, - unsigned file, - unsigned idx ) -{ - unsigned i; - - /* Ensure the in-memory copy of this reg is up-to-date - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].file == file && - cp->xmm[i].idx == idx && - cp->xmm[i].dirty) { - spill(cp, i); - } - } - - return get_reg_ptr( cp, file, idx ); -} - - -/* As above, but return a pointer. Note - this pointer may alias - * those returned by get_arg_ptr(). - */ -static struct x86_reg get_dst_ptr( struct aos_compilation *cp, - const struct tgsi_full_dst_register *dst ) -{ - unsigned file = dst->Register.File; - unsigned idx = dst->Register.Index; - unsigned i; - - - /* Ensure in-memory copy of this reg is up-to-date and invalidate - * any xmm copies. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].file == file && - cp->xmm[i].idx == idx) - { - if (cp->xmm[i].dirty) - spill(cp, i); - - aos_release_xmm_reg(cp, i); - } - } - - return get_reg_ptr( cp, file, idx ); -} - - - - - -/* Return an XMM reg if the argument is resident, otherwise return a - * base+offset pointer to the saved value. - */ -struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, - unsigned file, - unsigned idx ) -{ - unsigned i; - - for (i = 0; i < 8; i++) { - if (cp->xmm[i].file == file && - cp->xmm[i].idx == idx) - { - cp->xmm[i].last_used = cp->insn_counter; - return x86_make_reg(file_XMM, i); - } - } - - /* If not found in the XMM register file, return an indirect - * reference to the in-memory copy: - */ - return get_reg_ptr( cp, file, idx ); -} - - - -static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, - unsigned file, - unsigned idx ) -{ - struct x86_reg reg = get_xmm( cp, - aos_get_shader_reg( cp, file, idx ) ); - - aos_adopt_xmm_reg( cp, - reg, - file, - idx, - FALSE ); - - return reg; -} - - - -struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, - unsigned imm ) -{ - return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm ); -} - - -struct x86_reg aos_get_internal( struct aos_compilation *cp, - unsigned imm ) -{ - return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm ); -} - - - - - -/* Emulate pshufd insn in regular SSE, if necessary: - */ -static void emit_pshufd( struct aos_compilation *cp, - struct x86_reg dst, - struct x86_reg arg0, - ubyte shuf ) -{ - if (cp->have_sse2) { - sse2_pshufd(cp->func, dst, arg0, shuf); - } - else { - if (!eq(dst, arg0)) - sse_movaps(cp->func, dst, arg0); - - sse_shufps(cp->func, dst, dst, shuf); - } -} - -/* load masks (pack into negs??) - * pshufd - shuffle according to writemask - * and - result, mask - * nand - dest, mask - * or - dest, result - */ -static boolean mask_write( struct aos_compilation *cp, - struct x86_reg dst, - struct x86_reg result, - unsigned mask ) -{ - struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); - struct x86_reg tmp = aos_get_xmm_reg(cp); - - emit_pshufd(cp, tmp, imm_swz, - SHUF((mask & 1) ? 2 : 3, - (mask & 2) ? 2 : 3, - (mask & 4) ? 2 : 3, - (mask & 8) ? 2 : 3)); - - sse_andps(cp->func, dst, tmp); - sse_andnps(cp->func, tmp, result); - sse_orps(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - return TRUE; -} - - - - -/* Helper for writemask: - */ -static boolean emit_shuf_copy2( struct aos_compilation *cp, - struct x86_reg dst, - struct x86_reg arg0, - struct x86_reg arg1, - ubyte shuf ) -{ - struct x86_reg tmp = aos_get_xmm_reg(cp); - - emit_pshufd(cp, dst, arg1, shuf); - emit_pshufd(cp, tmp, arg0, shuf); - sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W)); - emit_pshufd(cp, dst, dst, shuf); - - aos_release_xmm_reg(cp, tmp.idx); - return TRUE; -} - - - -#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6)) - - -/* Locate a source register and perform any required (simple) swizzle. - * - * Just fail on complex swizzles at this point. - */ -static struct x86_reg fetch_src( struct aos_compilation *cp, - const struct tgsi_full_src_register *src ) -{ - struct x86_reg arg0 = aos_get_shader_reg(cp, - src->Register.File, - src->Register.Index); - unsigned i; - ubyte swz = 0; - unsigned negs = 0; - unsigned abs = 0; - - for (i = 0; i < 4; i++) { - unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i ); - unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i ); - - swz |= (swizzle & 0x3) << (i * 2); - - switch (neg) { - case TGSI_UTIL_SIGN_TOGGLE: - negs |= (1<<i); - break; - - case TGSI_UTIL_SIGN_KEEP: - break; - - case TGSI_UTIL_SIGN_CLEAR: - abs |= (1<<i); - break; - - default: - AOS_ERROR(cp, "unsupported sign-mode"); - break; - } - } - - if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) { - struct x86_reg dst = aos_get_xmm_reg(cp); - - if (swz != SSE_SWIZZLE_NOOP) - emit_pshufd(cp, dst, arg0, swz); - else - sse_movaps(cp->func, dst, arg0); - - if (negs && negs != 0xf) { - struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); - struct x86_reg tmp = aos_get_xmm_reg(cp); - - /* Load 1,-1,0,0 - * Use neg as arg to pshufd - * Multiply - */ - emit_pshufd(cp, tmp, imm_swz, - SHUF((negs & 1) ? 1 : 0, - (negs & 2) ? 1 : 0, - (negs & 4) ? 1 : 0, - (negs & 8) ? 1 : 0)); - sse_mulps(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - aos_soft_release_xmm(cp, imm_swz); - } - else if (negs) { - struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS); - sse_mulps(cp->func, dst, imm_negs); - aos_soft_release_xmm(cp, imm_negs); - } - - - if (abs && abs != 0xf) { - AOS_ERROR(cp, "unsupported partial abs"); - } - else if (abs) { - struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); - struct x86_reg tmp = aos_get_xmm_reg(cp); - - sse_movaps(cp->func, tmp, dst); - sse_mulps(cp->func, tmp, neg); - sse_maxps(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - aos_soft_release_xmm(cp, neg); - } - - aos_soft_release_xmm(cp, arg0); - return dst; - } - - return arg0; -} - -static void x87_fld_src( struct aos_compilation *cp, - const struct tgsi_full_src_register *src, - unsigned channel ) -{ - struct x86_reg arg0 = aos_get_shader_reg_ptr(cp, - src->Register.File, - src->Register.Index); - - unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel ); - unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel ); - - x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) ); - - switch (neg) { - case TGSI_UTIL_SIGN_TOGGLE: - /* Flip the sign: - */ - x87_fchs( cp->func ); - break; - - case TGSI_UTIL_SIGN_KEEP: - break; - - case TGSI_UTIL_SIGN_CLEAR: - x87_fabs( cp->func ); - break; - - case TGSI_UTIL_SIGN_SET: - x87_fabs( cp->func ); - x87_fchs( cp->func ); - break; - - default: - AOS_ERROR(cp, "unsupported sign-mode"); - break; - } -} - - - - - - -/* Used to implement write masking. This and most of the other instructions - * here would be easier to implement if there had been a translation - * to a 2 argument format (dst/arg0, arg1) at the shader level before - * attempting to translate to x86/sse code. - */ -static void store_dest( struct aos_compilation *cp, - const struct tgsi_full_dst_register *reg, - struct x86_reg result ) -{ - struct x86_reg dst; - - switch (reg->Register.WriteMask) { - case 0: - return; - - case TGSI_WRITEMASK_XYZW: - aos_adopt_xmm_reg(cp, - get_xmm_writable(cp, result), - reg->Register.File, - reg->Register.Index, - TRUE); - return; - default: - break; - } - - dst = aos_get_shader_reg_xmm(cp, - reg->Register.File, - reg->Register.Index); - - switch (reg->Register.WriteMask) { - case TGSI_WRITEMASK_X: - sse_movss(cp->func, dst, get_xmm(cp, result)); - break; - - case TGSI_WRITEMASK_ZW: - sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W)); - break; - - case TGSI_WRITEMASK_XY: - result = get_xmm_writable(cp, result); - sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W)); - dst = result; - break; - - case TGSI_WRITEMASK_YZW: - result = get_xmm_writable(cp, result); - sse_movss(cp->func, result, dst); - dst = result; - break; - - default: - mask_write(cp, dst, result, reg->Register.WriteMask); - break; - } - - aos_adopt_xmm_reg(cp, - dst, - reg->Register.File, - reg->Register.Index, - TRUE); - -} - -static void inject_scalar( struct aos_compilation *cp, - struct x86_reg dst, - struct x86_reg result, - ubyte swizzle ) -{ - sse_shufps(cp->func, dst, dst, swizzle); - sse_movss(cp->func, dst, result); - sse_shufps(cp->func, dst, dst, swizzle); -} - - -static void store_scalar_dest( struct aos_compilation *cp, - const struct tgsi_full_dst_register *reg, - struct x86_reg result ) -{ - unsigned writemask = reg->Register.WriteMask; - struct x86_reg dst; - - if (writemask != TGSI_WRITEMASK_X && - writemask != TGSI_WRITEMASK_Y && - writemask != TGSI_WRITEMASK_Z && - writemask != TGSI_WRITEMASK_W && - writemask != 0) - { - result = get_xmm_writable(cp, result); /* already true, right? */ - sse_shufps(cp->func, result, result, SHUF(X,X,X,X)); - store_dest(cp, reg, result); - return; - } - - result = get_xmm(cp, result); - dst = aos_get_shader_reg_xmm(cp, - reg->Register.File, - reg->Register.Index); - - - - switch (reg->Register.WriteMask) { - case TGSI_WRITEMASK_X: - sse_movss(cp->func, dst, result); - break; - - case TGSI_WRITEMASK_Y: - inject_scalar(cp, dst, result, SHUF(Y, X, Z, W)); - break; - - case TGSI_WRITEMASK_Z: - inject_scalar(cp, dst, result, SHUF(Z, Y, X, W)); - break; - - case TGSI_WRITEMASK_W: - inject_scalar(cp, dst, result, SHUF(W, Y, Z, X)); - break; - - default: - break; - } - - aos_adopt_xmm_reg(cp, - dst, - reg->Register.File, - reg->Register.Index, - TRUE); -} - - - -static void x87_fst_or_nop( struct x86_function *func, - unsigned writemask, - unsigned channel, - struct x86_reg ptr ) -{ - assert(ptr.file == file_REG32); - if (writemask & (1<<channel)) - x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) ); -} - -static void x87_fstp_or_pop( struct x86_function *func, - unsigned writemask, - unsigned channel, - struct x86_reg ptr ) -{ - assert(ptr.file == file_REG32); - if (writemask & (1<<channel)) - x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) ); - else - x87_fstp( func, x86_make_reg( file_x87, 0 )); -} - - - -/* - */ -static void x87_fstp_dest4( struct aos_compilation *cp, - const struct tgsi_full_dst_register *dst ) -{ - struct x86_reg ptr = get_dst_ptr(cp, dst); - unsigned writemask = dst->Register.WriteMask; - - x87_fst_or_nop(cp->func, writemask, 0, ptr); - x87_fst_or_nop(cp->func, writemask, 1, ptr); - x87_fst_or_nop(cp->func, writemask, 2, ptr); - x87_fstp_or_pop(cp->func, writemask, 3, ptr); -} - -/* Save current x87 state and put it into single precision mode. - */ -static void save_fpu_state( struct aos_compilation *cp ) -{ - x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, fpu_restore))); -} - -static void restore_fpu_state( struct aos_compilation *cp ) -{ - x87_fnclex(cp->func); - x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, fpu_restore))); -} - -static void set_fpu_round_neg_inf( struct aos_compilation *cp ) -{ - if (cp->fpucntl != FPU_RND_NEG) { - cp->fpucntl = FPU_RND_NEG; - x87_fnclex(cp->func); - x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, fpu_rnd_neg_inf))); - } -} - -static void set_fpu_round_nearest( struct aos_compilation *cp ) -{ - if (cp->fpucntl != FPU_RND_NEAREST) { - cp->fpucntl = FPU_RND_NEAREST; - x87_fnclex(cp->func); - x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, fpu_rnd_nearest))); - } -} - -#if 0 -static void x87_emit_ex2( struct aos_compilation *cp ) -{ - struct x86_reg st0 = x86_make_reg(file_x87, 0); - struct x86_reg st1 = x86_make_reg(file_x87, 1); - int stack = cp->func->x87_stack; - - /* set_fpu_round_neg_inf( cp ); */ - - x87_fld(cp->func, st0); /* a a */ - x87_fprndint( cp->func ); /* int(a) a*/ - x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */ - x87_fxch(cp->func, st1); /* frc(a) int(a) */ - x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */ - x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */ - x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */ - x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */ - /* 2^a int(a) */ - x87_fstp(cp->func, st1); /* 2^a */ - - assert( stack == cp->func->x87_stack); - -} -#endif - -#if 0 -static void PIPE_CDECL print_reg( const char *msg, - const float *reg ) -{ - debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]); -} -#endif - -#if 0 -static void emit_print( struct aos_compilation *cp, - const char *message, /* must point to a static string! */ - unsigned file, - unsigned idx ) -{ - struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx ); - unsigned i; - - /* There shouldn't be anything on the x87 stack. Can add this - * capacity later if need be. - */ - assert(cp->func->x87_stack == 0); - - /* For absolute correctness, need to spill/invalidate all XMM regs - * too. We're obviously not concerned about performance on this - * debug path, so here goes: - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].dirty) - spill(cp, i); - - aos_release_xmm_reg(cp, i); - } - - /* Push caller-save (ie scratch) regs. - */ - x86_cdecl_caller_push_regs( cp->func ); - - - /* Push the arguments: - */ - x86_lea( cp->func, ecx, arg ); - x86_push( cp->func, ecx ); - x86_push_imm32( cp->func, (int)message ); - - /* Call the helper. Could call debug_printf directly, but - * print_reg is a nice place to put a breakpoint if need be. - */ - x86_mov_reg_imm( cp->func, ecx, (int)print_reg ); - x86_call( cp->func, ecx ); - x86_pop( cp->func, ecx ); - x86_pop( cp->func, ecx ); - - /* Pop caller-save regs - */ - x86_cdecl_caller_pop_regs( cp->func ); - - /* Done... - */ -} -#endif - -/** - * The traditional instructions. All operate on internal registers - * and ignore write masks and swizzling issues. - */ - -static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); - struct x86_reg tmp = aos_get_xmm_reg(cp); - - sse_movaps(cp->func, tmp, arg0); - sse_mulps(cp->func, tmp, neg); - sse_maxps(cp->func, tmp, arg0); - - store_dest(cp, &op->Dst[0], tmp); - return TRUE; -} - -static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_addps(cp->func, dst, arg1); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - x87_fld_src(cp, &op->Src[0], 0); - x87_fcos(cp->func); - x87_fstp_dest4(cp, &op->Dst[0]); - return TRUE; -} - -/* The dotproduct instructions don't really do that well in sse: - * XXX: produces wrong results -- disabled. - */ -static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_mulps(cp->func, dst, arg1); - /* Now the hard bit: sum the first 3 values: - */ - sse_movhlps(cp->func, tmp, dst); - sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ - emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); - sse_addss(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - store_scalar_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_mulps(cp->func, dst, arg1); - - /* Now the hard bit: sum the values: - */ - sse_movhlps(cp->func, tmp, dst); - sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ - emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); - sse_addss(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - store_scalar_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_mulps(cp->func, dst, arg1); - - /* Now the hard bit: sum the values (from DP3): - */ - sse_movhlps(cp->func, tmp, dst); - sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ - emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); - sse_addss(cp->func, dst, tmp); - emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); - sse_addss(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - store_scalar_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); - struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - -/* dst[0] = 1.0 * 1.0F; */ -/* dst[1] = arg0[1] * arg1[1]; */ -/* dst[2] = arg0[2] * 1.0; */ -/* dst[3] = 1.0 * arg1[3]; */ - - emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y)); - emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W)); - sse_mulps(cp->func, dst, tmp); - - aos_release_xmm_reg(cp, tmp.idx); - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - x87_fld1(cp->func); /* 1 */ - x87_fld_src(cp, &op->Src[0], 0); /* a0 1 */ - x87_fyl2x(cp->func); /* log2(a0) */ - x87_fstp_dest4(cp, &op->Dst[0]); - return TRUE; -} - -#if 0 -static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - x87_fld_src(cp, &op->Src[0], 0); - x87_emit_ex2(cp); - x87_fstp_dest4(cp, &op->Dst[0]); - return TRUE; -} -#endif - - -static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); - unsigned writemask = op->Dst[0].Register.WriteMask; - int i; - - set_fpu_round_neg_inf( cp ); - - /* Load all sources first to avoid aliasing - */ - for (i = 3; i >= 0; i--) { - if (writemask & (1<<i)) { - x87_fld_src(cp, &op->Src[0], i); - } - } - - for (i = 0; i < 4; i++) { - if (writemask & (1<<i)) { - x87_fprndint( cp->func ); - x87_fstp(cp->func, x86_make_disp(dst, i*4)); - } - } - - return TRUE; -} - - -static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); - unsigned writemask = op->Dst[0].Register.WriteMask; - int i; - - set_fpu_round_nearest( cp ); - - /* Load all sources first to avoid aliasing - */ - for (i = 3; i >= 0; i--) { - if (writemask & (1<<i)) { - x87_fld_src(cp, &op->Src[0], i); - } - } - - for (i = 0; i < 4; i++) { - if (writemask & (1<<i)) { - x87_fprndint( cp->func ); - x87_fstp(cp->func, x86_make_disp(dst, i*4)); - } - } - - return TRUE; -} - - -static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); - struct x86_reg st0 = x86_make_reg(file_x87, 0); - struct x86_reg st1 = x86_make_reg(file_x87, 1); - unsigned writemask = op->Dst[0].Register.WriteMask; - int i; - - set_fpu_round_neg_inf( cp ); - - /* suck all the source values onto the stack before writing out any - * dst, which may alias... - */ - for (i = 3; i >= 0; i--) { - if (writemask & (1<<i)) { - x87_fld_src(cp, &op->Src[0], i); - } - } - - for (i = 0; i < 4; i++) { - if (writemask & (1<<i)) { - x87_fld(cp->func, st0); /* a a */ - x87_fprndint( cp->func ); /* flr(a) a */ - x87_fsubp(cp->func, st1); /* frc(a) */ - x87_fstp(cp->func, x86_make_disp(dst, i*4)); - } - } - - return TRUE; -} - - - - - - -static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - unsigned writemask = op->Dst[0].Register.WriteMask; - unsigned lit_count = cp->lit_count++; - struct x86_reg result, arg0; - unsigned i; - -#if 1 - /* For absolute correctness, need to spill/invalidate all XMM regs - * too. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].dirty) - spill(cp, i); - aos_release_xmm_reg(cp, i); - } -#endif - - if (writemask != TGSI_WRITEMASK_XYZW) - result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0])); - else - result = get_dst_ptr(cp, &op->Dst[0]); - - - arg0 = fetch_src( cp, &op->Src[0] ); - if (arg0.file == file_XMM) { - struct x86_reg tmp = x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, tmp[1])); - sse_movaps( cp->func, tmp, arg0 ); - arg0 = tmp; - } - - - - /* Push caller-save (ie scratch) regs. - */ - x86_cdecl_caller_push_regs( cp->func ); - - /* Push the arguments: - */ - x86_push_imm32( cp->func, lit_count ); - - x86_lea( cp->func, ecx, arg0 ); - x86_push( cp->func, ecx ); - - x86_lea( cp->func, ecx, result ); - x86_push( cp->func, ecx ); - - x86_push( cp->func, cp->machine_EDX ); - - if (lit_count < MAX_LIT_INFO) { - x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX, - Offset(struct aos_machine, lit_info) + - lit_count * sizeof(struct lit_info) + - Offset(struct lit_info, func))); - } - else { - x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit ); - } - - x86_call( cp->func, ecx ); - - x86_pop( cp->func, ecx ); /* fixme... */ - x86_pop( cp->func, ecx ); - x86_pop( cp->func, ecx ); - x86_pop( cp->func, ecx ); - - x86_cdecl_caller_pop_regs( cp->func ); - - if (writemask != TGSI_WRITEMASK_XYZW) { - store_dest( cp, - &op->Dst[0], - get_xmm_writable( cp, result ) ); - } - - return TRUE; -} - -#if 0 -static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); - unsigned writemask = op->Dst[0].Register.WriteMask; - - if (writemask & TGSI_WRITEMASK_YZ) { - struct x86_reg st1 = x86_make_reg(file_x87, 1); - struct x86_reg st2 = x86_make_reg(file_x87, 2); - - /* a1' = a1 <= 0 ? 1 : a1; - */ - x87_fldz(cp->func); /* 1 0 */ -#if 1 - x87_fld1(cp->func); /* 1 0 */ -#else - /* Correct but slow due to fp exceptions generated in fyl2x - fix me. - */ - x87_fldz(cp->func); /* 1 0 */ -#endif - x87_fld_src(cp, &op->Src[0], 1); /* a1 1 0 */ - x87_fcomi(cp->func, st2); /* a1 1 0 */ - x87_fcmovb(cp->func, st1); /* a1' 1 0 */ - x87_fstp(cp->func, st1); /* a1' 0 */ - x87_fstp(cp->func, st1); /* a1' */ - - x87_fld_src(cp, &op->Src[0], 3); /* a3 a1' */ - x87_fxch(cp->func, st1); /* a1' a3 */ - - - /* Compute pow(a1, a3) - */ - x87_fyl2x(cp->func); /* a3*log2(a1) */ - x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */ - - - /* a0' = max2(a0, 0): - */ - x87_fldz(cp->func); /* 0 r2 */ - x87_fld_src(cp, &op->Src[0], 0); /* a0 0 r2 */ - x87_fcomi(cp->func, st1); - x87_fcmovb(cp->func, st1); /* a0' 0 r2 */ - - x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */ - - x87_fcomi(cp->func, st1); /* a0' 0 r2 */ - x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */ - - x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */ - x87_fpop(cp->func); /* r2 */ - x87_fpop(cp->func); - } - - if (writemask & TGSI_WRITEMASK_XW) { - x87_fld1(cp->func); - x87_fst_or_nop(cp->func, writemask, 0, dst); - x87_fstp_or_pop(cp->func, writemask, 3, dst); - } - - return TRUE; -} -#endif - - - -static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_maxps(cp->func, dst, arg1); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - - -static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_minps(cp->func, dst, arg1); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - /* potentially nothing to do */ - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_mulps(cp->func, dst, arg1); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - - -static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg arg2 = fetch_src(cp, &op->Src[2]); - - /* If we can't clobber old contents of arg0, get a temporary & copy - * it there, then clobber it... - */ - arg0 = get_xmm_writable(cp, arg0); - - sse_mulps(cp->func, arg0, arg1); - sse_addps(cp->func, arg0, arg2); - store_dest(cp, &op->Dst[0], arg0); - return TRUE; -} - - - -/* A wrapper for powf(). - * Makes sure it is cdecl and operates on floats. - */ -static float PIPE_CDECL _powerf( float x, float y ) -{ -#if FAST_MATH - return util_fast_pow(x, y); -#else - return powf( x, y ); -#endif -} - -#if FAST_MATH -static float PIPE_CDECL _exp2(float x) -{ - return util_fast_exp2(x); -} -#endif - - -/* Really not sufficient -- need to check for conditions that could - * generate inf/nan values, which will slow things down hugely. - */ -static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ -#if 0 - x87_fld_src(cp, &op->Src[1], 0); /* a1.x */ - x87_fld_src(cp, &op->Src[0], 0); /* a0.x a1.x */ - x87_fyl2x(cp->func); /* a1*log2(a0) */ - - x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */ - - x87_fstp_dest4(cp, &op->Dst[0]); -#else - uint i; - - /* For absolute correctness, need to spill/invalidate all XMM regs - * too. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].dirty) - spill(cp, i); - aos_release_xmm_reg(cp, i); - } - - /* Push caller-save (ie scratch) regs. - */ - x86_cdecl_caller_push_regs( cp->func ); - - x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) ); - - x87_fld_src( cp, &op->Src[1], 0 ); - x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) ); - x87_fld_src( cp, &op->Src[0], 0 ); - x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); - - /* tmp_EAX has been pushed & will be restored below */ - x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf ); - x86_call( cp->func, cp->tmp_EAX ); - - x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) ); - - x86_cdecl_caller_pop_regs( cp->func ); - - /* Note retval on x87 stack: - */ - cp->func->x87_stack++; - - x87_fstp_dest4( cp, &op->Dst[0] ); -#endif - return TRUE; -} - - -#if FAST_MATH -static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - uint i; - - /* For absolute correctness, need to spill/invalidate all XMM regs - * too. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].dirty) - spill(cp, i); - aos_release_xmm_reg(cp, i); - } - - /* Push caller-save (ie scratch) regs. - */ - x86_cdecl_caller_push_regs( cp->func ); - - x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) ); - - x87_fld_src( cp, &op->Src[0], 0 ); - x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); - - /* tmp_EAX has been pushed & will be restored below */ - x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 ); - x86_call( cp->func, cp->tmp_EAX ); - - x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) ); - - x86_cdecl_caller_pop_regs( cp->func ); - - /* Note retval on x87 stack: - */ - cp->func->x87_stack++; - - x87_fstp_dest4( cp, &op->Dst[0] ); - - return TRUE; -} -#endif - - -static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg dst = aos_get_xmm_reg(cp); - - if (cp->have_sse2) { - sse2_rcpss(cp->func, dst, arg0); - /* extend precision here... - */ - } - else { - struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - sse_movss(cp->func, dst, ones); - sse_divss(cp->func, dst, arg0); - } - - store_scalar_dest(cp, &op->Dst[0], dst); - return TRUE; -} - - -/* Although rsqrtps() and rcpps() are low precision on some/all SSE - * implementations, it is possible to improve its precision at - * fairly low cost, using a newton/raphson step, as below: - * - * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) - * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] - * or: - * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)] - * - * - * See: http://softwarecommunity.intel.com/articles/eng/1818.htm - */ -static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - if (0) { - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg r = aos_get_xmm_reg(cp); - sse_rsqrtss(cp->func, r, arg0); - store_scalar_dest(cp, &op->Dst[0], r); - return TRUE; - } - else { - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg r = aos_get_xmm_reg(cp); - - struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ ); - struct x86_reg one_point_five = x86_make_disp( neg_half, 4 ); - struct x86_reg src = get_xmm_writable( cp, arg0 ); - struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); - struct x86_reg tmp = aos_get_xmm_reg(cp); - - sse_movaps(cp->func, tmp, src); - sse_mulps(cp->func, tmp, neg); - sse_maxps(cp->func, tmp, src); - - sse_rsqrtss( cp->func, r, tmp ); /* rsqrtss(a) */ - sse_mulss( cp->func, tmp, neg_half ); /* -.5 * a */ - sse_mulss( cp->func, tmp, r ); /* -.5 * a * r */ - sse_mulss( cp->func, tmp, r ); /* -.5 * a * r * r */ - sse_addss( cp->func, tmp, one_point_five ); /* 1.5 - .5 * a * r * r */ - sse_mulss( cp->func, r, tmp ); /* r * (1.5 - .5 * a * r * r) */ - - store_scalar_dest(cp, &op->Dst[0], r); - - aos_release_xmm_reg(cp, tmp.idx); - - return TRUE; - } -} - - -static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); - sse_andps(cp->func, dst, ones); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - x87_fld_src(cp, &op->Src[0], 0); - x87_fsin(cp->func); - x87_fstp_dest4(cp, &op->Dst[0]); - return TRUE; -} - - - -static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_cmpps(cp->func, dst, arg1, cc_LessThan); - sse_andps(cp->func, dst, ones); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg dst = get_xmm_writable(cp, arg0); - - sse_subps(cp->func, dst, arg1); - - store_dest(cp, &op->Dst[0], dst); - return TRUE; -} - -static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg tmp0 = aos_get_xmm_reg(cp); - - sse2_cvttps2dq(cp->func, tmp0, arg0); - sse2_cvtdq2ps(cp->func, tmp0, tmp0); - - store_dest(cp, &op->Dst[0], tmp0); - return TRUE; -} - -static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); - struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); - struct x86_reg tmp0 = aos_get_xmm_reg(cp); - struct x86_reg tmp1 = aos_get_xmm_reg(cp); - - emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); - sse_mulps(cp->func, tmp1, arg0); - emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W)); - sse_mulps(cp->func, tmp0, arg1); - sse_subps(cp->func, tmp1, tmp0); - sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); - -/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ -/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */ -/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */ -/* dst[3] is undef */ - - - aos_release_xmm_reg(cp, tmp0.idx); - store_dest(cp, &op->Dst[0], tmp1); - return TRUE; -} - - - -static boolean -emit_instruction( struct aos_compilation *cp, - struct tgsi_full_instruction *inst ) -{ - x87_assert_stack_empty(cp->func); - - switch( inst->Instruction.Opcode ) { - case TGSI_OPCODE_MOV: - return emit_MOV( cp, inst ); - - case TGSI_OPCODE_LIT: - return emit_LIT(cp, inst); - - case TGSI_OPCODE_RCP: - return emit_RCP(cp, inst); - - case TGSI_OPCODE_RSQ: - return emit_RSQ(cp, inst); - - case TGSI_OPCODE_EXP: - /*return emit_EXP(cp, inst);*/ - return FALSE; - - case TGSI_OPCODE_LOG: - /*return emit_LOG(cp, inst);*/ - return FALSE; - - case TGSI_OPCODE_MUL: - return emit_MUL(cp, inst); - - case TGSI_OPCODE_ADD: - return emit_ADD(cp, inst); - - case TGSI_OPCODE_DP3: - return emit_DP3(cp, inst); - - case TGSI_OPCODE_DP4: - return emit_DP4(cp, inst); - - case TGSI_OPCODE_DST: - return emit_DST(cp, inst); - - case TGSI_OPCODE_MIN: - return emit_MIN(cp, inst); - - case TGSI_OPCODE_MAX: - return emit_MAX(cp, inst); - - case TGSI_OPCODE_SLT: - return emit_SLT(cp, inst); - - case TGSI_OPCODE_SGE: - return emit_SGE(cp, inst); - - case TGSI_OPCODE_MAD: - return emit_MAD(cp, inst); - - case TGSI_OPCODE_SUB: - return emit_SUB(cp, inst); - - case TGSI_OPCODE_LRP: - /*return emit_LERP(cp, inst);*/ - return FALSE; - - case TGSI_OPCODE_FRC: - return emit_FRC(cp, inst); - - case TGSI_OPCODE_CLAMP: - /*return emit_CLAMP(cp, inst);*/ - return FALSE; - - case TGSI_OPCODE_FLR: - return emit_FLR(cp, inst); - - case TGSI_OPCODE_ROUND: - return emit_RND(cp, inst); - - case TGSI_OPCODE_EX2: -#if FAST_MATH - return emit_EXPBASE2(cp, inst); -#elif 0 - /* this seems to fail for "larger" exponents. - * See glean tvertProg1's EX2 test. - */ - return emit_EX2(cp, inst); -#else - return FALSE; -#endif - - case TGSI_OPCODE_LG2: - return emit_LG2(cp, inst); - - case TGSI_OPCODE_POW: - return emit_POW(cp, inst); - - case TGSI_OPCODE_XPD: - return emit_XPD(cp, inst); - - case TGSI_OPCODE_ABS: - return emit_ABS(cp, inst); - - case TGSI_OPCODE_DPH: - return emit_DPH(cp, inst); - - case TGSI_OPCODE_COS: - return emit_COS(cp, inst); - - case TGSI_OPCODE_SIN: - return emit_SIN(cp, inst); - - case TGSI_OPCODE_TRUNC: - return emit_TRUNC(cp, inst); - - case TGSI_OPCODE_END: - return TRUE; - - default: - return FALSE; - } -} - - -static boolean emit_viewport( struct aos_compilation *cp ) -{ - struct x86_reg pos = aos_get_shader_reg_xmm(cp, - TGSI_FILE_OUTPUT, - cp->vaos->draw->vs.position_output ); - - struct x86_reg scale = x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, scale)); - - struct x86_reg translate = x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, translate)); - - sse_mulps(cp->func, pos, scale); - sse_addps(cp->func, pos, translate); - - aos_adopt_xmm_reg( cp, - pos, - TGSI_FILE_OUTPUT, - cp->vaos->draw->vs.position_output, - TRUE ); - return TRUE; -} - - -/* This is useful to be able to see the results on softpipe. Doesn't - * do proper clipping, just assumes the backend can do it during - * rasterization -- for debug only... - */ -static boolean emit_rhw_viewport( struct aos_compilation *cp ) -{ - struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg pos = aos_get_shader_reg_xmm(cp, - TGSI_FILE_OUTPUT, - cp->vaos->draw->vs.position_output); - - struct x86_reg scale = x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, scale)); - - struct x86_reg translate = x86_make_disp(cp->machine_EDX, - Offset(struct aos_machine, translate)); - - - - emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W)); - sse2_rcpss(cp->func, tmp, tmp); - sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X)); - - sse_mulps(cp->func, pos, scale); - sse_mulps(cp->func, pos, tmp); - sse_addps(cp->func, pos, translate); - - /* Set pos[3] = w - */ - mask_write(cp, pos, tmp, TGSI_WRITEMASK_W); - - aos_adopt_xmm_reg( cp, - pos, - TGSI_FILE_OUTPUT, - cp->vaos->draw->vs.position_output, - TRUE ); - return TRUE; -} - - -#if 0 -static boolean note_immediate( struct aos_compilation *cp, - struct tgsi_full_immediate *imm ) -{ - unsigned pos = cp->num_immediates++; - unsigned j; - - assert( imm->Immediate.NrTokens <= 4 + 1 ); - for (j = 0; j < imm->Immediate.NrTokens - 1; j++) { - cp->vaos->machine->immediate[pos][j] = imm->u[j].Float; - } - - return TRUE; -} -#endif - - - - -static void find_last_write_outputs( struct aos_compilation *cp ) -{ - struct tgsi_parse_context parse; - unsigned this_instruction = 0; - unsigned i; - - tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens ); - - while (!tgsi_parse_end_of_tokens( &parse )) { - - tgsi_parse_token( &parse ); - - if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) - continue; - - for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) { - if (parse.FullToken.FullInstruction.Dst[i].Register.File == - TGSI_FILE_OUTPUT) - { - unsigned idx = parse.FullToken.FullInstruction.Dst[i].Register.Index; - cp->output_last_write[idx] = this_instruction; - } - } - - this_instruction++; - } - - tgsi_parse_free( &parse ); -} - - -#define ARG_MACHINE 1 -#define ARG_START_ELTS 2 -#define ARG_COUNT 3 -#define ARG_OUTBUF 4 - - -static boolean build_vertex_program( struct draw_vs_variant_aos_sse *variant, - boolean linear ) -{ - struct tgsi_parse_context parse; - struct aos_compilation cp; - unsigned fixup, label; - - util_init_math(); - - tgsi_parse_init( &parse, variant->base.vs->state.tokens ); - - memset(&cp, 0, sizeof(cp)); - - cp.insn_counter = 1; - cp.vaos = variant; - cp.have_sse2 = 1; - cp.func = &variant->func[ linear ? 0 : 1 ]; - - cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX); - cp.idx_EBX = x86_make_reg(file_REG32, reg_BX); - cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX); - cp.machine_EDX = x86_make_reg(file_REG32, reg_DX); - cp.count_ESI = x86_make_reg(file_REG32, reg_SI); - cp.temp_EBP = x86_make_reg(file_REG32, reg_BP); - cp.stack_ESP = x86_make_reg( file_REG32, reg_SP ); - - x86_init_func(cp.func); - - find_last_write_outputs(&cp); - - x86_push(cp.func, cp.idx_EBX); - x86_push(cp.func, cp.count_ESI); - x86_push(cp.func, cp.temp_EBP); - - - /* Load arguments into regs: - */ - x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE)); - x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS)); - x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT)); - x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF)); - - - /* Compare count to zero and possibly bail. - */ - x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX); - x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX); - fixup = x86_jcc_forward(cp.func, cc_E); - - - save_fpu_state( &cp ); - set_fpu_round_nearest( &cp ); - - aos_init_inputs( &cp, linear ); - - cp.x86_reg[0] = 0; - cp.x86_reg[1] = 0; - - /* Note address for loop jump - */ - label = x86_get_label(cp.func); - { - /* Fetch inputs... TODO: fetch lazily... - */ - if (!aos_fetch_inputs( &cp, linear )) - goto fail; - - /* Emit the shader: - */ - while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error ) - { - tgsi_parse_token( &parse ); - - switch (parse.FullToken.Token.Type) { - case TGSI_TOKEN_TYPE_IMMEDIATE: -#if 0 - if (!note_immediate( &cp, &parse.FullToken.FullImmediate )) - goto fail; -#endif - break; - - case TGSI_TOKEN_TYPE_INSTRUCTION: - if (DISASSEM) - tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter ); - - if (!emit_instruction( &cp, &parse.FullToken.FullInstruction )) - goto fail; - break; - } - - x87_assert_stack_empty(cp.func); - cp.insn_counter++; - - if (DISASSEM) - debug_printf("\n"); - } - - - { - unsigned i; - for (i = 0; i < 8; i++) { - if (cp.xmm[i].file != TGSI_FILE_OUTPUT) { - cp.xmm[i].file = TGSI_FILE_NULL; - cp.xmm[i].dirty = 0; - } - } - } - - if (cp.error) - goto fail; - - if (cp.vaos->base.key.clip) { - /* not really handling clipping, just do the rhw so we can - * see the results... - */ - emit_rhw_viewport(&cp); - } - else if (cp.vaos->base.key.viewport) { - emit_viewport(&cp); - } - - /* Emit output... TODO: do this eagerly after the last write to a - * given output. - */ - if (!aos_emit_outputs( &cp )) - goto fail; - - - /* Next vertex: - */ - x86_lea(cp.func, - cp.outbuf_ECX, - x86_make_disp(cp.outbuf_ECX, - cp.vaos->base.key.output_stride)); - - /* Incr index - */ - aos_incr_inputs( &cp, linear ); - } - /* decr count, loop if not zero - */ - x86_dec(cp.func, cp.count_ESI); - x86_jcc(cp.func, cc_NZ, label); - - restore_fpu_state(&cp); - - /* Land forward jump here: - */ - x86_fixup_fwd_jump(cp.func, fixup); - - /* Exit mmx state? - */ - if (cp.func->need_emms) - mmx_emms(cp.func); - - x86_pop(cp.func, cp.temp_EBP); - x86_pop(cp.func, cp.count_ESI); - x86_pop(cp.func, cp.idx_EBX); - - x87_assert_stack_empty(cp.func); - x86_ret(cp.func); - - tgsi_parse_free( &parse ); - return !cp.error; - - fail: - tgsi_parse_free( &parse ); - return FALSE; -} - - -/** cast wrapper */ -static INLINE struct draw_vs_variant_aos_sse * -draw_vs_variant_aos_sse(struct draw_vs_variant *variant) -{ - return (struct draw_vs_variant_aos_sse *) variant; -} - - -static void vaos_set_buffer( struct draw_vs_variant *variant, - unsigned buf, - const void *ptr, - unsigned stride, - unsigned max_stride) -{ - struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); - - if (buf < vaos->nr_vb) { - vaos->buffer[buf].base_ptr = (char *)ptr; - vaos->buffer[buf].stride = stride; - } - - if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride); -} - - - -static void PIPE_CDECL vaos_run_elts( struct draw_vs_variant *variant, - const unsigned *elts, - unsigned count, - void *output_buffer ) -{ - struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); - struct aos_machine *machine = vaos->draw->vs.aos_machine; - unsigned i; - - if (0) debug_printf("%s %d\n", __FUNCTION__, count); - - machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; - for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { - machine->constants[i] = vaos->draw->vs.aligned_constants[i]; - } - machine->immediates = vaos->base.vs->immediates; - machine->buffer = vaos->buffer; - - vaos->gen_run_elts( machine, - elts, - count, - output_buffer ); -} - -static void PIPE_CDECL vaos_run_linear( struct draw_vs_variant *variant, - unsigned start, - unsigned count, - void *output_buffer ) -{ - struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); - struct aos_machine *machine = vaos->draw->vs.aos_machine; - unsigned i; - - if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, - vaos->base.key.const_vbuffers); - - machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; - for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { - machine->constants[i] = vaos->draw->vs.aligned_constants[i]; - } - machine->immediates = vaos->base.vs->immediates; - machine->buffer = vaos->buffer; - - vaos->gen_run_linear( machine, - start, - count, - output_buffer ); - - /* Sanity spot checks to make sure we didn't trash our constants */ - assert(machine->internal[IMM_ONES][0] == 1.0f); - assert(machine->internal[IMM_IDENTITY][0] == 0.0f); - assert(machine->internal[IMM_NEGS][0] == -1.0f); -} - - - -static void vaos_destroy( struct draw_vs_variant *variant ) -{ - struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); - - FREE( vaos->buffer ); - - x86_release_func( &vaos->func[0] ); - x86_release_func( &vaos->func[1] ); - - FREE(vaos); -} - - - -static struct draw_vs_variant *variant_aos_sse( struct draw_vertex_shader *vs, - const struct draw_vs_variant_key *key ) -{ - unsigned i; - struct draw_vs_variant_aos_sse *vaos = CALLOC_STRUCT(draw_vs_variant_aos_sse); - - if (!vaos) - goto fail; - - vaos->base.key = *key; - vaos->base.vs = vs; - vaos->base.set_buffer = vaos_set_buffer; - vaos->base.destroy = vaos_destroy; - vaos->base.run_linear = vaos_run_linear; - vaos->base.run_elts = vaos_run_elts; - - vaos->draw = vs->draw; - - for (i = 0; i < key->nr_inputs; i++) - vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 ); - - vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) ); - if (!vaos->buffer) - goto fail; - - if (0) - debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers); - -#if 0 - tgsi_dump(vs->state.tokens, 0); -#endif - - if (!build_vertex_program( vaos, TRUE )) - goto fail; - - if (!build_vertex_program( vaos, FALSE )) - goto fail; - - vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]); - if (!vaos->gen_run_linear) - goto fail; - - vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]); - if (!vaos->gen_run_elts) - goto fail; - - return &vaos->base; - - fail: - if (vaos && vaos->buffer) - FREE(vaos->buffer); - - if (vaos) - x86_release_func( &vaos->func[0] ); - - if (vaos) - x86_release_func( &vaos->func[1] ); - - FREE(vaos); - - return NULL; -} - - -struct draw_vs_variant * -draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs, - const struct draw_vs_variant_key *key ) -{ - struct draw_vs_variant *variant = variant_aos_sse( vs, key ); - - if (variant == NULL) { - variant = draw_vs_create_variant_generic( vs, key ); - } - - return variant; -} - - - -#endif /* PIPE_ARCH_X86 */ diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h deleted file mode 100644 index 55e63d8b9fa..00000000000 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ /dev/null @@ -1,255 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* Authors: Keith Whitwell <[email protected]> - */ - -#ifndef DRAW_VS_AOS_H -#define DRAW_VS_AOS_H - -#include "pipe/p_config.h" -#include "tgsi/tgsi_exec.h" -#include "draw_vs.h" - -#ifdef PIPE_ARCH_X86 - -struct tgsi_token; -struct x86_function; - -#include "pipe/p_state.h" -#include "rtasm/rtasm_x86sse.h" - - - - - -#define X 0 -#define Y 1 -#define Z 2 -#define W 3 - -#define MAX_INPUTS PIPE_MAX_ATTRIBS -#define MAX_OUTPUTS PIPE_MAX_SHADER_OUTPUTS -#define MAX_TEMPS TGSI_EXEC_NUM_TEMPS -#define MAX_CONSTANTS 1024 /** only used for sanity checking */ -#define MAX_IMMEDIATES 1024 /** only used for sanity checking */ -#define MAX_INTERNALS 8 /** see IMM_x values below */ - -#define AOS_FILE_INTERNAL TGSI_FILE_COUNT - -#define FPU_RND_NEG 1 -#define FPU_RND_NEAREST 2 - -struct aos_machine; -typedef void (PIPE_CDECL *lit_func)( struct aos_machine *, - float *result, - const float *in, - unsigned count ); - -void PIPE_CDECL aos_do_lit( struct aos_machine *machine, - float *result, - const float *in, - unsigned count ); - -struct shine_tab { - float exponent; - float values[258]; - unsigned last_used; -}; - -struct lit_info { - lit_func func; - struct shine_tab *shine_tab; -}; - -#define MAX_SHINE_TAB 4 -#define MAX_LIT_INFO 16 - -struct aos_buffer { - const void *base_ptr; - unsigned stride; - void *ptr; /* updated per vertex */ -}; - - - - -/* This is the temporary storage used by all the aos_sse vs variants. - * Create one per context and reuse by passing a pointer in at - * vs_variant creation?? - */ -struct aos_machine { - float input [MAX_INPUTS ][4]; - float output [MAX_OUTPUTS ][4]; - float temp [MAX_TEMPS ][4]; - float internal [MAX_INTERNALS ][4]; - - float scale[4]; /* viewport */ - float translate[4]; /* viewport */ - - float tmp[2][4]; /* scratch space for LIT */ - - struct shine_tab shine_tab[MAX_SHINE_TAB]; - struct lit_info lit_info[MAX_LIT_INFO]; - unsigned now; - - - ushort fpu_rnd_nearest; - ushort fpu_rnd_neg_inf; - ushort fpu_restore; - ushort fpucntl; /* one of FPU_* above */ - - const float (*immediates)[4]; /* points to shader data */ - const void *constants[PIPE_MAX_CONSTANT_BUFFERS]; /* points to draw data */ - - const struct aos_buffer *buffer; /* points to ? */ -}; - - - - -struct aos_compilation { - struct x86_function *func; - struct draw_vs_variant_aos_sse *vaos; - - unsigned insn_counter; - unsigned num_immediates; - unsigned count; - unsigned lit_count; - - struct { - unsigned idx:16; - unsigned file:8; - unsigned dirty:8; - unsigned last_used; - } xmm[8]; - - unsigned x86_reg[2]; /* one of X86_* */ - - boolean input_fetched[PIPE_MAX_ATTRIBS]; - unsigned output_last_write[PIPE_MAX_ATTRIBS]; - - boolean have_sse2; - boolean error; - short fpucntl; - - /* these are actually known values, but putting them in a struct - * like this is helpful to keep them in sync across the file. - */ - struct x86_reg tmp_EAX; - struct x86_reg idx_EBX; /* either start+i or &elt[i] */ - struct x86_reg outbuf_ECX; - struct x86_reg machine_EDX; - struct x86_reg count_ESI; /* decrements to zero */ - struct x86_reg temp_EBP; - struct x86_reg stack_ESP; -}; - -struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ); -void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx ); - -void aos_adopt_xmm_reg( struct aos_compilation *cp, - struct x86_reg reg, - unsigned file, - unsigned idx, - unsigned dirty ); - -void aos_spill_all( struct aos_compilation *cp ); - -struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, - unsigned file, - unsigned idx ); - -boolean aos_init_inputs( struct aos_compilation *cp, boolean linear ); -boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ); -boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear ); - -boolean aos_emit_outputs( struct aos_compilation *cp ); - - -#define IMM_ONES 0 /* 1, 1,1,1 */ -#define IMM_SWZ 1 /* 1,-1,0, 0xffffffff */ -#define IMM_IDENTITY 2 /* 0, 0,0,1 */ -#define IMM_INV_255 3 /* 1/255, 1/255, 1/255, 1/255 */ -#define IMM_255 4 /* 255, 255, 255, 255 */ -#define IMM_NEGS 5 /* -1,-1,-1,-1 */ -#define IMM_RSQ 6 /* -.5,1.5,_,_ */ -#define IMM_PSIZE 7 /* not really an immediate - updated each run */ - -struct x86_reg aos_get_internal( struct aos_compilation *cp, - unsigned imm ); -struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, - unsigned imm ); - - -#define AOS_ERROR(cp, msg) \ -do { \ - if (0) debug_printf("%s: x86 translation failed: %s\n", __FUNCTION__, msg); \ - cp->error = 1; \ -} while (0) - - -#define X86_NULL 0 -#define X86_IMMEDIATES 1 -#define X86_CONSTANTS 2 -#define X86_BUFFERS 3 - -struct x86_reg aos_get_x86( struct aos_compilation *cp, - unsigned which_reg, - unsigned value ); - - -typedef void (PIPE_CDECL *vaos_run_elts_func)( struct aos_machine *, - const unsigned *elts, - unsigned count, - void *output_buffer); - -typedef void (PIPE_CDECL *vaos_run_linear_func)( struct aos_machine *, - unsigned start, - unsigned count, - void *output_buffer); - - -struct draw_vs_variant_aos_sse { - struct draw_vs_variant base; - struct draw_context *draw; - - struct aos_buffer *buffer; - unsigned nr_vb; - - vaos_run_linear_func gen_run_linear; - vaos_run_elts_func gen_run_elts; - - - struct x86_function func[2]; -}; - - -#endif - -#endif - diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c deleted file mode 100644 index f1dd4487732..00000000000 --- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c +++ /dev/null @@ -1,460 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include "util/u_memory.h" -#include "pipe/p_shader_tokens.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_util.h" -#include "tgsi/tgsi_exec.h" -#include "draw_vs.h" -#include "draw_vs_aos.h" -#include "draw_vertex.h" - -#include "rtasm/rtasm_x86sse.h" - -#ifdef PIPE_ARCH_X86 - -/* Note - don't yet have to worry about interacting with the code in - * draw_vs_aos.c as there is no intermingling of generated code... - * That may have to change, we'll see. - */ -static void emit_load_R32G32B32A32( struct aos_compilation *cp, - struct x86_reg data, - struct x86_reg src_ptr ) -{ - sse_movups(cp->func, data, src_ptr); -} - -static void emit_load_R32G32B32( struct aos_compilation *cp, - struct x86_reg data, - struct x86_reg src_ptr ) -{ -#if 1 - sse_movss(cp->func, data, x86_make_disp(src_ptr, 8)); - /* data = z ? ? ? */ - sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) ); - /* data = z ? 0 1 */ - sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) ); - /* data = ? 0 z 1 */ - sse_movlps(cp->func, data, src_ptr); - /* data = x y z 1 */ -#else - sse_movups(cp->func, data, src_ptr); - /* data = x y z ? */ - sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) ); - /* data = ? x y z */ - sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) ); - /* data = 1 x y z */ - sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) ); - /* data = x y z 1 */ -#endif -} - -static void emit_load_R32G32( struct aos_compilation *cp, - struct x86_reg data, - struct x86_reg src_ptr ) -{ - sse_movups(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) ); - sse_movlps(cp->func, data, src_ptr); -} - - -static void emit_load_R32( struct aos_compilation *cp, - struct x86_reg data, - struct x86_reg src_ptr ) -{ - sse_movss(cp->func, data, src_ptr); - sse_orps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) ); -} - - -static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp, - struct x86_reg data, - struct x86_reg src_ptr ) -{ - sse_movss(cp->func, data, src_ptr); - sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); - sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); - sse2_cvtdq2ps(cp->func, data, data); - sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255)); -} - - - -/* Extended swizzles? Maybe later. - */ -static void emit_swizzle( struct aos_compilation *cp, - struct x86_reg dest, - struct x86_reg src, - ubyte shuffle ) -{ - sse_shufps(cp->func, dest, src, shuffle); -} - - - -static boolean get_buffer_ptr( struct aos_compilation *cp, - boolean linear, - unsigned buf_idx, - struct x86_reg elt, - struct x86_reg ptr) -{ - struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), - buf_idx * sizeof(struct aos_buffer)); - - struct x86_reg buf_stride = x86_make_disp(buf, - Offset(struct aos_buffer, stride)); - if (linear) { - struct x86_reg buf_ptr = x86_make_disp(buf, - Offset(struct aos_buffer, ptr)); - - - /* Calculate pointer to current attrib: - */ - x86_mov(cp->func, ptr, buf_ptr); - x86_mov(cp->func, elt, buf_stride); - x86_add(cp->func, elt, ptr); - if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192)); - x86_mov(cp->func, buf_ptr, elt); - } - else { - struct x86_reg buf_base_ptr = x86_make_disp(buf, - Offset(struct aos_buffer, base_ptr)); - - - /* Calculate pointer to current attrib: - */ - x86_mov(cp->func, ptr, buf_stride); - x86_imul(cp->func, ptr, elt); - x86_add(cp->func, ptr, buf_base_ptr); - } - - cp->insn_counter++; - - return TRUE; -} - - -static boolean load_input( struct aos_compilation *cp, - unsigned idx, - struct x86_reg bufptr ) -{ - unsigned format = cp->vaos->base.key.element[idx].in.format; - unsigned offset = cp->vaos->base.key.element[idx].in.offset; - struct x86_reg dataXMM = aos_get_xmm_reg(cp); - - /* Figure out source pointer address: - */ - struct x86_reg src = x86_make_disp(bufptr, offset); - - aos_adopt_xmm_reg( cp, - dataXMM, - TGSI_FILE_INPUT, - idx, - TRUE ); - - switch (format) { - case PIPE_FORMAT_R32_FLOAT: - emit_load_R32(cp, dataXMM, src); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_load_R32G32(cp, dataXMM, src); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_load_R32G32B32(cp, dataXMM, src); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_load_R32G32B32A32(cp, dataXMM, src); - break; - case PIPE_FORMAT_A8R8G8B8_UNORM: - emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); - emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); - break; - default: - AOS_ERROR(cp, "unhandled input format"); - return FALSE; - } - - return TRUE; -} - -static boolean load_inputs( struct aos_compilation *cp, - unsigned buffer, - struct x86_reg ptr ) -{ - unsigned i; - - for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) { - if (cp->vaos->base.key.element[i].in.buffer == buffer) { - - if (!load_input( cp, i, ptr )) - return FALSE; - - cp->insn_counter++; - } - } - - return TRUE; -} - -boolean aos_init_inputs( struct aos_compilation *cp, boolean linear ) -{ - unsigned i; - for (i = 0; i < cp->vaos->nr_vb; i++) { - struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), - i * sizeof(struct aos_buffer)); - - struct x86_reg buf_base_ptr = x86_make_disp(buf, - Offset(struct aos_buffer, base_ptr)); - - if (cp->vaos->base.key.const_vbuffers & (1<<i)) { - struct x86_reg ptr = cp->tmp_EAX; - - x86_mov(cp->func, ptr, buf_base_ptr); - - /* Load all inputs for this constant vertex buffer - */ - load_inputs( cp, i, x86_deref(ptr) ); - - /* Then just force them out to aos_machine.input[] - */ - aos_spill_all( cp ); - - } - else if (linear) { - - struct x86_reg elt = cp->idx_EBX; - struct x86_reg ptr = cp->tmp_EAX; - - struct x86_reg buf_stride = x86_make_disp(buf, - Offset(struct aos_buffer, stride)); - - struct x86_reg buf_ptr = x86_make_disp(buf, - Offset(struct aos_buffer, ptr)); - - - /* Calculate pointer to current attrib: - */ - x86_mov(cp->func, ptr, buf_stride); - x86_imul(cp->func, ptr, elt); - x86_add(cp->func, ptr, buf_base_ptr); - - - /* In the linear case, keep the buffer pointer instead of the - * index number. - */ - if (cp->vaos->nr_vb == 1) - x86_mov( cp->func, elt, ptr ); - else - x86_mov( cp->func, buf_ptr, ptr ); - - cp->insn_counter++; - } - } - - return TRUE; -} - -boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) -{ - unsigned j; - - for (j = 0; j < cp->vaos->nr_vb; j++) { - if (cp->vaos->base.key.const_vbuffers & (1<<j)) { - /* just retreive pre-transformed input */ - } - else if (linear && cp->vaos->nr_vb == 1) { - load_inputs( cp, 0, cp->idx_EBX ); - } - else { - struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX); - struct x86_reg ptr = cp->tmp_EAX; - - if (!get_buffer_ptr( cp, linear, j, elt, ptr )) - return FALSE; - - if (!load_inputs( cp, j, ptr )) - return FALSE; - } - } - - return TRUE; -} - -boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear ) -{ - if (linear && cp->vaos->nr_vb == 1) { - struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), - (0 * sizeof(struct aos_buffer) + - Offset(struct aos_buffer, stride))); - - x86_add(cp->func, cp->idx_EBX, stride); - sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192)); - } - else if (linear) { - /* Nothing to do */ - } - else { - x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4)); - } - - return TRUE; -} - - - - - - -static void emit_store_R32G32B32A32( struct aos_compilation *cp, - struct x86_reg dst_ptr, - struct x86_reg dataXMM ) -{ - sse_movups(cp->func, dst_ptr, dataXMM); -} - -static void emit_store_R32G32B32( struct aos_compilation *cp, - struct x86_reg dst_ptr, - struct x86_reg dataXMM ) -{ - sse_movlps(cp->func, dst_ptr, dataXMM); - sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ - sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM); -} - -static void emit_store_R32G32( struct aos_compilation *cp, - struct x86_reg dst_ptr, - struct x86_reg dataXMM ) -{ - sse_movlps(cp->func, dst_ptr, dataXMM); -} - -static void emit_store_R32( struct aos_compilation *cp, - struct x86_reg dst_ptr, - struct x86_reg dataXMM ) -{ - sse_movss(cp->func, dst_ptr, dataXMM); -} - - - -static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp, - struct x86_reg dst_ptr, - struct x86_reg dataXMM ) -{ - sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255)); - sse2_cvtps2dq(cp->func, dataXMM, dataXMM); - sse2_packssdw(cp->func, dataXMM, dataXMM); - sse2_packuswb(cp->func, dataXMM, dataXMM); - sse_movss(cp->func, dst_ptr, dataXMM); -} - - - - - -static boolean emit_output( struct aos_compilation *cp, - struct x86_reg ptr, - struct x86_reg dataXMM, - enum attrib_emit format ) -{ - switch (format) { - case EMIT_1F: - case EMIT_1F_PSIZE: - emit_store_R32(cp, ptr, dataXMM); - break; - case EMIT_2F: - emit_store_R32G32(cp, ptr, dataXMM); - break; - case EMIT_3F: - emit_store_R32G32B32(cp, ptr, dataXMM); - break; - case EMIT_4F: - emit_store_R32G32B32A32(cp, ptr, dataXMM); - break; - case EMIT_4UB: - emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); - break; - case EMIT_4UB_BGRA: - emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); - break; - default: - AOS_ERROR(cp, "unhandled output format"); - return FALSE; - } - - return TRUE; -} - - - -boolean aos_emit_outputs( struct aos_compilation *cp ) -{ - unsigned i; - - for (i = 0; i < cp->vaos->base.key.nr_outputs; i++) { - enum attrib_emit format = cp->vaos->base.key.element[i].out.format; - unsigned offset = cp->vaos->base.key.element[i].out.offset; - unsigned vs_output = cp->vaos->base.key.element[i].out.vs_output; - - struct x86_reg data; - - if (format == EMIT_1F_PSIZE) { - data = aos_get_internal_xmm( cp, IMM_PSIZE ); - } - else { - data = aos_get_shader_reg( cp, - TGSI_FILE_OUTPUT, - vs_output ); - } - - if (data.file != file_XMM) { - struct x86_reg tmp = aos_get_xmm_reg( cp ); - sse_movaps(cp->func, tmp, data); - data = tmp; - } - - if (!emit_output( cp, - x86_make_disp( cp->outbuf_ECX, offset ), - data, - format )) - return FALSE; - - aos_release_xmm_reg( cp, data.idx ); - - cp->insn_counter++; - } - - return TRUE; -} - -#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c deleted file mode 100644 index 0eda414ee6a..00000000000 --- a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c +++ /dev/null @@ -1,328 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -#include "pipe/p_config.h" - - -#include "pipe/p_shader_tokens.h" -#include "util/u_math.h" -#include "util/u_memory.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_util.h" -#include "tgsi/tgsi_exec.h" -#include "draw_vs.h" -#include "draw_vs_aos.h" -#include "draw_vertex.h" - -#ifdef PIPE_ARCH_X86 - -#include "rtasm/rtasm_x86sse.h" - - -#define X87_CW_EXCEPTION_INV_OP (1<<0) -#define X87_CW_EXCEPTION_DENORM_OP (1<<1) -#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2) -#define X87_CW_EXCEPTION_OVERFLOW (1<<3) -#define X87_CW_EXCEPTION_UNDERFLOW (1<<4) -#define X87_CW_EXCEPTION_PRECISION (1<<5) -#define X87_CW_PRECISION_SINGLE (0<<8) -#define X87_CW_PRECISION_RESERVED (1<<8) -#define X87_CW_PRECISION_DOUBLE (2<<8) -#define X87_CW_PRECISION_DOUBLE_EXT (3<<8) -#define X87_CW_PRECISION_MASK (3<<8) -#define X87_CW_ROUND_NEAREST (0<<10) -#define X87_CW_ROUND_DOWN (1<<10) -#define X87_CW_ROUND_UP (2<<10) -#define X87_CW_ROUND_ZERO (3<<10) -#define X87_CW_ROUND_MASK (3<<10) -#define X87_CW_INFINITY (1<<12) - - -void PIPE_CDECL aos_do_lit( struct aos_machine *machine, - float *result, - const float *in, - unsigned count ) -{ - if (in[0] > 0) - { - if (in[1] <= 0.0) - { - result[0] = 1.0F; - result[1] = in[0]; - result[2] = 0.0F; - result[3] = 1.0F; - } - else - { - const float epsilon = 1.0F / 256.0F; - float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon)); - result[0] = 1.0F; - result[1] = in[0]; - result[2] = powf(in[1], exponent); - result[3] = 1.0; - } - } - else - { - result[0] = 1.0F; - result[1] = 0.0; - result[2] = 0.0; - result[3] = 1.0F; - } -} - - -static void PIPE_CDECL do_lit_lut( struct aos_machine *machine, - float *result, - const float *in, - unsigned count ) -{ - if (in[0] > 0) - { - if (in[1] <= 0.0) - { - result[0] = 1.0F; - result[1] = in[0]; - result[2] = 0.0F; - result[3] = 1.0F; - return; - } - - if (machine->lit_info[count].shine_tab->exponent != in[3]) { - machine->lit_info[count].func = aos_do_lit; - goto no_luck; - } - - if (in[1] <= 1.0) - { - const float *tab = machine->lit_info[count].shine_tab->values; - float f = in[1] * 256; - int k = (int)f; - float frac = f - (float)k; - - result[0] = 1.0F; - result[1] = in[0]; - result[2] = tab[k] + frac*(tab[k+1]-tab[k]); - result[3] = 1.0; - return; - } - - no_luck: - { - const float epsilon = 1.0F / 256.0F; - float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon)); - result[0] = 1.0F; - result[1] = in[0]; - result[2] = powf(in[1], exponent); - result[3] = 1.0; - } - } - else - { - result[0] = 1.0F; - result[1] = 0.0; - result[2] = 0.0; - result[3] = 1.0F; - } -} - - -static void do_populate_lut( struct shine_tab *tab, - float unclamped_exponent ) -{ - const float epsilon = 1.0F / 256.0F; - float exponent = CLAMP(unclamped_exponent, -(128.0F - epsilon), (128.0F - epsilon)); - unsigned i; - - tab->exponent = unclamped_exponent; /* for later comparison */ - - tab->values[0] = 0; - if (exponent == 0) { - for (i = 1; i < 258; i++) { - tab->values[i] = 1.0; - } - } - else { - for (i = 1; i < 258; i++) { - tab->values[i] = powf((float)i * epsilon, exponent); - } - } -} - - - - -static void PIPE_CDECL populate_lut( struct aos_machine *machine, - float *result, - const float *in, - unsigned count ) -{ - unsigned i, tab; - - /* Search for an existing table for this value. Note that without - * static analysis we don't really know if in[3] will be constant, - * but it usually is... - */ - for (tab = 0; tab < 4; tab++) { - if (machine->shine_tab[tab].exponent == in[3]) { - goto found; - } - } - - for (tab = 0, i = 1; i < 4; i++) { - if (machine->shine_tab[i].last_used < machine->shine_tab[tab].last_used) - tab = i; - } - - if (machine->shine_tab[tab].last_used == machine->now) { - /* No unused tables (this is not a ffvertex program...). Just - * call pow each time: - */ - machine->lit_info[count].func = aos_do_lit; - machine->lit_info[count].func( machine, result, in, count ); - return; - } - else { - do_populate_lut( &machine->shine_tab[tab], in[3] ); - } - - found: - machine->shine_tab[tab].last_used = machine->now; - machine->lit_info[count].shine_tab = &machine->shine_tab[tab]; - machine->lit_info[count].func = do_lit_lut; - machine->lit_info[count].func( machine, result, in, count ); -} - - -void -draw_vs_aos_machine_constants(struct aos_machine *machine, - unsigned slot, - const void *constants) -{ - machine->constants[slot] = constants; - - { - unsigned i; - for (i = 0; i < MAX_LIT_INFO; i++) { - machine->lit_info[i].func = populate_lut; - machine->now++; - } - } -} - - -void draw_vs_aos_machine_viewport( struct aos_machine *machine, - const struct pipe_viewport_state *viewport ) -{ - memcpy(machine->scale, viewport->scale, 4 * sizeof(float)); - memcpy(machine->translate, viewport->translate, 4 * sizeof(float)); -} - - - -void draw_vs_aos_machine_destroy( struct aos_machine *machine ) -{ - align_free(machine); -} - -struct aos_machine *draw_vs_aos_machine( void ) -{ - struct aos_machine *machine; - unsigned i; - float inv = 1.0f/255.0f; - float f255 = 255.0f; - - machine = align_malloc(sizeof(struct aos_machine), 16); - if (!machine) - return NULL; - - memset(machine, 0, sizeof(*machine)); - - ASSIGN_4V(machine->internal[IMM_SWZ], 1.0f, -1.0f, 0.0f, 1.0f); - *(unsigned *)&machine->internal[IMM_SWZ][3] = 0xffffffff; - - ASSIGN_4V(machine->internal[IMM_ONES], 1.0f, 1.0f, 1.0f, 1.0f); - ASSIGN_4V(machine->internal[IMM_NEGS], -1.0f, -1.0f, -1.0f, -1.0f); - ASSIGN_4V(machine->internal[IMM_IDENTITY], 0.0f, 0.0f, 0.0f, 1.0f); - ASSIGN_4V(machine->internal[IMM_INV_255], inv, inv, inv, inv); - ASSIGN_4V(machine->internal[IMM_255], f255, f255, f255, f255); - ASSIGN_4V(machine->internal[IMM_RSQ], -.5f, 1.5f, 0.0f, 0.0f); - - - machine->fpu_rnd_nearest = (X87_CW_EXCEPTION_INV_OP | - X87_CW_EXCEPTION_DENORM_OP | - X87_CW_EXCEPTION_ZERO_DIVIDE | - X87_CW_EXCEPTION_OVERFLOW | - X87_CW_EXCEPTION_UNDERFLOW | - X87_CW_EXCEPTION_PRECISION | - (1<<6) | - X87_CW_ROUND_NEAREST | - X87_CW_PRECISION_DOUBLE_EXT); - - assert(machine->fpu_rnd_nearest == 0x37f); - - machine->fpu_rnd_neg_inf = (X87_CW_EXCEPTION_INV_OP | - X87_CW_EXCEPTION_DENORM_OP | - X87_CW_EXCEPTION_ZERO_DIVIDE | - X87_CW_EXCEPTION_OVERFLOW | - X87_CW_EXCEPTION_UNDERFLOW | - X87_CW_EXCEPTION_PRECISION | - (1<<6) | - X87_CW_ROUND_DOWN | - X87_CW_PRECISION_DOUBLE_EXT); - - for (i = 0; i < MAX_SHINE_TAB; i++) - do_populate_lut( &machine->shine_tab[i], 1.0f ); - - return machine; -} - -#else - -void draw_vs_aos_machine_viewport( struct aos_machine *machine, - const struct pipe_viewport_state *viewport ) -{ -} - -void -draw_vs_aos_machine_constants(struct aos_machine *machine, - unsigned slot, - const void *constants) -{ -} - -void draw_vs_aos_machine_destroy( struct aos_machine *machine ) -{ -} - -struct aos_machine *draw_vs_aos_machine( void ) -{ - return NULL; -} -#endif - diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c index cf894bbe8af..7fb0e0953e2 100644 --- a/src/gallium/auxiliary/draw/draw_vs_ppc.c +++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c @@ -185,12 +185,7 @@ draw_create_vs_ppc(struct draw_context *draw, tgsi_scan_shader(templ->tokens, &vs->base.info); vs->base.draw = draw; -#if 0 - if (1) - vs->base.create_variant = draw_vs_variant_aos_ppc; - else -#endif - vs->base.create_variant = draw_vs_create_variant_generic; + vs->base.create_variant = draw_vs_create_variant_generic; vs->base.prepare = vs_ppc_prepare; vs->base.run_linear = vs_ppc_run_linear; vs->base.delete = vs_ppc_delete; diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c deleted file mode 100644 index d918579bda4..00000000000 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ /dev/null @@ -1,225 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell <[email protected]> - * Brian Paul - */ - -#include "util/u_math.h" -#include "util/u_memory.h" -#include "pipe/p_config.h" - -#include "draw_vs.h" - -#if defined(PIPE_ARCH_X86) - -#include "pipe/p_shader_tokens.h" - -#include "draw_private.h" -#include "draw_context.h" - -#include "rtasm/rtasm_cpu.h" -#include "rtasm/rtasm_x86sse.h" -#include "tgsi/tgsi_sse2.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_exec.h" - -#define SSE_MAX_VERTICES 4 - - -struct draw_sse_vertex_shader { - struct draw_vertex_shader base; - struct x86_function sse2_program; - - tgsi_sse2_vs_func func; - - struct tgsi_exec_machine *machine; -}; - - -static void -vs_sse_prepare( struct draw_vertex_shader *base, - struct draw_context *draw ) -{ - struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base; - struct tgsi_exec_machine *machine = shader->machine; - - machine->Samplers = draw->vs.samplers; - - if (base->info.uses_instanceid) { - unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_INSTANCEID]; - assert(i < Elements(machine->SystemValue)); - machine->SystemValue[i][0] = base->draw->instance_id; - } -} - - - -/* Simplified vertex shader interface for the pt paths. Given the - * complexity of code-generating all the above operations together, - * it's time to try doing all the other stuff separately. - */ -static void -vs_sse_run_linear( struct draw_vertex_shader *base, - const float (*input)[4], - float (*output)[4], - const void *constants[PIPE_MAX_CONSTANT_BUFFERS], - const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS], - unsigned count, - unsigned input_stride, - unsigned output_stride ) -{ - struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base; - struct tgsi_exec_machine *machine = shader->machine; - unsigned int i; - - /* By default, execute all channels. XXX move this inside the loop - * below when we support shader conditionals/loops. - */ - tgsi_set_exec_mask(machine, 1, 1, 1, 1); - - for (i = 0; i < count; i += MAX_TGSI_VERTICES) { - unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i); - - if (max_vertices < 4) { - /* disable the unused execution channels */ - tgsi_set_exec_mask(machine, - 1, - max_vertices > 1, - max_vertices > 2, - 0); - } - - /* run compiled shader - */ - shader->func(machine, - (const float (*)[4])constants[0], - shader->base.immediates, - input, - base->info.num_inputs, - input_stride, - output, - base->info.num_outputs, - output_stride ); - - input = (const float (*)[4])((const char *)input + input_stride * max_vertices); - output = (float (*)[4])((char *)output + output_stride * max_vertices); - } -} - - - - -static void -vs_sse_delete( struct draw_vertex_shader *base ) -{ - struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base; - - x86_release_func( &shader->sse2_program ); - - align_free( (void *) shader->base.immediates ); - - FREE( (void*) shader->base.state.tokens ); - FREE( shader ); -} - - -struct draw_vertex_shader * -draw_create_vs_sse(struct draw_context *draw, - const struct pipe_shader_state *templ) -{ - struct draw_sse_vertex_shader *vs; - - if (!rtasm_cpu_has_sse2()) - return NULL; - - vs = CALLOC_STRUCT( draw_sse_vertex_shader ); - if (vs == NULL) - return NULL; - - /* we make a private copy of the tokens */ - vs->base.state.tokens = tgsi_dup_tokens(templ->tokens); - if (!vs->base.state.tokens) - goto fail; - - tgsi_scan_shader(templ->tokens, &vs->base.info); - - vs->base.draw = draw; - if (1) - vs->base.create_variant = draw_vs_create_variant_aos_sse; - else - vs->base.create_variant = draw_vs_create_variant_generic; - vs->base.prepare = vs_sse_prepare; - vs->base.run_linear = vs_sse_run_linear; - vs->base.delete = vs_sse_delete; - - vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * - sizeof(float), 16); - - vs->machine = draw->vs.machine; - - x86_init_func( &vs->sse2_program ); - - if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens, - &vs->sse2_program, - (float (*)[4])vs->base.immediates, - TRUE )) - goto fail; - - vs->func = (tgsi_sse2_vs_func) x86_get_func( &vs->sse2_program ); - if (!vs->func) { - goto fail; - } - - return &vs->base; - -fail: - if (0) - debug_warning("tgsi_emit_sse2() failed, falling back to interpreter\n"); - - x86_release_func( &vs->sse2_program ); - - FREE(vs); - return NULL; -} - - - -#else - -struct draw_vertex_shader * -draw_create_vs_sse( struct draw_context *draw, - const struct pipe_shader_state *templ ) -{ - return (void *) 0; -} - - -#endif - |