summary | refs | log | tree | commit | diff | stats
path: root/src/gallium/auxiliary/draw
diff options
context:
space:
mode:
author: José Fonseca <[email protected]> 2011-11-08 00:10:47 +0000
committer: José Fonseca <[email protected]> 2011-11-08 22:57:34 +0000
commit: 4eb3225b38ce12cb34ab3d90804c9683bd7b4ed3 (patch)
tree: 857d6c1740eb32fc86744f7afd81322862f6150c /src/gallium/auxiliary/draw
parent: 207a016ecaabbccf865a5b8e026b95a4276adc15 (diff)
Remove tgsi_sse2.
tgsi_exec is simple. llvm is fast. tgsi_sse2 ends up being neither.
Diffstat (limited to 'src/gallium/auxiliary/draw')
-rw-r--r-- src/gallium/auxiliary/draw/draw_private.h 4
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs.c 27
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs.h 20
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_aos.c 2267
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_aos.h 255
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_aos_io.c 460
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_aos_machine.c 328
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_ppc.c 7
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_sse.c 225
9 files changed, 2 insertions, 3591 deletions
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index b84d2b77179..3521a035e2f 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -237,10 +237,6 @@ struct draw_context
uint num_samplers;
struct tgsi_sampler **samplers;
- /* Here's another one:
- */
- struct aos_machine *aos_machine;
-
const void *aligned_constants[PIPE_MAX_CONSTANT_BUFFERS];
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 1763dbc199f..957bbe57a82 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -81,14 +81,12 @@ draw_vs_set_constants(struct draw_context *draw,
}
draw->vs.aligned_constants[slot] = constants;
- draw_vs_aos_machine_constants(draw->vs.aos_machine, slot, constants);
}
void draw_vs_set_viewport( struct draw_context *draw,
const struct pipe_viewport_state *viewport )
{
- draw_vs_aos_machine_viewport( draw->vs.aos_machine, viewport );
}
@@ -103,22 +101,8 @@ draw_create_vertex_shader(struct draw_context *draw,
tgsi_dump(shader->tokens, 0);
}
- if (!draw->pt.middle.llvm) {
-#if 0
-/* these paths don't support vertex clamping
- * TODO: either add it, or remove them completely
- * use LLVM instead if you want performance
- * use exec instead if you want debugging/more correctness
- */
-#if defined(PIPE_ARCH_X86)
- vs = draw_create_vs_sse( draw, shader );
-#elif defined(PIPE_ARCH_PPC)
- vs = draw_create_vs_ppc( draw, shader );
-#endif
-#endif
- }
#if HAVE_LLVM
- else {
+ if (draw->pt.middle.llvm) {
vs = draw_create_vs_llvm(draw, shader);
}
#endif
@@ -199,12 +183,6 @@ draw_vs_init( struct draw_context *draw )
if (!draw->vs.fetch_cache)
return FALSE;
- draw->vs.aos_machine = draw_vs_aos_machine();
-#ifdef PIPE_ARCH_X86
- if (!draw->vs.aos_machine)
- return FALSE;
-#endif
-
return TRUE;
}
@@ -219,9 +197,6 @@ draw_vs_destroy( struct draw_context *draw )
if (draw->vs.emit_cache)
translate_cache_destroy(draw->vs.emit_cache);
- if (draw->vs.aos_machine)
- draw_vs_aos_machine_destroy(draw->vs.aos_machine);
-
for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
if (draw->vs.aligned_constant_storage[i]) {
align_free((void *)draw->vs.aligned_constant_storage[i]);
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index e6d187e9774..49229f8164b 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -159,10 +159,6 @@ draw_create_vs_exec(struct draw_context *draw,
const struct pipe_shader_state *templ);
struct draw_vertex_shader *
-draw_create_vs_sse(struct draw_context *draw,
- const struct pipe_shader_state *templ);
-
-struct draw_vertex_shader *
draw_create_vs_ppc(struct draw_context *draw,
const struct pipe_shader_state *templ);
@@ -170,10 +166,6 @@ draw_create_vs_ppc(struct draw_context *draw,
struct draw_vs_variant_key;
struct draw_vertex_shader;
-struct draw_vs_variant *
-draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs,
- const struct draw_vs_variant_key *key );
-
#if HAVE_LLVM
struct draw_vertex_shader *
draw_create_vs_llvm(struct draw_context *draw,
@@ -214,18 +206,6 @@ static INLINE int draw_vs_variant_key_compare( const struct draw_vs_variant_key
}
-struct aos_machine *draw_vs_aos_machine( void );
-void draw_vs_aos_machine_destroy( struct aos_machine *machine );
-
-void
-draw_vs_aos_machine_constants(struct aos_machine *machine,
- unsigned slot,
- const void *constants);
-
-void draw_vs_aos_machine_viewport( struct aos_machine *machine,
- const struct pipe_viewport_state *viewport );
-
-
#define MAX_TGSI_VERTICES 4
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
deleted file mode 100644
index 7b90dba0cd5..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ /dev/null
@@ -1,2267 +0,0 @@
-/*
- * Mesa 3-D graphics library
- * Version: 6.3
- *
- * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
- * using the rtasm runtime assembler. Based on the old
- * t_vb_arb_program_sse.c
- */
-
-
-#include "util/u_memory.h"
-#include "util/u_math.h"
-#include "pipe/p_shader_tokens.h"
-#include "util/u_debug.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
-#include "tgsi/tgsi_dump.h"
-
-#include "draw_vs.h"
-#include "draw_vs_aos.h"
-
-#include "rtasm/rtasm_x86sse.h"
-
-#ifdef PIPE_ARCH_X86
-#define DISASSEM 0
-#define FAST_MATH 1
-
-static const char *files[] =
-{
- "NULL",
- "CONST",
- "IN",
- "OUT",
- "TEMP",
- "SAMP",
- "ADDR",
- "IMM",
- "INTERNAL",
-};
-
-static INLINE boolean eq( struct x86_reg a,
- struct x86_reg b )
-{
- return (a.file == b.file &&
- a.idx == b.idx &&
- a.mod == b.mod &&
- a.disp == b.disp);
-}
-
-struct x86_reg aos_get_x86( struct aos_compilation *cp,
- unsigned which_reg, /* quick hack */
- unsigned value )
-{
- struct x86_reg reg;
-
- if (which_reg == 0)
- reg = cp->temp_EBP;
- else
- reg = cp->tmp_EAX;
-
- if (cp->x86_reg[which_reg] != value) {
- unsigned offset;
-
- switch (value) {
- case X86_IMMEDIATES:
- assert(which_reg == 0);
- offset = Offset(struct aos_machine, immediates);
- break;
- case X86_CONSTANTS:
- assert(which_reg == 1);
- offset = Offset(struct aos_machine, constants);
- break;
- case X86_BUFFERS:
- assert(which_reg == 0);
- offset = Offset(struct aos_machine, buffer);
- break;
- default:
- assert(0);
- offset = 0;
- }
-
-
- x86_mov(cp->func, reg,
- x86_make_disp(cp->machine_EDX, offset));
-
- cp->x86_reg[which_reg] = value;
- }
-
- return reg;
-}
-
-
-static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
- unsigned file,
- unsigned idx )
-{
- struct x86_reg ptr = cp->machine_EDX;
-
- switch (file) {
- case TGSI_FILE_INPUT:
- assert(idx < MAX_INPUTS);
- return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
-
- case TGSI_FILE_OUTPUT:
- return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
-
- case TGSI_FILE_TEMPORARY:
- assert(idx < MAX_TEMPS);
- return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
-
- case AOS_FILE_INTERNAL:
- assert(idx < MAX_INTERNALS);
- return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
-
- case TGSI_FILE_IMMEDIATE:
- assert(idx < MAX_IMMEDIATES); /* just a sanity check */
- return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));
-
- case TGSI_FILE_CONSTANT:
- assert(idx < MAX_CONSTANTS); /* just a sanity check */
- return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));
-
- default:
- AOS_ERROR(cp, "unknown reg file");
- return x86_make_reg(0,0);
- }
-}
-
-
-
-#define X87_CW_EXCEPTION_INV_OP (1<<0)
-#define X87_CW_EXCEPTION_DENORM_OP (1<<1)
-#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
-#define X87_CW_EXCEPTION_OVERFLOW (1<<3)
-#define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
-#define X87_CW_EXCEPTION_PRECISION (1<<5)
-#define X87_CW_PRECISION_SINGLE (0<<8)
-#define X87_CW_PRECISION_RESERVED (1<<8)
-#define X87_CW_PRECISION_DOUBLE (2<<8)
-#define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
-#define X87_CW_PRECISION_MASK (3<<8)
-#define X87_CW_ROUND_NEAREST (0<<10)
-#define X87_CW_ROUND_DOWN (1<<10)
-#define X87_CW_ROUND_UP (2<<10)
-#define X87_CW_ROUND_ZERO (3<<10)
-#define X87_CW_ROUND_MASK (3<<10)
-#define X87_CW_INFINITY (1<<12)
-
-
-
-
-static void spill( struct aos_compilation *cp, unsigned idx )
-{
- if (!cp->xmm[idx].dirty ||
- (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
- cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
- cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
- AOS_ERROR(cp, "invalid spill");
- return;
- }
- else {
- struct x86_reg oldval = get_reg_ptr(cp,
- cp->xmm[idx].file,
- cp->xmm[idx].idx);
-
- if (0) debug_printf("\nspill %s[%d]",
- files[cp->xmm[idx].file],
- cp->xmm[idx].idx);
-
- assert(cp->xmm[idx].dirty);
- sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
- cp->xmm[idx].dirty = 0;
- }
-}
-
-
-void aos_spill_all( struct aos_compilation *cp )
-{
- unsigned i;
-
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].dirty)
- spill(cp, i);
- aos_release_xmm_reg(cp, i);
- }
-}
-
-
-static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
- struct x86_reg reg )
-{
- if (reg.file != file_XMM ||
- cp->xmm[reg.idx].file != TGSI_FILE_NULL)
- {
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- sse_movaps(cp->func, tmp, reg);
- reg = tmp;
- }
-
- cp->xmm[reg.idx].last_used = cp->insn_counter;
- return reg;
-}
-
-static struct x86_reg get_xmm( struct aos_compilation *cp,
- struct x86_reg reg )
-{
- if (reg.file != file_XMM)
- {
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- sse_movaps(cp->func, tmp, reg);
- reg = tmp;
- }
-
- cp->xmm[reg.idx].last_used = cp->insn_counter;
- return reg;
-}
-
-
-/* Allocate an empty xmm register, either as a temporary or later to
- * "adopt" as a shader reg.
- */
-struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
-{
- unsigned i;
- unsigned oldest = 0;
- boolean found = FALSE;
-
- for (i = 0; i < 8; i++)
- if (cp->xmm[i].last_used != cp->insn_counter &&
- cp->xmm[i].file == TGSI_FILE_NULL) {
- oldest = i;
- found = TRUE;
- }
-
- if (!found) {
- for (i = 0; i < 8; i++)
- if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
- oldest = i;
- }
-
- /* Need to write out the old value?
- */
- if (cp->xmm[oldest].dirty)
- spill(cp, oldest);
-
- assert(cp->xmm[oldest].last_used != cp->insn_counter);
-
- cp->xmm[oldest].file = TGSI_FILE_NULL;
- cp->xmm[oldest].idx = 0;
- cp->xmm[oldest].dirty = 0;
- cp->xmm[oldest].last_used = cp->insn_counter;
- return x86_make_reg(file_XMM, oldest);
-}
-
-void aos_release_xmm_reg( struct aos_compilation *cp,
- unsigned idx )
-{
- cp->xmm[idx].file = TGSI_FILE_NULL;
- cp->xmm[idx].idx = 0;
- cp->xmm[idx].dirty = 0;
- cp->xmm[idx].last_used = 0;
-}
-
-
-static void aos_soft_release_xmm( struct aos_compilation *cp,
- struct x86_reg reg )
-{
- if (reg.file == file_XMM) {
- assert(cp->xmm[reg.idx].last_used == cp->insn_counter);
- cp->xmm[reg.idx].last_used = cp->insn_counter - 1;
- }
-}
-
-
-
-/* Mark an xmm reg as holding the current copy of a shader reg.
- */
-void aos_adopt_xmm_reg( struct aos_compilation *cp,
- struct x86_reg reg,
- unsigned file,
- unsigned idx,
- unsigned dirty )
-{
- unsigned i;
-
- if (reg.file != file_XMM) {
- assert(0);
- return;
- }
-
-
- /* If any xmm reg thinks it holds this shader reg, break the
- * illusion.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].file == file &&
- cp->xmm[i].idx == idx)
- {
- /* If an xmm reg is already holding this shader reg, take into account its
- * dirty flag...
- */
- dirty |= cp->xmm[i].dirty;
- aos_release_xmm_reg(cp, i);
- }
- }
-
- cp->xmm[reg.idx].file = file;
- cp->xmm[reg.idx].idx = idx;
- cp->xmm[reg.idx].dirty = dirty;
- cp->xmm[reg.idx].last_used = cp->insn_counter;
-}
-
-
-/* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
- */
-static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
- unsigned file,
- unsigned idx )
-{
- unsigned i;
-
- /* Ensure the in-memory copy of this reg is up-to-date
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].file == file &&
- cp->xmm[i].idx == idx &&
- cp->xmm[i].dirty) {
- spill(cp, i);
- }
- }
-
- return get_reg_ptr( cp, file, idx );
-}
-
-
-/* As above, but return a pointer. Note - this pointer may alias
- * those returned by get_arg_ptr().
- */
-static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
- const struct tgsi_full_dst_register *dst )
-{
- unsigned file = dst->Register.File;
- unsigned idx = dst->Register.Index;
- unsigned i;
-
-
- /* Ensure in-memory copy of this reg is up-to-date and invalidate
- * any xmm copies.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].file == file &&
- cp->xmm[i].idx == idx)
- {
- if (cp->xmm[i].dirty)
- spill(cp, i);
-
- aos_release_xmm_reg(cp, i);
- }
- }
-
- return get_reg_ptr( cp, file, idx );
-}
-
-
-
-
-
-/* Return an XMM reg if the argument is resident, otherwise return a
- * base+offset pointer to the saved value.
- */
-struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
- unsigned file,
- unsigned idx )
-{
- unsigned i;
-
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].file == file &&
- cp->xmm[i].idx == idx)
- {
- cp->xmm[i].last_used = cp->insn_counter;
- return x86_make_reg(file_XMM, i);
- }
- }
-
- /* If not found in the XMM register file, return an indirect
- * reference to the in-memory copy:
- */
- return get_reg_ptr( cp, file, idx );
-}
-
-
-
-static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
- unsigned file,
- unsigned idx )
-{
- struct x86_reg reg = get_xmm( cp,
- aos_get_shader_reg( cp, file, idx ) );
-
- aos_adopt_xmm_reg( cp,
- reg,
- file,
- idx,
- FALSE );
-
- return reg;
-}
-
-
-
-struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
- unsigned imm )
-{
- return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
-}
-
-
-struct x86_reg aos_get_internal( struct aos_compilation *cp,
- unsigned imm )
-{
- return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
-}
-
-
-
-
-
-/* Emulate pshufd insn in regular SSE, if necessary:
- */
-static void emit_pshufd( struct aos_compilation *cp,
- struct x86_reg dst,
- struct x86_reg arg0,
- ubyte shuf )
-{
- if (cp->have_sse2) {
- sse2_pshufd(cp->func, dst, arg0, shuf);
- }
- else {
- if (!eq(dst, arg0))
- sse_movaps(cp->func, dst, arg0);
-
- sse_shufps(cp->func, dst, dst, shuf);
- }
-}
-
-/* load masks (pack into negs??)
- * pshufd - shuffle according to writemask
- * and - result, mask
- * nand - dest, mask
- * or - dest, result
- */
-static boolean mask_write( struct aos_compilation *cp,
- struct x86_reg dst,
- struct x86_reg result,
- unsigned mask )
-{
- struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- emit_pshufd(cp, tmp, imm_swz,
- SHUF((mask & 1) ? 2 : 3,
- (mask & 2) ? 2 : 3,
- (mask & 4) ? 2 : 3,
- (mask & 8) ? 2 : 3));
-
- sse_andps(cp->func, dst, tmp);
- sse_andnps(cp->func, tmp, result);
- sse_orps(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- return TRUE;
-}
-
-
-
-
-/* Helper for writemask:
- */
-static boolean emit_shuf_copy2( struct aos_compilation *cp,
- struct x86_reg dst,
- struct x86_reg arg0,
- struct x86_reg arg1,
- ubyte shuf )
-{
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- emit_pshufd(cp, dst, arg1, shuf);
- emit_pshufd(cp, tmp, arg0, shuf);
- sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
- emit_pshufd(cp, dst, dst, shuf);
-
- aos_release_xmm_reg(cp, tmp.idx);
- return TRUE;
-}
-
-
-
-#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
-
-
-/* Locate a source register and perform any required (simple) swizzle.
- *
- * Just fail on complex swizzles at this point.
- */
-static struct x86_reg fetch_src( struct aos_compilation *cp,
- const struct tgsi_full_src_register *src )
-{
- struct x86_reg arg0 = aos_get_shader_reg(cp,
- src->Register.File,
- src->Register.Index);
- unsigned i;
- ubyte swz = 0;
- unsigned negs = 0;
- unsigned abs = 0;
-
- for (i = 0; i < 4; i++) {
- unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i );
- unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
-
- swz |= (swizzle & 0x3) << (i * 2);
-
- switch (neg) {
- case TGSI_UTIL_SIGN_TOGGLE:
- negs |= (1<<i);
- break;
-
- case TGSI_UTIL_SIGN_KEEP:
- break;
-
- case TGSI_UTIL_SIGN_CLEAR:
- abs |= (1<<i);
- break;
-
- default:
- AOS_ERROR(cp, "unsupported sign-mode");
- break;
- }
- }
-
- if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
- struct x86_reg dst = aos_get_xmm_reg(cp);
-
- if (swz != SSE_SWIZZLE_NOOP)
- emit_pshufd(cp, dst, arg0, swz);
- else
- sse_movaps(cp->func, dst, arg0);
-
- if (negs && negs != 0xf) {
- struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- /* Load 1,-1,0,0
- * Use neg as arg to pshufd
- * Multiply
- */
- emit_pshufd(cp, tmp, imm_swz,
- SHUF((negs & 1) ? 1 : 0,
- (negs & 2) ? 1 : 0,
- (negs & 4) ? 1 : 0,
- (negs & 8) ? 1 : 0));
- sse_mulps(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- aos_soft_release_xmm(cp, imm_swz);
- }
- else if (negs) {
- struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
- sse_mulps(cp->func, dst, imm_negs);
- aos_soft_release_xmm(cp, imm_negs);
- }
-
-
- if (abs && abs != 0xf) {
- AOS_ERROR(cp, "unsupported partial abs");
- }
- else if (abs) {
- struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- sse_movaps(cp->func, tmp, dst);
- sse_mulps(cp->func, tmp, neg);
- sse_maxps(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- aos_soft_release_xmm(cp, neg);
- }
-
- aos_soft_release_xmm(cp, arg0);
- return dst;
- }
-
- return arg0;
-}
-
-static void x87_fld_src( struct aos_compilation *cp,
- const struct tgsi_full_src_register *src,
- unsigned channel )
-{
- struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
- src->Register.File,
- src->Register.Index);
-
- unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel );
- unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
-
- x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
-
- switch (neg) {
- case TGSI_UTIL_SIGN_TOGGLE:
- /* Flip the sign:
- */
- x87_fchs( cp->func );
- break;
-
- case TGSI_UTIL_SIGN_KEEP:
- break;
-
- case TGSI_UTIL_SIGN_CLEAR:
- x87_fabs( cp->func );
- break;
-
- case TGSI_UTIL_SIGN_SET:
- x87_fabs( cp->func );
- x87_fchs( cp->func );
- break;
-
- default:
- AOS_ERROR(cp, "unsupported sign-mode");
- break;
- }
-}
-
-
-
-
-
-
-/* Used to implement write masking. This and most of the other instructions
- * here would be easier to implement if there had been a translation
- * to a 2 argument format (dst/arg0, arg1) at the shader level before
- * attempting to translate to x86/sse code.
- */
-static void store_dest( struct aos_compilation *cp,
- const struct tgsi_full_dst_register *reg,
- struct x86_reg result )
-{
- struct x86_reg dst;
-
- switch (reg->Register.WriteMask) {
- case 0:
- return;
-
- case TGSI_WRITEMASK_XYZW:
- aos_adopt_xmm_reg(cp,
- get_xmm_writable(cp, result),
- reg->Register.File,
- reg->Register.Index,
- TRUE);
- return;
- default:
- break;
- }
-
- dst = aos_get_shader_reg_xmm(cp,
- reg->Register.File,
- reg->Register.Index);
-
- switch (reg->Register.WriteMask) {
- case TGSI_WRITEMASK_X:
- sse_movss(cp->func, dst, get_xmm(cp, result));
- break;
-
- case TGSI_WRITEMASK_ZW:
- sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
- break;
-
- case TGSI_WRITEMASK_XY:
- result = get_xmm_writable(cp, result);
- sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
- dst = result;
- break;
-
- case TGSI_WRITEMASK_YZW:
- result = get_xmm_writable(cp, result);
- sse_movss(cp->func, result, dst);
- dst = result;
- break;
-
- default:
- mask_write(cp, dst, result, reg->Register.WriteMask);
- break;
- }
-
- aos_adopt_xmm_reg(cp,
- dst,
- reg->Register.File,
- reg->Register.Index,
- TRUE);
-
-}
-
-static void inject_scalar( struct aos_compilation *cp,
- struct x86_reg dst,
- struct x86_reg result,
- ubyte swizzle )
-{
- sse_shufps(cp->func, dst, dst, swizzle);
- sse_movss(cp->func, dst, result);
- sse_shufps(cp->func, dst, dst, swizzle);
-}
-
-
-static void store_scalar_dest( struct aos_compilation *cp,
- const struct tgsi_full_dst_register *reg,
- struct x86_reg result )
-{
- unsigned writemask = reg->Register.WriteMask;
- struct x86_reg dst;
-
- if (writemask != TGSI_WRITEMASK_X &&
- writemask != TGSI_WRITEMASK_Y &&
- writemask != TGSI_WRITEMASK_Z &&
- writemask != TGSI_WRITEMASK_W &&
- writemask != 0)
- {
- result = get_xmm_writable(cp, result); /* already true, right? */
- sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
- store_dest(cp, reg, result);
- return;
- }
-
- result = get_xmm(cp, result);
- dst = aos_get_shader_reg_xmm(cp,
- reg->Register.File,
- reg->Register.Index);
-
-
-
- switch (reg->Register.WriteMask) {
- case TGSI_WRITEMASK_X:
- sse_movss(cp->func, dst, result);
- break;
-
- case TGSI_WRITEMASK_Y:
- inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
- break;
-
- case TGSI_WRITEMASK_Z:
- inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
- break;
-
- case TGSI_WRITEMASK_W:
- inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
- break;
-
- default:
- break;
- }
-
- aos_adopt_xmm_reg(cp,
- dst,
- reg->Register.File,
- reg->Register.Index,
- TRUE);
-}
-
-
-
-static void x87_fst_or_nop( struct x86_function *func,
- unsigned writemask,
- unsigned channel,
- struct x86_reg ptr )
-{
- assert(ptr.file == file_REG32);
- if (writemask & (1<<channel))
- x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
-}
-
-static void x87_fstp_or_pop( struct x86_function *func,
- unsigned writemask,
- unsigned channel,
- struct x86_reg ptr )
-{
- assert(ptr.file == file_REG32);
- if (writemask & (1<<channel))
- x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
- else
- x87_fstp( func, x86_make_reg( file_x87, 0 ));
-}
-
-
-
-/*
- */
-static void x87_fstp_dest4( struct aos_compilation *cp,
- const struct tgsi_full_dst_register *dst )
-{
- struct x86_reg ptr = get_dst_ptr(cp, dst);
- unsigned writemask = dst->Register.WriteMask;
-
- x87_fst_or_nop(cp->func, writemask, 0, ptr);
- x87_fst_or_nop(cp->func, writemask, 1, ptr);
- x87_fst_or_nop(cp->func, writemask, 2, ptr);
- x87_fstp_or_pop(cp->func, writemask, 3, ptr);
-}
-
-/* Save current x87 state and put it into single precision mode.
- */
-static void save_fpu_state( struct aos_compilation *cp )
-{
- x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, fpu_restore)));
-}
-
-static void restore_fpu_state( struct aos_compilation *cp )
-{
- x87_fnclex(cp->func);
- x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, fpu_restore)));
-}
-
-static void set_fpu_round_neg_inf( struct aos_compilation *cp )
-{
- if (cp->fpucntl != FPU_RND_NEG) {
- cp->fpucntl = FPU_RND_NEG;
- x87_fnclex(cp->func);
- x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, fpu_rnd_neg_inf)));
- }
-}
-
-static void set_fpu_round_nearest( struct aos_compilation *cp )
-{
- if (cp->fpucntl != FPU_RND_NEAREST) {
- cp->fpucntl = FPU_RND_NEAREST;
- x87_fnclex(cp->func);
- x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, fpu_rnd_nearest)));
- }
-}
-
-#if 0
-static void x87_emit_ex2( struct aos_compilation *cp )
-{
- struct x86_reg st0 = x86_make_reg(file_x87, 0);
- struct x86_reg st1 = x86_make_reg(file_x87, 1);
- int stack = cp->func->x87_stack;
-
- /* set_fpu_round_neg_inf( cp ); */
-
- x87_fld(cp->func, st0); /* a a */
- x87_fprndint( cp->func ); /* int(a) a*/
- x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */
- x87_fxch(cp->func, st1); /* frc(a) int(a) */
- x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */
- x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */
- x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */
- x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */
- /* 2^a int(a) */
- x87_fstp(cp->func, st1); /* 2^a */
-
- assert( stack == cp->func->x87_stack);
-
-}
-#endif
-
-#if 0
-static void PIPE_CDECL print_reg( const char *msg,
- const float *reg )
-{
- debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
-}
-#endif
-
-#if 0
-static void emit_print( struct aos_compilation *cp,
- const char *message, /* must point to a static string! */
- unsigned file,
- unsigned idx )
-{
- struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
- struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
- unsigned i;
-
- /* There shouldn't be anything on the x87 stack. Can add this
- * capacity later if need be.
- */
- assert(cp->func->x87_stack == 0);
-
- /* For absolute correctness, need to spill/invalidate all XMM regs
- * too. We're obviously not concerned about performance on this
- * debug path, so here goes:
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].dirty)
- spill(cp, i);
-
- aos_release_xmm_reg(cp, i);
- }
-
- /* Push caller-save (ie scratch) regs.
- */
- x86_cdecl_caller_push_regs( cp->func );
-
-
- /* Push the arguments:
- */
- x86_lea( cp->func, ecx, arg );
- x86_push( cp->func, ecx );
- x86_push_imm32( cp->func, (int)message );
-
- /* Call the helper. Could call debug_printf directly, but
- * print_reg is a nice place to put a breakpoint if need be.
- */
- x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
- x86_call( cp->func, ecx );
- x86_pop( cp->func, ecx );
- x86_pop( cp->func, ecx );
-
- /* Pop caller-save regs
- */
- x86_cdecl_caller_pop_regs( cp->func );
-
- /* Done...
- */
-}
-#endif
-
-/**
- * The traditional instructions. All operate on internal registers
- * and ignore write masks and swizzling issues.
- */
-
-static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- sse_movaps(cp->func, tmp, arg0);
- sse_mulps(cp->func, tmp, neg);
- sse_maxps(cp->func, tmp, arg0);
-
- store_dest(cp, &op->Dst[0], tmp);
- return TRUE;
-}
-
-static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_addps(cp->func, dst, arg1);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- x87_fld_src(cp, &op->Src[0], 0);
- x87_fcos(cp->func);
- x87_fstp_dest4(cp, &op->Dst[0]);
- return TRUE;
-}
-
-/* The dotproduct instructions don't really do that well in sse:
- * XXX: produces wrong results -- disabled.
- */
-static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_mulps(cp->func, dst, arg1);
- /* Now the hard bit: sum the first 3 values:
- */
- sse_movhlps(cp->func, tmp, dst);
- sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
- emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
- sse_addss(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- store_scalar_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_mulps(cp->func, dst, arg1);
-
- /* Now the hard bit: sum the values:
- */
- sse_movhlps(cp->func, tmp, dst);
- sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
- emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
- sse_addss(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- store_scalar_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_mulps(cp->func, dst, arg1);
-
- /* Now the hard bit: sum the values (from DP3):
- */
- sse_movhlps(cp->func, tmp, dst);
- sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
- emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
- sse_addss(cp->func, dst, tmp);
- emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
- sse_addss(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- store_scalar_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = aos_get_xmm_reg(cp);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
-
-/* dst[0] = 1.0 * 1.0F; */
-/* dst[1] = arg0[1] * arg1[1]; */
-/* dst[2] = arg0[2] * 1.0; */
-/* dst[3] = 1.0 * arg1[3]; */
-
- emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
- emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
- sse_mulps(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- x87_fld1(cp->func); /* 1 */
- x87_fld_src(cp, &op->Src[0], 0); /* a0 1 */
- x87_fyl2x(cp->func); /* log2(a0) */
- x87_fstp_dest4(cp, &op->Dst[0]);
- return TRUE;
-}
-
-#if 0
-static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- x87_fld_src(cp, &op->Src[0], 0);
- x87_emit_ex2(cp);
- x87_fstp_dest4(cp, &op->Dst[0]);
- return TRUE;
-}
-#endif
-
-
-static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
- unsigned writemask = op->Dst[0].Register.WriteMask;
- int i;
-
- set_fpu_round_neg_inf( cp );
-
- /* Load all sources first to avoid aliasing
- */
- for (i = 3; i >= 0; i--) {
- if (writemask & (1<<i)) {
- x87_fld_src(cp, &op->Src[0], i);
- }
- }
-
- for (i = 0; i < 4; i++) {
- if (writemask & (1<<i)) {
- x87_fprndint( cp->func );
- x87_fstp(cp->func, x86_make_disp(dst, i*4));
- }
- }
-
- return TRUE;
-}
-
-
-static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
- unsigned writemask = op->Dst[0].Register.WriteMask;
- int i;
-
- set_fpu_round_nearest( cp );
-
- /* Load all sources first to avoid aliasing
- */
- for (i = 3; i >= 0; i--) {
- if (writemask & (1<<i)) {
- x87_fld_src(cp, &op->Src[0], i);
- }
- }
-
- for (i = 0; i < 4; i++) {
- if (writemask & (1<<i)) {
- x87_fprndint( cp->func );
- x87_fstp(cp->func, x86_make_disp(dst, i*4));
- }
- }
-
- return TRUE;
-}
-
-
-static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
- struct x86_reg st0 = x86_make_reg(file_x87, 0);
- struct x86_reg st1 = x86_make_reg(file_x87, 1);
- unsigned writemask = op->Dst[0].Register.WriteMask;
- int i;
-
- set_fpu_round_neg_inf( cp );
-
- /* suck all the source values onto the stack before writing out any
- * dst, which may alias...
- */
- for (i = 3; i >= 0; i--) {
- if (writemask & (1<<i)) {
- x87_fld_src(cp, &op->Src[0], i);
- }
- }
-
- for (i = 0; i < 4; i++) {
- if (writemask & (1<<i)) {
- x87_fld(cp->func, st0); /* a a */
- x87_fprndint( cp->func ); /* flr(a) a */
- x87_fsubp(cp->func, st1); /* frc(a) */
- x87_fstp(cp->func, x86_make_disp(dst, i*4));
- }
- }
-
- return TRUE;
-}
-
-
-
-
-
-
-static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
- unsigned writemask = op->Dst[0].Register.WriteMask;
- unsigned lit_count = cp->lit_count++;
- struct x86_reg result, arg0;
- unsigned i;
-
-#if 1
- /* For absolute correctness, need to spill/invalidate all XMM regs
- * too.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].dirty)
- spill(cp, i);
- aos_release_xmm_reg(cp, i);
- }
-#endif
-
- if (writemask != TGSI_WRITEMASK_XYZW)
- result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
- else
- result = get_dst_ptr(cp, &op->Dst[0]);
-
-
- arg0 = fetch_src( cp, &op->Src[0] );
- if (arg0.file == file_XMM) {
- struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, tmp[1]));
- sse_movaps( cp->func, tmp, arg0 );
- arg0 = tmp;
- }
-
-
-
- /* Push caller-save (ie scratch) regs.
- */
- x86_cdecl_caller_push_regs( cp->func );
-
- /* Push the arguments:
- */
- x86_push_imm32( cp->func, lit_count );
-
- x86_lea( cp->func, ecx, arg0 );
- x86_push( cp->func, ecx );
-
- x86_lea( cp->func, ecx, result );
- x86_push( cp->func, ecx );
-
- x86_push( cp->func, cp->machine_EDX );
-
- if (lit_count < MAX_LIT_INFO) {
- x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
- Offset(struct aos_machine, lit_info) +
- lit_count * sizeof(struct lit_info) +
- Offset(struct lit_info, func)));
- }
- else {
- x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
- }
-
- x86_call( cp->func, ecx );
-
- x86_pop( cp->func, ecx ); /* fixme... */
- x86_pop( cp->func, ecx );
- x86_pop( cp->func, ecx );
- x86_pop( cp->func, ecx );
-
- x86_cdecl_caller_pop_regs( cp->func );
-
- if (writemask != TGSI_WRITEMASK_XYZW) {
- store_dest( cp,
- &op->Dst[0],
- get_xmm_writable( cp, result ) );
- }
-
- return TRUE;
-}
-
-#if 0
-static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
- unsigned writemask = op->Dst[0].Register.WriteMask;
-
- if (writemask & TGSI_WRITEMASK_YZ) {
- struct x86_reg st1 = x86_make_reg(file_x87, 1);
- struct x86_reg st2 = x86_make_reg(file_x87, 2);
-
- /* a1' = a1 <= 0 ? 1 : a1;
- */
- x87_fldz(cp->func); /* 1 0 */
-#if 1
- x87_fld1(cp->func); /* 1 0 */
-#else
- /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
- */
- x87_fldz(cp->func); /* 1 0 */
-#endif
- x87_fld_src(cp, &op->Src[0], 1); /* a1 1 0 */
- x87_fcomi(cp->func, st2); /* a1 1 0 */
- x87_fcmovb(cp->func, st1); /* a1' 1 0 */
- x87_fstp(cp->func, st1); /* a1' 0 */
- x87_fstp(cp->func, st1); /* a1' */
-
- x87_fld_src(cp, &op->Src[0], 3); /* a3 a1' */
- x87_fxch(cp->func, st1); /* a1' a3 */
-
-
- /* Compute pow(a1, a3)
- */
- x87_fyl2x(cp->func); /* a3*log2(a1) */
- x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */
-
-
- /* a0' = max2(a0, 0):
- */
- x87_fldz(cp->func); /* 0 r2 */
- x87_fld_src(cp, &op->Src[0], 0); /* a0 0 r2 */
- x87_fcomi(cp->func, st1);
- x87_fcmovb(cp->func, st1); /* a0' 0 r2 */
-
- x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */
-
- x87_fcomi(cp->func, st1); /* a0' 0 r2 */
- x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */
-
- x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
- x87_fpop(cp->func); /* r2 */
- x87_fpop(cp->func);
- }
-
- if (writemask & TGSI_WRITEMASK_XW) {
- x87_fld1(cp->func);
- x87_fst_or_nop(cp->func, writemask, 0, dst);
- x87_fstp_or_pop(cp->func, writemask, 3, dst);
- }
-
- return TRUE;
-}
-#endif
-
-
-
-static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_maxps(cp->func, dst, arg1);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-
-static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_minps(cp->func, dst, arg1);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- /* potentially nothing to do */
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_mulps(cp->func, dst, arg1);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-
-static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg arg2 = fetch_src(cp, &op->Src[2]);
-
- /* If we can't clobber old contents of arg0, get a temporary & copy
- * it there, then clobber it...
- */
- arg0 = get_xmm_writable(cp, arg0);
-
- sse_mulps(cp->func, arg0, arg1);
- sse_addps(cp->func, arg0, arg2);
- store_dest(cp, &op->Dst[0], arg0);
- return TRUE;
-}
-
-
-
-/* A wrapper for powf().
- * Makes sure it is cdecl and operates on floats.
- */
-static float PIPE_CDECL _powerf( float x, float y )
-{
-#if FAST_MATH
- return util_fast_pow(x, y);
-#else
- return powf( x, y );
-#endif
-}
-
-#if FAST_MATH
-static float PIPE_CDECL _exp2(float x)
-{
- return util_fast_exp2(x);
-}
-#endif
-
-
-/* Really not sufficient -- need to check for conditions that could
- * generate inf/nan values, which will slow things down hugely.
- */
-static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-#if 0
- x87_fld_src(cp, &op->Src[1], 0); /* a1.x */
- x87_fld_src(cp, &op->Src[0], 0); /* a0.x a1.x */
- x87_fyl2x(cp->func); /* a1*log2(a0) */
-
- x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */
-
- x87_fstp_dest4(cp, &op->Dst[0]);
-#else
- uint i;
-
- /* For absolute correctness, need to spill/invalidate all XMM regs
- * too.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].dirty)
- spill(cp, i);
- aos_release_xmm_reg(cp, i);
- }
-
- /* Push caller-save (ie scratch) regs.
- */
- x86_cdecl_caller_push_regs( cp->func );
-
- x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );
-
- x87_fld_src( cp, &op->Src[1], 0 );
- x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
- x87_fld_src( cp, &op->Src[0], 0 );
- x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
-
- /* tmp_EAX has been pushed & will be restored below */
- x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
- x86_call( cp->func, cp->tmp_EAX );
-
- x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );
-
- x86_cdecl_caller_pop_regs( cp->func );
-
- /* Note retval on x87 stack:
- */
- cp->func->x87_stack++;
-
- x87_fstp_dest4( cp, &op->Dst[0] );
-#endif
- return TRUE;
-}
-
-
-#if FAST_MATH
-static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- uint i;
-
- /* For absolute correctness, need to spill/invalidate all XMM regs
- * too.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].dirty)
- spill(cp, i);
- aos_release_xmm_reg(cp, i);
- }
-
- /* Push caller-save (ie scratch) regs.
- */
- x86_cdecl_caller_push_regs( cp->func );
-
- x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );
-
- x87_fld_src( cp, &op->Src[0], 0 );
- x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
-
- /* tmp_EAX has been pushed & will be restored below */
- x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
- x86_call( cp->func, cp->tmp_EAX );
-
- x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );
-
- x86_cdecl_caller_pop_regs( cp->func );
-
- /* Note retval on x87 stack:
- */
- cp->func->x87_stack++;
-
- x87_fstp_dest4( cp, &op->Dst[0] );
-
- return TRUE;
-}
-#endif
-
-
-static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg dst = aos_get_xmm_reg(cp);
-
- if (cp->have_sse2) {
- sse2_rcpss(cp->func, dst, arg0);
- /* extend precision here...
- */
- }
- else {
- struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
- sse_movss(cp->func, dst, ones);
- sse_divss(cp->func, dst, arg0);
- }
-
- store_scalar_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-
-/* Although rsqrtps() and rcpps() are low precision on some/all SSE
- * implementations, it is possible to improve its precision at
- * fairly low cost, using a newton/raphson step, as below:
- *
- * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
- * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
- * or:
- * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
- *
- *
- * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
- */
-static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- if (0) {
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg r = aos_get_xmm_reg(cp);
- sse_rsqrtss(cp->func, r, arg0);
- store_scalar_dest(cp, &op->Dst[0], r);
- return TRUE;
- }
- else {
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg r = aos_get_xmm_reg(cp);
-
- struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
- struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
- struct x86_reg src = get_xmm_writable( cp, arg0 );
- struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- sse_movaps(cp->func, tmp, src);
- sse_mulps(cp->func, tmp, neg);
- sse_maxps(cp->func, tmp, src);
-
- sse_rsqrtss( cp->func, r, tmp ); /* rsqrtss(a) */
- sse_mulss( cp->func, tmp, neg_half ); /* -.5 * a */
- sse_mulss( cp->func, tmp, r ); /* -.5 * a * r */
- sse_mulss( cp->func, tmp, r ); /* -.5 * a * r * r */
- sse_addss( cp->func, tmp, one_point_five ); /* 1.5 - .5 * a * r * r */
- sse_mulss( cp->func, r, tmp ); /* r * (1.5 - .5 * a * r * r) */
-
- store_scalar_dest(cp, &op->Dst[0], r);
-
- aos_release_xmm_reg(cp, tmp.idx);
-
- return TRUE;
- }
-}
-
-
-static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
- sse_andps(cp->func, dst, ones);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- x87_fld_src(cp, &op->Src[0], 0);
- x87_fsin(cp->func);
- x87_fstp_dest4(cp, &op->Dst[0]);
- return TRUE;
-}
-
-
-
-static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_cmpps(cp->func, dst, arg1, cc_LessThan);
- sse_andps(cp->func, dst, ones);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_subps(cp->func, dst, arg1);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg tmp0 = aos_get_xmm_reg(cp);
-
- sse2_cvttps2dq(cp->func, tmp0, arg0);
- sse2_cvtdq2ps(cp->func, tmp0, tmp0);
-
- store_dest(cp, &op->Dst[0], tmp0);
- return TRUE;
-}
-
-static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg tmp0 = aos_get_xmm_reg(cp);
- struct x86_reg tmp1 = aos_get_xmm_reg(cp);
-
- emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
- sse_mulps(cp->func, tmp1, arg0);
- emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
- sse_mulps(cp->func, tmp0, arg1);
- sse_subps(cp->func, tmp1, tmp0);
- sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));
-
-/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
-/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
-/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
-/* dst[3] is undef */
-
-
- aos_release_xmm_reg(cp, tmp0.idx);
- store_dest(cp, &op->Dst[0], tmp1);
- return TRUE;
-}
-
-
-
-static boolean
-emit_instruction( struct aos_compilation *cp,
- struct tgsi_full_instruction *inst )
-{
- x87_assert_stack_empty(cp->func);
-
- switch( inst->Instruction.Opcode ) {
- case TGSI_OPCODE_MOV:
- return emit_MOV( cp, inst );
-
- case TGSI_OPCODE_LIT:
- return emit_LIT(cp, inst);
-
- case TGSI_OPCODE_RCP:
- return emit_RCP(cp, inst);
-
- case TGSI_OPCODE_RSQ:
- return emit_RSQ(cp, inst);
-
- case TGSI_OPCODE_EXP:
- /*return emit_EXP(cp, inst);*/
- return FALSE;
-
- case TGSI_OPCODE_LOG:
- /*return emit_LOG(cp, inst);*/
- return FALSE;
-
- case TGSI_OPCODE_MUL:
- return emit_MUL(cp, inst);
-
- case TGSI_OPCODE_ADD:
- return emit_ADD(cp, inst);
-
- case TGSI_OPCODE_DP3:
- return emit_DP3(cp, inst);
-
- case TGSI_OPCODE_DP4:
- return emit_DP4(cp, inst);
-
- case TGSI_OPCODE_DST:
- return emit_DST(cp, inst);
-
- case TGSI_OPCODE_MIN:
- return emit_MIN(cp, inst);
-
- case TGSI_OPCODE_MAX:
- return emit_MAX(cp, inst);
-
- case TGSI_OPCODE_SLT:
- return emit_SLT(cp, inst);
-
- case TGSI_OPCODE_SGE:
- return emit_SGE(cp, inst);
-
- case TGSI_OPCODE_MAD:
- return emit_MAD(cp, inst);
-
- case TGSI_OPCODE_SUB:
- return emit_SUB(cp, inst);
-
- case TGSI_OPCODE_LRP:
- /*return emit_LERP(cp, inst);*/
- return FALSE;
-
- case TGSI_OPCODE_FRC:
- return emit_FRC(cp, inst);
-
- case TGSI_OPCODE_CLAMP:
- /*return emit_CLAMP(cp, inst);*/
- return FALSE;
-
- case TGSI_OPCODE_FLR:
- return emit_FLR(cp, inst);
-
- case TGSI_OPCODE_ROUND:
- return emit_RND(cp, inst);
-
- case TGSI_OPCODE_EX2:
-#if FAST_MATH
- return emit_EXPBASE2(cp, inst);
-#elif 0
- /* this seems to fail for "larger" exponents.
- * See glean tvertProg1's EX2 test.
- */
- return emit_EX2(cp, inst);
-#else
- return FALSE;
-#endif
-
- case TGSI_OPCODE_LG2:
- return emit_LG2(cp, inst);
-
- case TGSI_OPCODE_POW:
- return emit_POW(cp, inst);
-
- case TGSI_OPCODE_XPD:
- return emit_XPD(cp, inst);
-
- case TGSI_OPCODE_ABS:
- return emit_ABS(cp, inst);
-
- case TGSI_OPCODE_DPH:
- return emit_DPH(cp, inst);
-
- case TGSI_OPCODE_COS:
- return emit_COS(cp, inst);
-
- case TGSI_OPCODE_SIN:
- return emit_SIN(cp, inst);
-
- case TGSI_OPCODE_TRUNC:
- return emit_TRUNC(cp, inst);
-
- case TGSI_OPCODE_END:
- return TRUE;
-
- default:
- return FALSE;
- }
-}
-
-
-static boolean emit_viewport( struct aos_compilation *cp )
-{
- struct x86_reg pos = aos_get_shader_reg_xmm(cp,
- TGSI_FILE_OUTPUT,
- cp->vaos->draw->vs.position_output );
-
- struct x86_reg scale = x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, scale));
-
- struct x86_reg translate = x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, translate));
-
- sse_mulps(cp->func, pos, scale);
- sse_addps(cp->func, pos, translate);
-
- aos_adopt_xmm_reg( cp,
- pos,
- TGSI_FILE_OUTPUT,
- cp->vaos->draw->vs.position_output,
- TRUE );
- return TRUE;
-}
-
-
-/* This is useful to be able to see the results on softpipe. Doesn't
- * do proper clipping, just assumes the backend can do it during
- * rasterization -- for debug only...
- */
-static boolean emit_rhw_viewport( struct aos_compilation *cp )
-{
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg pos = aos_get_shader_reg_xmm(cp,
- TGSI_FILE_OUTPUT,
- cp->vaos->draw->vs.position_output);
-
- struct x86_reg scale = x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, scale));
-
- struct x86_reg translate = x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, translate));
-
-
-
- emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
- sse2_rcpss(cp->func, tmp, tmp);
- sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));
-
- sse_mulps(cp->func, pos, scale);
- sse_mulps(cp->func, pos, tmp);
- sse_addps(cp->func, pos, translate);
-
- /* Set pos[3] = w
- */
- mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);
-
- aos_adopt_xmm_reg( cp,
- pos,
- TGSI_FILE_OUTPUT,
- cp->vaos->draw->vs.position_output,
- TRUE );
- return TRUE;
-}
-
-
-#if 0
-static boolean note_immediate( struct aos_compilation *cp,
- struct tgsi_full_immediate *imm )
-{
- unsigned pos = cp->num_immediates++;
- unsigned j;
-
- assert( imm->Immediate.NrTokens <= 4 + 1 );
- for (j = 0; j < imm->Immediate.NrTokens - 1; j++) {
- cp->vaos->machine->immediate[pos][j] = imm->u[j].Float;
- }
-
- return TRUE;
-}
-#endif
-
-
-
-
-static void find_last_write_outputs( struct aos_compilation *cp )
-{
- struct tgsi_parse_context parse;
- unsigned this_instruction = 0;
- unsigned i;
-
- tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
-
- while (!tgsi_parse_end_of_tokens( &parse )) {
-
- tgsi_parse_token( &parse );
-
- if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
- continue;
-
- for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
- if (parse.FullToken.FullInstruction.Dst[i].Register.File ==
- TGSI_FILE_OUTPUT)
- {
- unsigned idx = parse.FullToken.FullInstruction.Dst[i].Register.Index;
- cp->output_last_write[idx] = this_instruction;
- }
- }
-
- this_instruction++;
- }
-
- tgsi_parse_free( &parse );
-}
-
-
-#define ARG_MACHINE 1
-#define ARG_START_ELTS 2
-#define ARG_COUNT 3
-#define ARG_OUTBUF 4
-
-
-static boolean build_vertex_program( struct draw_vs_variant_aos_sse *variant,
- boolean linear )
-{
- struct tgsi_parse_context parse;
- struct aos_compilation cp;
- unsigned fixup, label;
-
- util_init_math();
-
- tgsi_parse_init( &parse, variant->base.vs->state.tokens );
-
- memset(&cp, 0, sizeof(cp));
-
- cp.insn_counter = 1;
- cp.vaos = variant;
- cp.have_sse2 = 1;
- cp.func = &variant->func[ linear ? 0 : 1 ];
-
- cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
- cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
- cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
- cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
- cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
- cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
- cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );
-
- x86_init_func(cp.func);
-
- find_last_write_outputs(&cp);
-
- x86_push(cp.func, cp.idx_EBX);
- x86_push(cp.func, cp.count_ESI);
- x86_push(cp.func, cp.temp_EBP);
-
-
- /* Load arguments into regs:
- */
- x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
- x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
- x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
- x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));
-
-
- /* Compare count to zero and possibly bail.
- */
- x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
- x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
- fixup = x86_jcc_forward(cp.func, cc_E);
-
-
- save_fpu_state( &cp );
- set_fpu_round_nearest( &cp );
-
- aos_init_inputs( &cp, linear );
-
- cp.x86_reg[0] = 0;
- cp.x86_reg[1] = 0;
-
- /* Note address for loop jump
- */
- label = x86_get_label(cp.func);
- {
- /* Fetch inputs... TODO: fetch lazily...
- */
- if (!aos_fetch_inputs( &cp, linear ))
- goto fail;
-
- /* Emit the shader:
- */
- while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
- {
- tgsi_parse_token( &parse );
-
- switch (parse.FullToken.Token.Type) {
- case TGSI_TOKEN_TYPE_IMMEDIATE:
-#if 0
- if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
- goto fail;
-#endif
- break;
-
- case TGSI_TOKEN_TYPE_INSTRUCTION:
- if (DISASSEM)
- tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );
-
- if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
- goto fail;
- break;
- }
-
- x87_assert_stack_empty(cp.func);
- cp.insn_counter++;
-
- if (DISASSEM)
- debug_printf("\n");
- }
-
-
- {
- unsigned i;
- for (i = 0; i < 8; i++) {
- if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
- cp.xmm[i].file = TGSI_FILE_NULL;
- cp.xmm[i].dirty = 0;
- }
- }
- }
-
- if (cp.error)
- goto fail;
-
- if (cp.vaos->base.key.clip) {
- /* not really handling clipping, just do the rhw so we can
- * see the results...
- */
- emit_rhw_viewport(&cp);
- }
- else if (cp.vaos->base.key.viewport) {
- emit_viewport(&cp);
- }
-
- /* Emit output... TODO: do this eagerly after the last write to a
- * given output.
- */
- if (!aos_emit_outputs( &cp ))
- goto fail;
-
-
- /* Next vertex:
- */
- x86_lea(cp.func,
- cp.outbuf_ECX,
- x86_make_disp(cp.outbuf_ECX,
- cp.vaos->base.key.output_stride));
-
- /* Incr index
- */
- aos_incr_inputs( &cp, linear );
- }
- /* decr count, loop if not zero
- */
- x86_dec(cp.func, cp.count_ESI);
- x86_jcc(cp.func, cc_NZ, label);
-
- restore_fpu_state(&cp);
-
- /* Land forward jump here:
- */
- x86_fixup_fwd_jump(cp.func, fixup);
-
- /* Exit mmx state?
- */
- if (cp.func->need_emms)
- mmx_emms(cp.func);
-
- x86_pop(cp.func, cp.temp_EBP);
- x86_pop(cp.func, cp.count_ESI);
- x86_pop(cp.func, cp.idx_EBX);
-
- x87_assert_stack_empty(cp.func);
- x86_ret(cp.func);
-
- tgsi_parse_free( &parse );
- return !cp.error;
-
- fail:
- tgsi_parse_free( &parse );
- return FALSE;
-}
-
-
-/** cast wrapper */
-static INLINE struct draw_vs_variant_aos_sse *
-draw_vs_variant_aos_sse(struct draw_vs_variant *variant)
-{
- return (struct draw_vs_variant_aos_sse *) variant;
-}
-
-
-static void vaos_set_buffer( struct draw_vs_variant *variant,
- unsigned buf,
- const void *ptr,
- unsigned stride,
- unsigned max_stride)
-{
- struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
-
- if (buf < vaos->nr_vb) {
- vaos->buffer[buf].base_ptr = (char *)ptr;
- vaos->buffer[buf].stride = stride;
- }
-
- if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
-}
-
-
-
-static void PIPE_CDECL vaos_run_elts( struct draw_vs_variant *variant,
- const unsigned *elts,
- unsigned count,
- void *output_buffer )
-{
- struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
- struct aos_machine *machine = vaos->draw->vs.aos_machine;
- unsigned i;
-
- if (0) debug_printf("%s %d\n", __FUNCTION__, count);
-
- machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
- for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
- machine->constants[i] = vaos->draw->vs.aligned_constants[i];
- }
- machine->immediates = vaos->base.vs->immediates;
- machine->buffer = vaos->buffer;
-
- vaos->gen_run_elts( machine,
- elts,
- count,
- output_buffer );
-}
-
-static void PIPE_CDECL vaos_run_linear( struct draw_vs_variant *variant,
- unsigned start,
- unsigned count,
- void *output_buffer )
-{
- struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
- struct aos_machine *machine = vaos->draw->vs.aos_machine;
- unsigned i;
-
- if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
- vaos->base.key.const_vbuffers);
-
- machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
- for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
- machine->constants[i] = vaos->draw->vs.aligned_constants[i];
- }
- machine->immediates = vaos->base.vs->immediates;
- machine->buffer = vaos->buffer;
-
- vaos->gen_run_linear( machine,
- start,
- count,
- output_buffer );
-
- /* Sanity spot checks to make sure we didn't trash our constants */
- assert(machine->internal[IMM_ONES][0] == 1.0f);
- assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
- assert(machine->internal[IMM_NEGS][0] == -1.0f);
-}
-
-
-
-static void vaos_destroy( struct draw_vs_variant *variant )
-{
- struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
-
- FREE( vaos->buffer );
-
- x86_release_func( &vaos->func[0] );
- x86_release_func( &vaos->func[1] );
-
- FREE(vaos);
-}
-
-
-
-static struct draw_vs_variant *variant_aos_sse( struct draw_vertex_shader *vs,
- const struct draw_vs_variant_key *key )
-{
- unsigned i;
- struct draw_vs_variant_aos_sse *vaos = CALLOC_STRUCT(draw_vs_variant_aos_sse);
-
- if (!vaos)
- goto fail;
-
- vaos->base.key = *key;
- vaos->base.vs = vs;
- vaos->base.set_buffer = vaos_set_buffer;
- vaos->base.destroy = vaos_destroy;
- vaos->base.run_linear = vaos_run_linear;
- vaos->base.run_elts = vaos_run_elts;
-
- vaos->draw = vs->draw;
-
- for (i = 0; i < key->nr_inputs; i++)
- vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
-
- vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
- if (!vaos->buffer)
- goto fail;
-
- if (0)
- debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
-
-#if 0
- tgsi_dump(vs->state.tokens, 0);
-#endif
-
- if (!build_vertex_program( vaos, TRUE ))
- goto fail;
-
- if (!build_vertex_program( vaos, FALSE ))
- goto fail;
-
- vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
- if (!vaos->gen_run_linear)
- goto fail;
-
- vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
- if (!vaos->gen_run_elts)
- goto fail;
-
- return &vaos->base;
-
- fail:
- if (vaos && vaos->buffer)
- FREE(vaos->buffer);
-
- if (vaos)
- x86_release_func( &vaos->func[0] );
-
- if (vaos)
- x86_release_func( &vaos->func[1] );
-
- FREE(vaos);
-
- return NULL;
-}
-
-
-struct draw_vs_variant *
-draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs,
- const struct draw_vs_variant_key *key )
-{
- struct draw_vs_variant *variant = variant_aos_sse( vs, key );
-
- if (variant == NULL) {
- variant = draw_vs_create_variant_generic( vs, key );
- }
-
- return variant;
-}
-
-
-
-#endif /* PIPE_ARCH_X86 */
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
deleted file mode 100644
index 55e63d8b9fa..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/* Authors: Keith Whitwell <[email protected]>
- */
-
-#ifndef DRAW_VS_AOS_H
-#define DRAW_VS_AOS_H
-
-#include "pipe/p_config.h"
-#include "tgsi/tgsi_exec.h"
-#include "draw_vs.h"
-
-#ifdef PIPE_ARCH_X86
-
-struct tgsi_token;
-struct x86_function;
-
-#include "pipe/p_state.h"
-#include "rtasm/rtasm_x86sse.h"
-
-
-
-
-
-#define X 0
-#define Y 1
-#define Z 2
-#define W 3
-
-#define MAX_INPUTS PIPE_MAX_ATTRIBS
-#define MAX_OUTPUTS PIPE_MAX_SHADER_OUTPUTS
-#define MAX_TEMPS TGSI_EXEC_NUM_TEMPS
-#define MAX_CONSTANTS 1024 /** only used for sanity checking */
-#define MAX_IMMEDIATES 1024 /** only used for sanity checking */
-#define MAX_INTERNALS 8 /** see IMM_x values below */
-
-#define AOS_FILE_INTERNAL TGSI_FILE_COUNT
-
-#define FPU_RND_NEG 1
-#define FPU_RND_NEAREST 2
-
-struct aos_machine;
-typedef void (PIPE_CDECL *lit_func)( struct aos_machine *,
- float *result,
- const float *in,
- unsigned count );
-
-void PIPE_CDECL aos_do_lit( struct aos_machine *machine,
- float *result,
- const float *in,
- unsigned count );
-
-struct shine_tab {
- float exponent;
- float values[258];
- unsigned last_used;
-};
-
-struct lit_info {
- lit_func func;
- struct shine_tab *shine_tab;
-};
-
-#define MAX_SHINE_TAB 4
-#define MAX_LIT_INFO 16
-
-struct aos_buffer {
- const void *base_ptr;
- unsigned stride;
- void *ptr; /* updated per vertex */
-};
-
-
-
-
-/* This is the temporary storage used by all the aos_sse vs variants.
- * Create one per context and reuse by passing a pointer in at
- * vs_variant creation??
- */
-struct aos_machine {
- float input [MAX_INPUTS ][4];
- float output [MAX_OUTPUTS ][4];
- float temp [MAX_TEMPS ][4];
- float internal [MAX_INTERNALS ][4];
-
- float scale[4]; /* viewport */
- float translate[4]; /* viewport */
-
- float tmp[2][4]; /* scratch space for LIT */
-
- struct shine_tab shine_tab[MAX_SHINE_TAB];
- struct lit_info lit_info[MAX_LIT_INFO];
- unsigned now;
-
-
- ushort fpu_rnd_nearest;
- ushort fpu_rnd_neg_inf;
- ushort fpu_restore;
- ushort fpucntl; /* one of FPU_* above */
-
- const float (*immediates)[4]; /* points to shader data */
- const void *constants[PIPE_MAX_CONSTANT_BUFFERS]; /* points to draw data */
-
- const struct aos_buffer *buffer; /* points to ? */
-};
-
-
-
-
-struct aos_compilation {
- struct x86_function *func;
- struct draw_vs_variant_aos_sse *vaos;
-
- unsigned insn_counter;
- unsigned num_immediates;
- unsigned count;
- unsigned lit_count;
-
- struct {
- unsigned idx:16;
- unsigned file:8;
- unsigned dirty:8;
- unsigned last_used;
- } xmm[8];
-
- unsigned x86_reg[2]; /* one of X86_* */
-
- boolean input_fetched[PIPE_MAX_ATTRIBS];
- unsigned output_last_write[PIPE_MAX_ATTRIBS];
-
- boolean have_sse2;
- boolean error;
- short fpucntl;
-
- /* these are actually known values, but putting them in a struct
- * like this is helpful to keep them in sync across the file.
- */
- struct x86_reg tmp_EAX;
- struct x86_reg idx_EBX; /* either start+i or &elt[i] */
- struct x86_reg outbuf_ECX;
- struct x86_reg machine_EDX;
- struct x86_reg count_ESI; /* decrements to zero */
- struct x86_reg temp_EBP;
- struct x86_reg stack_ESP;
-};
-
-struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp );
-void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx );
-
-void aos_adopt_xmm_reg( struct aos_compilation *cp,
- struct x86_reg reg,
- unsigned file,
- unsigned idx,
- unsigned dirty );
-
-void aos_spill_all( struct aos_compilation *cp );
-
-struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
- unsigned file,
- unsigned idx );
-
-boolean aos_init_inputs( struct aos_compilation *cp, boolean linear );
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear );
-boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear );
-
-boolean aos_emit_outputs( struct aos_compilation *cp );
-
-
-#define IMM_ONES 0 /* 1, 1,1,1 */
-#define IMM_SWZ 1 /* 1,-1,0, 0xffffffff */
-#define IMM_IDENTITY 2 /* 0, 0,0,1 */
-#define IMM_INV_255 3 /* 1/255, 1/255, 1/255, 1/255 */
-#define IMM_255 4 /* 255, 255, 255, 255 */
-#define IMM_NEGS 5 /* -1,-1,-1,-1 */
-#define IMM_RSQ 6 /* -.5,1.5,_,_ */
-#define IMM_PSIZE 7 /* not really an immediate - updated each run */
-
-struct x86_reg aos_get_internal( struct aos_compilation *cp,
- unsigned imm );
-struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
- unsigned imm );
-
-
-#define AOS_ERROR(cp, msg) \
-do { \
- if (0) debug_printf("%s: x86 translation failed: %s\n", __FUNCTION__, msg); \
- cp->error = 1; \
-} while (0)
-
-
-#define X86_NULL 0
-#define X86_IMMEDIATES 1
-#define X86_CONSTANTS 2
-#define X86_BUFFERS 3
-
-struct x86_reg aos_get_x86( struct aos_compilation *cp,
- unsigned which_reg,
- unsigned value );
-
-
-typedef void (PIPE_CDECL *vaos_run_elts_func)( struct aos_machine *,
- const unsigned *elts,
- unsigned count,
- void *output_buffer);
-
-typedef void (PIPE_CDECL *vaos_run_linear_func)( struct aos_machine *,
- unsigned start,
- unsigned count,
- void *output_buffer);
-
-
-struct draw_vs_variant_aos_sse {
- struct draw_vs_variant base;
- struct draw_context *draw;
-
- struct aos_buffer *buffer;
- unsigned nr_vb;
-
- vaos_run_linear_func gen_run_linear;
- vaos_run_elts_func gen_run_elts;
-
-
- struct x86_function func[2];
-};
-
-
-#endif
-
-#endif
-
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
deleted file mode 100644
index f1dd4487732..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ /dev/null
@@ -1,460 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include "util/u_memory.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
-#include "draw_vs.h"
-#include "draw_vs_aos.h"
-#include "draw_vertex.h"
-
-#include "rtasm/rtasm_x86sse.h"
-
-#ifdef PIPE_ARCH_X86
-
-/* Note - don't yet have to worry about interacting with the code in
- * draw_vs_aos.c as there is no intermingling of generated code...
- * That may have to change, we'll see.
- */
-static void emit_load_R32G32B32A32( struct aos_compilation *cp,
- struct x86_reg data,
- struct x86_reg src_ptr )
-{
- sse_movups(cp->func, data, src_ptr);
-}
-
-static void emit_load_R32G32B32( struct aos_compilation *cp,
- struct x86_reg data,
- struct x86_reg src_ptr )
-{
-#if 1
- sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
- /* data = z ? ? ? */
- sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
- /* data = z ? 0 1 */
- sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) );
- /* data = ? 0 z 1 */
- sse_movlps(cp->func, data, src_ptr);
- /* data = x y z 1 */
-#else
- sse_movups(cp->func, data, src_ptr);
- /* data = x y z ? */
- sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) );
- /* data = ? x y z */
- sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) );
- /* data = 1 x y z */
- sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) );
- /* data = x y z 1 */
-#endif
-}
-
-static void emit_load_R32G32( struct aos_compilation *cp,
- struct x86_reg data,
- struct x86_reg src_ptr )
-{
- sse_movups(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) );
- sse_movlps(cp->func, data, src_ptr);
-}
-
-
-static void emit_load_R32( struct aos_compilation *cp,
- struct x86_reg data,
- struct x86_reg src_ptr )
-{
- sse_movss(cp->func, data, src_ptr);
- sse_orps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) );
-}
-
-
-static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp,
- struct x86_reg data,
- struct x86_reg src_ptr )
-{
- sse_movss(cp->func, data, src_ptr);
- sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ));
- sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ));
- sse2_cvtdq2ps(cp->func, data, data);
- sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255));
-}
-
-
-
-/* Extended swizzles? Maybe later.
- */
-static void emit_swizzle( struct aos_compilation *cp,
- struct x86_reg dest,
- struct x86_reg src,
- ubyte shuffle )
-{
- sse_shufps(cp->func, dest, src, shuffle);
-}
-
-
-
-static boolean get_buffer_ptr( struct aos_compilation *cp,
- boolean linear,
- unsigned buf_idx,
- struct x86_reg elt,
- struct x86_reg ptr)
-{
- struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
- buf_idx * sizeof(struct aos_buffer));
-
- struct x86_reg buf_stride = x86_make_disp(buf,
- Offset(struct aos_buffer, stride));
- if (linear) {
- struct x86_reg buf_ptr = x86_make_disp(buf,
- Offset(struct aos_buffer, ptr));
-
-
- /* Calculate pointer to current attrib:
- */
- x86_mov(cp->func, ptr, buf_ptr);
- x86_mov(cp->func, elt, buf_stride);
- x86_add(cp->func, elt, ptr);
- if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192));
- x86_mov(cp->func, buf_ptr, elt);
- }
- else {
- struct x86_reg buf_base_ptr = x86_make_disp(buf,
- Offset(struct aos_buffer, base_ptr));
-
-
- /* Calculate pointer to current attrib:
- */
- x86_mov(cp->func, ptr, buf_stride);
- x86_imul(cp->func, ptr, elt);
- x86_add(cp->func, ptr, buf_base_ptr);
- }
-
- cp->insn_counter++;
-
- return TRUE;
-}
-
-
-static boolean load_input( struct aos_compilation *cp,
- unsigned idx,
- struct x86_reg bufptr )
-{
- unsigned format = cp->vaos->base.key.element[idx].in.format;
- unsigned offset = cp->vaos->base.key.element[idx].in.offset;
- struct x86_reg dataXMM = aos_get_xmm_reg(cp);
-
- /* Figure out source pointer address:
- */
- struct x86_reg src = x86_make_disp(bufptr, offset);
-
- aos_adopt_xmm_reg( cp,
- dataXMM,
- TGSI_FILE_INPUT,
- idx,
- TRUE );
-
- switch (format) {
- case PIPE_FORMAT_R32_FLOAT:
- emit_load_R32(cp, dataXMM, src);
- break;
- case PIPE_FORMAT_R32G32_FLOAT:
- emit_load_R32G32(cp, dataXMM, src);
- break;
- case PIPE_FORMAT_R32G32B32_FLOAT:
- emit_load_R32G32B32(cp, dataXMM, src);
- break;
- case PIPE_FORMAT_R32G32B32A32_FLOAT:
- emit_load_R32G32B32A32(cp, dataXMM, src);
- break;
- case PIPE_FORMAT_A8R8G8B8_UNORM:
- emit_load_R8G8B8A8_UNORM(cp, dataXMM, src);
- emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
- break;
- case PIPE_FORMAT_R8G8B8A8_UNORM:
- emit_load_R8G8B8A8_UNORM(cp, dataXMM, src);
- break;
- default:
- AOS_ERROR(cp, "unhandled input format");
- return FALSE;
- }
-
- return TRUE;
-}
-
-static boolean load_inputs( struct aos_compilation *cp,
- unsigned buffer,
- struct x86_reg ptr )
-{
- unsigned i;
-
- for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) {
- if (cp->vaos->base.key.element[i].in.buffer == buffer) {
-
- if (!load_input( cp, i, ptr ))
- return FALSE;
-
- cp->insn_counter++;
- }
- }
-
- return TRUE;
-}
-
-boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
-{
- unsigned i;
- for (i = 0; i < cp->vaos->nr_vb; i++) {
- struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
- i * sizeof(struct aos_buffer));
-
- struct x86_reg buf_base_ptr = x86_make_disp(buf,
- Offset(struct aos_buffer, base_ptr));
-
- if (cp->vaos->base.key.const_vbuffers & (1<<i)) {
- struct x86_reg ptr = cp->tmp_EAX;
-
- x86_mov(cp->func, ptr, buf_base_ptr);
-
- /* Load all inputs for this constant vertex buffer
- */
- load_inputs( cp, i, x86_deref(ptr) );
-
- /* Then just force them out to aos_machine.input[]
- */
- aos_spill_all( cp );
-
- }
- else if (linear) {
-
- struct x86_reg elt = cp->idx_EBX;
- struct x86_reg ptr = cp->tmp_EAX;
-
- struct x86_reg buf_stride = x86_make_disp(buf,
- Offset(struct aos_buffer, stride));
-
- struct x86_reg buf_ptr = x86_make_disp(buf,
- Offset(struct aos_buffer, ptr));
-
-
- /* Calculate pointer to current attrib:
- */
- x86_mov(cp->func, ptr, buf_stride);
- x86_imul(cp->func, ptr, elt);
- x86_add(cp->func, ptr, buf_base_ptr);
-
-
- /* In the linear case, keep the buffer pointer instead of the
- * index number.
- */
- if (cp->vaos->nr_vb == 1)
- x86_mov( cp->func, elt, ptr );
- else
- x86_mov( cp->func, buf_ptr, ptr );
-
- cp->insn_counter++;
- }
- }
-
- return TRUE;
-}
-
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
-{
- unsigned j;
-
- for (j = 0; j < cp->vaos->nr_vb; j++) {
- if (cp->vaos->base.key.const_vbuffers & (1<<j)) {
- /* just retreive pre-transformed input */
- }
- else if (linear && cp->vaos->nr_vb == 1) {
- load_inputs( cp, 0, cp->idx_EBX );
- }
- else {
- struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
- struct x86_reg ptr = cp->tmp_EAX;
-
- if (!get_buffer_ptr( cp, linear, j, elt, ptr ))
- return FALSE;
-
- if (!load_inputs( cp, j, ptr ))
- return FALSE;
- }
- }
-
- return TRUE;
-}
-
-boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
-{
- if (linear && cp->vaos->nr_vb == 1) {
- struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
- (0 * sizeof(struct aos_buffer) +
- Offset(struct aos_buffer, stride)));
-
- x86_add(cp->func, cp->idx_EBX, stride);
- sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192));
- }
- else if (linear) {
- /* Nothing to do */
- }
- else {
- x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
- }
-
- return TRUE;
-}
-
-
-
-
-
-
-static void emit_store_R32G32B32A32( struct aos_compilation *cp,
- struct x86_reg dst_ptr,
- struct x86_reg dataXMM )
-{
- sse_movups(cp->func, dst_ptr, dataXMM);
-}
-
-static void emit_store_R32G32B32( struct aos_compilation *cp,
- struct x86_reg dst_ptr,
- struct x86_reg dataXMM )
-{
- sse_movlps(cp->func, dst_ptr, dataXMM);
- sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
- sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM);
-}
-
-static void emit_store_R32G32( struct aos_compilation *cp,
- struct x86_reg dst_ptr,
- struct x86_reg dataXMM )
-{
- sse_movlps(cp->func, dst_ptr, dataXMM);
-}
-
-static void emit_store_R32( struct aos_compilation *cp,
- struct x86_reg dst_ptr,
- struct x86_reg dataXMM )
-{
- sse_movss(cp->func, dst_ptr, dataXMM);
-}
-
-
-
-static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp,
- struct x86_reg dst_ptr,
- struct x86_reg dataXMM )
-{
- sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255));
- sse2_cvtps2dq(cp->func, dataXMM, dataXMM);
- sse2_packssdw(cp->func, dataXMM, dataXMM);
- sse2_packuswb(cp->func, dataXMM, dataXMM);
- sse_movss(cp->func, dst_ptr, dataXMM);
-}
-
-
-
-
-
-static boolean emit_output( struct aos_compilation *cp,
- struct x86_reg ptr,
- struct x86_reg dataXMM,
- enum attrib_emit format )
-{
- switch (format) {
- case EMIT_1F:
- case EMIT_1F_PSIZE:
- emit_store_R32(cp, ptr, dataXMM);
- break;
- case EMIT_2F:
- emit_store_R32G32(cp, ptr, dataXMM);
- break;
- case EMIT_3F:
- emit_store_R32G32B32(cp, ptr, dataXMM);
- break;
- case EMIT_4F:
- emit_store_R32G32B32A32(cp, ptr, dataXMM);
- break;
- case EMIT_4UB:
- emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
- break;
- case EMIT_4UB_BGRA:
- emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
- emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
- break;
- default:
- AOS_ERROR(cp, "unhandled output format");
- return FALSE;
- }
-
- return TRUE;
-}
-
-
-
-boolean aos_emit_outputs( struct aos_compilation *cp )
-{
- unsigned i;
-
- for (i = 0; i < cp->vaos->base.key.nr_outputs; i++) {
- enum attrib_emit format = cp->vaos->base.key.element[i].out.format;
- unsigned offset = cp->vaos->base.key.element[i].out.offset;
- unsigned vs_output = cp->vaos->base.key.element[i].out.vs_output;
-
- struct x86_reg data;
-
- if (format == EMIT_1F_PSIZE) {
- data = aos_get_internal_xmm( cp, IMM_PSIZE );
- }
- else {
- data = aos_get_shader_reg( cp,
- TGSI_FILE_OUTPUT,
- vs_output );
- }
-
- if (data.file != file_XMM) {
- struct x86_reg tmp = aos_get_xmm_reg( cp );
- sse_movaps(cp->func, tmp, data);
- data = tmp;
- }
-
- if (!emit_output( cp,
- x86_make_disp( cp->outbuf_ECX, offset ),
- data,
- format ))
- return FALSE;
-
- aos_release_xmm_reg( cp, data.idx );
-
- cp->insn_counter++;
- }
-
- return TRUE;
-}
-
-#endif
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
deleted file mode 100644
index 0eda414ee6a..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
+++ /dev/null
@@ -1,328 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include "pipe/p_config.h"
-
-
-#include "pipe/p_shader_tokens.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
-#include "draw_vs.h"
-#include "draw_vs_aos.h"
-#include "draw_vertex.h"
-
-#ifdef PIPE_ARCH_X86
-
-#include "rtasm/rtasm_x86sse.h"
-
-
-#define X87_CW_EXCEPTION_INV_OP (1<<0)
-#define X87_CW_EXCEPTION_DENORM_OP (1<<1)
-#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
-#define X87_CW_EXCEPTION_OVERFLOW (1<<3)
-#define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
-#define X87_CW_EXCEPTION_PRECISION (1<<5)
-#define X87_CW_PRECISION_SINGLE (0<<8)
-#define X87_CW_PRECISION_RESERVED (1<<8)
-#define X87_CW_PRECISION_DOUBLE (2<<8)
-#define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
-#define X87_CW_PRECISION_MASK (3<<8)
-#define X87_CW_ROUND_NEAREST (0<<10)
-#define X87_CW_ROUND_DOWN (1<<10)
-#define X87_CW_ROUND_UP (2<<10)
-#define X87_CW_ROUND_ZERO (3<<10)
-#define X87_CW_ROUND_MASK (3<<10)
-#define X87_CW_INFINITY (1<<12)
-
-
-void PIPE_CDECL aos_do_lit( struct aos_machine *machine,
- float *result,
- const float *in,
- unsigned count )
-{
- if (in[0] > 0)
- {
- if (in[1] <= 0.0)
- {
- result[0] = 1.0F;
- result[1] = in[0];
- result[2] = 0.0F;
- result[3] = 1.0F;
- }
- else
- {
- const float epsilon = 1.0F / 256.0F;
- float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon));
- result[0] = 1.0F;
- result[1] = in[0];
- result[2] = powf(in[1], exponent);
- result[3] = 1.0;
- }
- }
- else
- {
- result[0] = 1.0F;
- result[1] = 0.0;
- result[2] = 0.0;
- result[3] = 1.0F;
- }
-}
-
-
-static void PIPE_CDECL do_lit_lut( struct aos_machine *machine,
- float *result,
- const float *in,
- unsigned count )
-{
- if (in[0] > 0)
- {
- if (in[1] <= 0.0)
- {
- result[0] = 1.0F;
- result[1] = in[0];
- result[2] = 0.0F;
- result[3] = 1.0F;
- return;
- }
-
- if (machine->lit_info[count].shine_tab->exponent != in[3]) {
- machine->lit_info[count].func = aos_do_lit;
- goto no_luck;
- }
-
- if (in[1] <= 1.0)
- {
- const float *tab = machine->lit_info[count].shine_tab->values;
- float f = in[1] * 256;
- int k = (int)f;
- float frac = f - (float)k;
-
- result[0] = 1.0F;
- result[1] = in[0];
- result[2] = tab[k] + frac*(tab[k+1]-tab[k]);
- result[3] = 1.0;
- return;
- }
-
- no_luck:
- {
- const float epsilon = 1.0F / 256.0F;
- float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon));
- result[0] = 1.0F;
- result[1] = in[0];
- result[2] = powf(in[1], exponent);
- result[3] = 1.0;
- }
- }
- else
- {
- result[0] = 1.0F;
- result[1] = 0.0;
- result[2] = 0.0;
- result[3] = 1.0F;
- }
-}
-
-
-static void do_populate_lut( struct shine_tab *tab,
- float unclamped_exponent )
-{
- const float epsilon = 1.0F / 256.0F;
- float exponent = CLAMP(unclamped_exponent, -(128.0F - epsilon), (128.0F - epsilon));
- unsigned i;
-
- tab->exponent = unclamped_exponent; /* for later comparison */
-
- tab->values[0] = 0;
- if (exponent == 0) {
- for (i = 1; i < 258; i++) {
- tab->values[i] = 1.0;
- }
- }
- else {
- for (i = 1; i < 258; i++) {
- tab->values[i] = powf((float)i * epsilon, exponent);
- }
- }
-}
-
-
-
-
-static void PIPE_CDECL populate_lut( struct aos_machine *machine,
- float *result,
- const float *in,
- unsigned count )
-{
- unsigned i, tab;
-
- /* Search for an existing table for this value. Note that without
- * static analysis we don't really know if in[3] will be constant,
- * but it usually is...
- */
- for (tab = 0; tab < 4; tab++) {
- if (machine->shine_tab[tab].exponent == in[3]) {
- goto found;
- }
- }
-
- for (tab = 0, i = 1; i < 4; i++) {
- if (machine->shine_tab[i].last_used < machine->shine_tab[tab].last_used)
- tab = i;
- }
-
- if (machine->shine_tab[tab].last_used == machine->now) {
- /* No unused tables (this is not a ffvertex program...). Just
- * call pow each time:
- */
- machine->lit_info[count].func = aos_do_lit;
- machine->lit_info[count].func( machine, result, in, count );
- return;
- }
- else {
- do_populate_lut( &machine->shine_tab[tab], in[3] );
- }
-
- found:
- machine->shine_tab[tab].last_used = machine->now;
- machine->lit_info[count].shine_tab = &machine->shine_tab[tab];
- machine->lit_info[count].func = do_lit_lut;
- machine->lit_info[count].func( machine, result, in, count );
-}
-
-
-void
-draw_vs_aos_machine_constants(struct aos_machine *machine,
- unsigned slot,
- const void *constants)
-{
- machine->constants[slot] = constants;
-
- {
- unsigned i;
- for (i = 0; i < MAX_LIT_INFO; i++) {
- machine->lit_info[i].func = populate_lut;
- machine->now++;
- }
- }
-}
-
-
-void draw_vs_aos_machine_viewport( struct aos_machine *machine,
- const struct pipe_viewport_state *viewport )
-{
- memcpy(machine->scale, viewport->scale, 4 * sizeof(float));
- memcpy(machine->translate, viewport->translate, 4 * sizeof(float));
-}
-
-
-
-void draw_vs_aos_machine_destroy( struct aos_machine *machine )
-{
- align_free(machine);
-}
-
-struct aos_machine *draw_vs_aos_machine( void )
-{
- struct aos_machine *machine;
- unsigned i;
- float inv = 1.0f/255.0f;
- float f255 = 255.0f;
-
- machine = align_malloc(sizeof(struct aos_machine), 16);
- if (!machine)
- return NULL;
-
- memset(machine, 0, sizeof(*machine));
-
- ASSIGN_4V(machine->internal[IMM_SWZ], 1.0f, -1.0f, 0.0f, 1.0f);
- *(unsigned *)&machine->internal[IMM_SWZ][3] = 0xffffffff;
-
- ASSIGN_4V(machine->internal[IMM_ONES], 1.0f, 1.0f, 1.0f, 1.0f);
- ASSIGN_4V(machine->internal[IMM_NEGS], -1.0f, -1.0f, -1.0f, -1.0f);
- ASSIGN_4V(machine->internal[IMM_IDENTITY], 0.0f, 0.0f, 0.0f, 1.0f);
- ASSIGN_4V(machine->internal[IMM_INV_255], inv, inv, inv, inv);
- ASSIGN_4V(machine->internal[IMM_255], f255, f255, f255, f255);
- ASSIGN_4V(machine->internal[IMM_RSQ], -.5f, 1.5f, 0.0f, 0.0f);
-
-
- machine->fpu_rnd_nearest = (X87_CW_EXCEPTION_INV_OP |
- X87_CW_EXCEPTION_DENORM_OP |
- X87_CW_EXCEPTION_ZERO_DIVIDE |
- X87_CW_EXCEPTION_OVERFLOW |
- X87_CW_EXCEPTION_UNDERFLOW |
- X87_CW_EXCEPTION_PRECISION |
- (1<<6) |
- X87_CW_ROUND_NEAREST |
- X87_CW_PRECISION_DOUBLE_EXT);
-
- assert(machine->fpu_rnd_nearest == 0x37f);
-
- machine->fpu_rnd_neg_inf = (X87_CW_EXCEPTION_INV_OP |
- X87_CW_EXCEPTION_DENORM_OP |
- X87_CW_EXCEPTION_ZERO_DIVIDE |
- X87_CW_EXCEPTION_OVERFLOW |
- X87_CW_EXCEPTION_UNDERFLOW |
- X87_CW_EXCEPTION_PRECISION |
- (1<<6) |
- X87_CW_ROUND_DOWN |
- X87_CW_PRECISION_DOUBLE_EXT);
-
- for (i = 0; i < MAX_SHINE_TAB; i++)
- do_populate_lut( &machine->shine_tab[i], 1.0f );
-
- return machine;
-}
-
-#else
-
-void draw_vs_aos_machine_viewport( struct aos_machine *machine,
- const struct pipe_viewport_state *viewport )
-{
-}
-
-void
-draw_vs_aos_machine_constants(struct aos_machine *machine,
- unsigned slot,
- const void *constants)
-{
-}
-
-void draw_vs_aos_machine_destroy( struct aos_machine *machine )
-{
-}
-
-struct aos_machine *draw_vs_aos_machine( void )
-{
- return NULL;
-}
-#endif
-
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index cf894bbe8af..7fb0e0953e2 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -185,12 +185,7 @@ draw_create_vs_ppc(struct draw_context *draw,
tgsi_scan_shader(templ->tokens, &vs->base.info);
vs->base.draw = draw;
-#if 0
- if (1)
- vs->base.create_variant = draw_vs_variant_aos_ppc;
- else
-#endif
- vs->base.create_variant = draw_vs_create_variant_generic;
+ vs->base.create_variant = draw_vs_create_variant_generic;
vs->base.prepare = vs_ppc_prepare;
vs->base.run_linear = vs_ppc_run_linear;
vs->base.delete = vs_ppc_delete;
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
deleted file mode 100644
index d918579bda4..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ /dev/null
@@ -1,225 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
- * Authors:
- * Keith Whitwell <[email protected]>
- * Brian Paul
- */
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "pipe/p_config.h"
-
-#include "draw_vs.h"
-
-#if defined(PIPE_ARCH_X86)
-
-#include "pipe/p_shader_tokens.h"
-
-#include "draw_private.h"
-#include "draw_context.h"
-
-#include "rtasm/rtasm_cpu.h"
-#include "rtasm/rtasm_x86sse.h"
-#include "tgsi/tgsi_sse2.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_exec.h"
-
-#define SSE_MAX_VERTICES 4
-
-
-struct draw_sse_vertex_shader {
- struct draw_vertex_shader base;
- struct x86_function sse2_program;
-
- tgsi_sse2_vs_func func;
-
- struct tgsi_exec_machine *machine;
-};
-
-
-static void
-vs_sse_prepare( struct draw_vertex_shader *base,
- struct draw_context *draw )
-{
- struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
- struct tgsi_exec_machine *machine = shader->machine;
-
- machine->Samplers = draw->vs.samplers;
-
- if (base->info.uses_instanceid) {
- unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_INSTANCEID];
- assert(i < Elements(machine->SystemValue));
- machine->SystemValue[i][0] = base->draw->instance_id;
- }
-}
-
-
-
-/* Simplified vertex shader interface for the pt paths. Given the
- * complexity of code-generating all the above operations together,
- * it's time to try doing all the other stuff separately.
- */
-static void
-vs_sse_run_linear( struct draw_vertex_shader *base,
- const float (*input)[4],
- float (*output)[4],
- const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
- const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS],
- unsigned count,
- unsigned input_stride,
- unsigned output_stride )
-{
- struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
- struct tgsi_exec_machine *machine = shader->machine;
- unsigned int i;
-
- /* By default, execute all channels. XXX move this inside the loop
- * below when we support shader conditionals/loops.
- */
- tgsi_set_exec_mask(machine, 1, 1, 1, 1);
-
- for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
- unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
-
- if (max_vertices < 4) {
- /* disable the unused execution channels */
- tgsi_set_exec_mask(machine,
- 1,
- max_vertices > 1,
- max_vertices > 2,
- 0);
- }
-
- /* run compiled shader
- */
- shader->func(machine,
- (const float (*)[4])constants[0],
- shader->base.immediates,
- input,
- base->info.num_inputs,
- input_stride,
- output,
- base->info.num_outputs,
- output_stride );
-
- input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
- output = (float (*)[4])((char *)output + output_stride * max_vertices);
- }
-}
-
-
-
-
-static void
-vs_sse_delete( struct draw_vertex_shader *base )
-{
- struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
-
- x86_release_func( &shader->sse2_program );
-
- align_free( (void *) shader->base.immediates );
-
- FREE( (void*) shader->base.state.tokens );
- FREE( shader );
-}
-
-
-struct draw_vertex_shader *
-draw_create_vs_sse(struct draw_context *draw,
- const struct pipe_shader_state *templ)
-{
- struct draw_sse_vertex_shader *vs;
-
- if (!rtasm_cpu_has_sse2())
- return NULL;
-
- vs = CALLOC_STRUCT( draw_sse_vertex_shader );
- if (vs == NULL)
- return NULL;
-
- /* we make a private copy of the tokens */
- vs->base.state.tokens = tgsi_dup_tokens(templ->tokens);
- if (!vs->base.state.tokens)
- goto fail;
-
- tgsi_scan_shader(templ->tokens, &vs->base.info);
-
- vs->base.draw = draw;
- if (1)
- vs->base.create_variant = draw_vs_create_variant_aos_sse;
- else
- vs->base.create_variant = draw_vs_create_variant_generic;
- vs->base.prepare = vs_sse_prepare;
- vs->base.run_linear = vs_sse_run_linear;
- vs->base.delete = vs_sse_delete;
-
- vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
- sizeof(float), 16);
-
- vs->machine = draw->vs.machine;
-
- x86_init_func( &vs->sse2_program );
-
- if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens,
- &vs->sse2_program,
- (float (*)[4])vs->base.immediates,
- TRUE ))
- goto fail;
-
- vs->func = (tgsi_sse2_vs_func) x86_get_func( &vs->sse2_program );
- if (!vs->func) {
- goto fail;
- }
-
- return &vs->base;
-
-fail:
- if (0)
- debug_warning("tgsi_emit_sse2() failed, falling back to interpreter\n");
-
- x86_release_func( &vs->sse2_program );
-
- FREE(vs);
- return NULL;
-}
-
-
-
-#else
-
-struct draw_vertex_shader *
-draw_create_vs_sse( struct draw_context *draw,
- const struct pipe_shader_state *templ )
-{
- return (void *) 0;
-}
-
-
-#endif
-