summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author José Fonseca <[email protected]> 2011-11-08 00:10:47 +0000
committer José Fonseca <[email protected]> 2011-11-08 22:57:34 +0000
commit 4eb3225b38ce12cb34ab3d90804c9683bd7b4ed3 (patch)
tree 857d6c1740eb32fc86744f7afd81322862f6150c /src
parent 207a016ecaabbccf865a5b8e026b95a4276adc15 (diff)
Remove tgsi_sse2.
tgsi_exec is simple. llvm is fast. tgsi_sse2 ends up being neither.
Diffstat (limited to 'src')
-rw-r--r-- src/gallium/auxiliary/Makefile.sources 5
-rw-r--r-- src/gallium/auxiliary/draw/draw_private.h 4
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs.c 27
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs.h 20
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_aos.c 2267
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_aos.h 255
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_aos_io.c 460
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_aos_machine.c 328
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_ppc.c 7
-rw-r--r-- src/gallium/auxiliary/draw/draw_vs_sse.c 225
-rw-r--r-- src/gallium/auxiliary/tgsi/tgsi_sse2.c 3106
-rw-r--r-- src/gallium/auxiliary/tgsi/tgsi_sse2.h 80
-rw-r--r-- src/gallium/drivers/softpipe/Android.mk 1
-rw-r--r-- src/gallium/drivers/softpipe/Makefile 1
-rw-r--r-- src/gallium/drivers/softpipe/SConscript 1
-rw-r--r-- src/gallium/drivers/softpipe/sp_context.c 6
-rw-r--r-- src/gallium/drivers/softpipe/sp_context.h 1
-rw-r--r-- src/gallium/drivers/softpipe/sp_fs.h 4
-rw-r--r-- src/gallium/drivers/softpipe/sp_fs_sse.c 248
-rw-r--r-- src/gallium/drivers/softpipe/sp_state_shader.c 5
20 files changed, 3 insertions, 7048 deletions
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 766beb0fafc..baded909cec 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -33,12 +33,8 @@ C_SOURCES := \
draw/draw_pt_vsplit.c \
draw/draw_vertex.c \
draw/draw_vs.c \
- draw/draw_vs_aos.c \
- draw/draw_vs_aos_io.c \
- draw/draw_vs_aos_machine.c \
draw/draw_vs_exec.c \
draw/draw_vs_ppc.c \
- draw/draw_vs_sse.c \
draw/draw_vs_variant.c \
os/os_misc.c \
os/os_stream.c \
@@ -83,7 +79,6 @@ C_SOURCES := \
tgsi/tgsi_ppc.c \
tgsi/tgsi_sanity.c \
tgsi/tgsi_scan.c \
- tgsi/tgsi_sse2.c \
tgsi/tgsi_text.c \
tgsi/tgsi_transform.c \
tgsi/tgsi_ureg.c \
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index b84d2b77179..3521a035e2f 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -237,10 +237,6 @@ struct draw_context
uint num_samplers;
struct tgsi_sampler **samplers;
- /* Here's another one:
- */
- struct aos_machine *aos_machine;
-
const void *aligned_constants[PIPE_MAX_CONSTANT_BUFFERS];
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 1763dbc199f..957bbe57a82 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -81,14 +81,12 @@ draw_vs_set_constants(struct draw_context *draw,
}
draw->vs.aligned_constants[slot] = constants;
- draw_vs_aos_machine_constants(draw->vs.aos_machine, slot, constants);
}
void draw_vs_set_viewport( struct draw_context *draw,
const struct pipe_viewport_state *viewport )
{
- draw_vs_aos_machine_viewport( draw->vs.aos_machine, viewport );
}
@@ -103,22 +101,8 @@ draw_create_vertex_shader(struct draw_context *draw,
tgsi_dump(shader->tokens, 0);
}
- if (!draw->pt.middle.llvm) {
-#if 0
-/* these paths don't support vertex clamping
- * TODO: either add it, or remove them completely
- * use LLVM instead if you want performance
- * use exec instead if you want debugging/more correctness
- */
-#if defined(PIPE_ARCH_X86)
- vs = draw_create_vs_sse( draw, shader );
-#elif defined(PIPE_ARCH_PPC)
- vs = draw_create_vs_ppc( draw, shader );
-#endif
-#endif
- }
#if HAVE_LLVM
- else {
+ if (draw->pt.middle.llvm) {
vs = draw_create_vs_llvm(draw, shader);
}
#endif
@@ -199,12 +183,6 @@ draw_vs_init( struct draw_context *draw )
if (!draw->vs.fetch_cache)
return FALSE;
- draw->vs.aos_machine = draw_vs_aos_machine();
-#ifdef PIPE_ARCH_X86
- if (!draw->vs.aos_machine)
- return FALSE;
-#endif
-
return TRUE;
}
@@ -219,9 +197,6 @@ draw_vs_destroy( struct draw_context *draw )
if (draw->vs.emit_cache)
translate_cache_destroy(draw->vs.emit_cache);
- if (draw->vs.aos_machine)
- draw_vs_aos_machine_destroy(draw->vs.aos_machine);
-
for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
if (draw->vs.aligned_constant_storage[i]) {
align_free((void *)draw->vs.aligned_constant_storage[i]);
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index e6d187e9774..49229f8164b 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -159,10 +159,6 @@ draw_create_vs_exec(struct draw_context *draw,
const struct pipe_shader_state *templ);
struct draw_vertex_shader *
-draw_create_vs_sse(struct draw_context *draw,
- const struct pipe_shader_state *templ);
-
-struct draw_vertex_shader *
draw_create_vs_ppc(struct draw_context *draw,
const struct pipe_shader_state *templ);
@@ -170,10 +166,6 @@ draw_create_vs_ppc(struct draw_context *draw,
struct draw_vs_variant_key;
struct draw_vertex_shader;
-struct draw_vs_variant *
-draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs,
- const struct draw_vs_variant_key *key );
-
#if HAVE_LLVM
struct draw_vertex_shader *
draw_create_vs_llvm(struct draw_context *draw,
@@ -214,18 +206,6 @@ static INLINE int draw_vs_variant_key_compare( const struct draw_vs_variant_key
}
-struct aos_machine *draw_vs_aos_machine( void );
-void draw_vs_aos_machine_destroy( struct aos_machine *machine );
-
-void
-draw_vs_aos_machine_constants(struct aos_machine *machine,
- unsigned slot,
- const void *constants);
-
-void draw_vs_aos_machine_viewport( struct aos_machine *machine,
- const struct pipe_viewport_state *viewport );
-
-
#define MAX_TGSI_VERTICES 4
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
deleted file mode 100644
index 7b90dba0cd5..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ /dev/null
@@ -1,2267 +0,0 @@
-/*
- * Mesa 3-D graphics library
- * Version: 6.3
- *
- * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
- * using the rtasm runtime assembler. Based on the old
- * t_vb_arb_program_sse.c
- */
-
-
-#include "util/u_memory.h"
-#include "util/u_math.h"
-#include "pipe/p_shader_tokens.h"
-#include "util/u_debug.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
-#include "tgsi/tgsi_dump.h"
-
-#include "draw_vs.h"
-#include "draw_vs_aos.h"
-
-#include "rtasm/rtasm_x86sse.h"
-
-#ifdef PIPE_ARCH_X86
-#define DISASSEM 0
-#define FAST_MATH 1
-
-static const char *files[] =
-{
- "NULL",
- "CONST",
- "IN",
- "OUT",
- "TEMP",
- "SAMP",
- "ADDR",
- "IMM",
- "INTERNAL",
-};
-
-static INLINE boolean eq( struct x86_reg a,
- struct x86_reg b )
-{
- return (a.file == b.file &&
- a.idx == b.idx &&
- a.mod == b.mod &&
- a.disp == b.disp);
-}
-
-struct x86_reg aos_get_x86( struct aos_compilation *cp,
- unsigned which_reg, /* quick hack */
- unsigned value )
-{
- struct x86_reg reg;
-
- if (which_reg == 0)
- reg = cp->temp_EBP;
- else
- reg = cp->tmp_EAX;
-
- if (cp->x86_reg[which_reg] != value) {
- unsigned offset;
-
- switch (value) {
- case X86_IMMEDIATES:
- assert(which_reg == 0);
- offset = Offset(struct aos_machine, immediates);
- break;
- case X86_CONSTANTS:
- assert(which_reg == 1);
- offset = Offset(struct aos_machine, constants);
- break;
- case X86_BUFFERS:
- assert(which_reg == 0);
- offset = Offset(struct aos_machine, buffer);
- break;
- default:
- assert(0);
- offset = 0;
- }
-
-
- x86_mov(cp->func, reg,
- x86_make_disp(cp->machine_EDX, offset));
-
- cp->x86_reg[which_reg] = value;
- }
-
- return reg;
-}
-
-
-static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
- unsigned file,
- unsigned idx )
-{
- struct x86_reg ptr = cp->machine_EDX;
-
- switch (file) {
- case TGSI_FILE_INPUT:
- assert(idx < MAX_INPUTS);
- return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
-
- case TGSI_FILE_OUTPUT:
- return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
-
- case TGSI_FILE_TEMPORARY:
- assert(idx < MAX_TEMPS);
- return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
-
- case AOS_FILE_INTERNAL:
- assert(idx < MAX_INTERNALS);
- return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
-
- case TGSI_FILE_IMMEDIATE:
- assert(idx < MAX_IMMEDIATES); /* just a sanity check */
- return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));
-
- case TGSI_FILE_CONSTANT:
- assert(idx < MAX_CONSTANTS); /* just a sanity check */
- return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));
-
- default:
- AOS_ERROR(cp, "unknown reg file");
- return x86_make_reg(0,0);
- }
-}
-
-
-
-#define X87_CW_EXCEPTION_INV_OP (1<<0)
-#define X87_CW_EXCEPTION_DENORM_OP (1<<1)
-#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
-#define X87_CW_EXCEPTION_OVERFLOW (1<<3)
-#define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
-#define X87_CW_EXCEPTION_PRECISION (1<<5)
-#define X87_CW_PRECISION_SINGLE (0<<8)
-#define X87_CW_PRECISION_RESERVED (1<<8)
-#define X87_CW_PRECISION_DOUBLE (2<<8)
-#define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
-#define X87_CW_PRECISION_MASK (3<<8)
-#define X87_CW_ROUND_NEAREST (0<<10)
-#define X87_CW_ROUND_DOWN (1<<10)
-#define X87_CW_ROUND_UP (2<<10)
-#define X87_CW_ROUND_ZERO (3<<10)
-#define X87_CW_ROUND_MASK (3<<10)
-#define X87_CW_INFINITY (1<<12)
-
-
-
-
-static void spill( struct aos_compilation *cp, unsigned idx )
-{
- if (!cp->xmm[idx].dirty ||
- (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
- cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
- cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
- AOS_ERROR(cp, "invalid spill");
- return;
- }
- else {
- struct x86_reg oldval = get_reg_ptr(cp,
- cp->xmm[idx].file,
- cp->xmm[idx].idx);
-
- if (0) debug_printf("\nspill %s[%d]",
- files[cp->xmm[idx].file],
- cp->xmm[idx].idx);
-
- assert(cp->xmm[idx].dirty);
- sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
- cp->xmm[idx].dirty = 0;
- }
-}
-
-
-void aos_spill_all( struct aos_compilation *cp )
-{
- unsigned i;
-
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].dirty)
- spill(cp, i);
- aos_release_xmm_reg(cp, i);
- }
-}
-
-
-static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
- struct x86_reg reg )
-{
- if (reg.file != file_XMM ||
- cp->xmm[reg.idx].file != TGSI_FILE_NULL)
- {
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- sse_movaps(cp->func, tmp, reg);
- reg = tmp;
- }
-
- cp->xmm[reg.idx].last_used = cp->insn_counter;
- return reg;
-}
-
-static struct x86_reg get_xmm( struct aos_compilation *cp,
- struct x86_reg reg )
-{
- if (reg.file != file_XMM)
- {
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- sse_movaps(cp->func, tmp, reg);
- reg = tmp;
- }
-
- cp->xmm[reg.idx].last_used = cp->insn_counter;
- return reg;
-}
-
-
-/* Allocate an empty xmm register, either as a temporary or later to
- * "adopt" as a shader reg.
- */
-struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
-{
- unsigned i;
- unsigned oldest = 0;
- boolean found = FALSE;
-
- for (i = 0; i < 8; i++)
- if (cp->xmm[i].last_used != cp->insn_counter &&
- cp->xmm[i].file == TGSI_FILE_NULL) {
- oldest = i;
- found = TRUE;
- }
-
- if (!found) {
- for (i = 0; i < 8; i++)
- if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
- oldest = i;
- }
-
- /* Need to write out the old value?
- */
- if (cp->xmm[oldest].dirty)
- spill(cp, oldest);
-
- assert(cp->xmm[oldest].last_used != cp->insn_counter);
-
- cp->xmm[oldest].file = TGSI_FILE_NULL;
- cp->xmm[oldest].idx = 0;
- cp->xmm[oldest].dirty = 0;
- cp->xmm[oldest].last_used = cp->insn_counter;
- return x86_make_reg(file_XMM, oldest);
-}
-
-void aos_release_xmm_reg( struct aos_compilation *cp,
- unsigned idx )
-{
- cp->xmm[idx].file = TGSI_FILE_NULL;
- cp->xmm[idx].idx = 0;
- cp->xmm[idx].dirty = 0;
- cp->xmm[idx].last_used = 0;
-}
-
-
-static void aos_soft_release_xmm( struct aos_compilation *cp,
- struct x86_reg reg )
-{
- if (reg.file == file_XMM) {
- assert(cp->xmm[reg.idx].last_used == cp->insn_counter);
- cp->xmm[reg.idx].last_used = cp->insn_counter - 1;
- }
-}
-
-
-
-/* Mark an xmm reg as holding the current copy of a shader reg.
- */
-void aos_adopt_xmm_reg( struct aos_compilation *cp,
- struct x86_reg reg,
- unsigned file,
- unsigned idx,
- unsigned dirty )
-{
- unsigned i;
-
- if (reg.file != file_XMM) {
- assert(0);
- return;
- }
-
-
- /* If any xmm reg thinks it holds this shader reg, break the
- * illusion.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].file == file &&
- cp->xmm[i].idx == idx)
- {
- /* If an xmm reg is already holding this shader reg, take into account its
- * dirty flag...
- */
- dirty |= cp->xmm[i].dirty;
- aos_release_xmm_reg(cp, i);
- }
- }
-
- cp->xmm[reg.idx].file = file;
- cp->xmm[reg.idx].idx = idx;
- cp->xmm[reg.idx].dirty = dirty;
- cp->xmm[reg.idx].last_used = cp->insn_counter;
-}
-
-
-/* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
- */
-static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
- unsigned file,
- unsigned idx )
-{
- unsigned i;
-
- /* Ensure the in-memory copy of this reg is up-to-date
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].file == file &&
- cp->xmm[i].idx == idx &&
- cp->xmm[i].dirty) {
- spill(cp, i);
- }
- }
-
- return get_reg_ptr( cp, file, idx );
-}
-
-
-/* As above, but return a pointer. Note - this pointer may alias
- * those returned by get_arg_ptr().
- */
-static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
- const struct tgsi_full_dst_register *dst )
-{
- unsigned file = dst->Register.File;
- unsigned idx = dst->Register.Index;
- unsigned i;
-
-
- /* Ensure in-memory copy of this reg is up-to-date and invalidate
- * any xmm copies.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].file == file &&
- cp->xmm[i].idx == idx)
- {
- if (cp->xmm[i].dirty)
- spill(cp, i);
-
- aos_release_xmm_reg(cp, i);
- }
- }
-
- return get_reg_ptr( cp, file, idx );
-}
-
-
-
-
-
-/* Return an XMM reg if the argument is resident, otherwise return a
- * base+offset pointer to the saved value.
- */
-struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
- unsigned file,
- unsigned idx )
-{
- unsigned i;
-
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].file == file &&
- cp->xmm[i].idx == idx)
- {
- cp->xmm[i].last_used = cp->insn_counter;
- return x86_make_reg(file_XMM, i);
- }
- }
-
- /* If not found in the XMM register file, return an indirect
- * reference to the in-memory copy:
- */
- return get_reg_ptr( cp, file, idx );
-}
-
-
-
-static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
- unsigned file,
- unsigned idx )
-{
- struct x86_reg reg = get_xmm( cp,
- aos_get_shader_reg( cp, file, idx ) );
-
- aos_adopt_xmm_reg( cp,
- reg,
- file,
- idx,
- FALSE );
-
- return reg;
-}
-
-
-
-struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
- unsigned imm )
-{
- return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
-}
-
-
-struct x86_reg aos_get_internal( struct aos_compilation *cp,
- unsigned imm )
-{
- return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
-}
-
-
-
-
-
-/* Emulate pshufd insn in regular SSE, if necessary:
- */
-static void emit_pshufd( struct aos_compilation *cp,
- struct x86_reg dst,
- struct x86_reg arg0,
- ubyte shuf )
-{
- if (cp->have_sse2) {
- sse2_pshufd(cp->func, dst, arg0, shuf);
- }
- else {
- if (!eq(dst, arg0))
- sse_movaps(cp->func, dst, arg0);
-
- sse_shufps(cp->func, dst, dst, shuf);
- }
-}
-
-/* load masks (pack into negs??)
- * pshufd - shuffle according to writemask
- * and - result, mask
- * nand - dest, mask
- * or - dest, result
- */
-static boolean mask_write( struct aos_compilation *cp,
- struct x86_reg dst,
- struct x86_reg result,
- unsigned mask )
-{
- struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- emit_pshufd(cp, tmp, imm_swz,
- SHUF((mask & 1) ? 2 : 3,
- (mask & 2) ? 2 : 3,
- (mask & 4) ? 2 : 3,
- (mask & 8) ? 2 : 3));
-
- sse_andps(cp->func, dst, tmp);
- sse_andnps(cp->func, tmp, result);
- sse_orps(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- return TRUE;
-}
-
-
-
-
-/* Helper for writemask:
- */
-static boolean emit_shuf_copy2( struct aos_compilation *cp,
- struct x86_reg dst,
- struct x86_reg arg0,
- struct x86_reg arg1,
- ubyte shuf )
-{
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- emit_pshufd(cp, dst, arg1, shuf);
- emit_pshufd(cp, tmp, arg0, shuf);
- sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
- emit_pshufd(cp, dst, dst, shuf);
-
- aos_release_xmm_reg(cp, tmp.idx);
- return TRUE;
-}
-
-
-
-#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
-
-
-/* Locate a source register and perform any required (simple) swizzle.
- *
- * Just fail on complex swizzles at this point.
- */
-static struct x86_reg fetch_src( struct aos_compilation *cp,
- const struct tgsi_full_src_register *src )
-{
- struct x86_reg arg0 = aos_get_shader_reg(cp,
- src->Register.File,
- src->Register.Index);
- unsigned i;
- ubyte swz = 0;
- unsigned negs = 0;
- unsigned abs = 0;
-
- for (i = 0; i < 4; i++) {
- unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i );
- unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
-
- swz |= (swizzle & 0x3) << (i * 2);
-
- switch (neg) {
- case TGSI_UTIL_SIGN_TOGGLE:
- negs |= (1<<i);
- break;
-
- case TGSI_UTIL_SIGN_KEEP:
- break;
-
- case TGSI_UTIL_SIGN_CLEAR:
- abs |= (1<<i);
- break;
-
- default:
- AOS_ERROR(cp, "unsupported sign-mode");
- break;
- }
- }
-
- if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
- struct x86_reg dst = aos_get_xmm_reg(cp);
-
- if (swz != SSE_SWIZZLE_NOOP)
- emit_pshufd(cp, dst, arg0, swz);
- else
- sse_movaps(cp->func, dst, arg0);
-
- if (negs && negs != 0xf) {
- struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- /* Load 1,-1,0,0
- * Use neg as arg to pshufd
- * Multiply
- */
- emit_pshufd(cp, tmp, imm_swz,
- SHUF((negs & 1) ? 1 : 0,
- (negs & 2) ? 1 : 0,
- (negs & 4) ? 1 : 0,
- (negs & 8) ? 1 : 0));
- sse_mulps(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- aos_soft_release_xmm(cp, imm_swz);
- }
- else if (negs) {
- struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
- sse_mulps(cp->func, dst, imm_negs);
- aos_soft_release_xmm(cp, imm_negs);
- }
-
-
- if (abs && abs != 0xf) {
- AOS_ERROR(cp, "unsupported partial abs");
- }
- else if (abs) {
- struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- sse_movaps(cp->func, tmp, dst);
- sse_mulps(cp->func, tmp, neg);
- sse_maxps(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- aos_soft_release_xmm(cp, neg);
- }
-
- aos_soft_release_xmm(cp, arg0);
- return dst;
- }
-
- return arg0;
-}
-
-static void x87_fld_src( struct aos_compilation *cp,
- const struct tgsi_full_src_register *src,
- unsigned channel )
-{
- struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
- src->Register.File,
- src->Register.Index);
-
- unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel );
- unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
-
- x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
-
- switch (neg) {
- case TGSI_UTIL_SIGN_TOGGLE:
- /* Flip the sign:
- */
- x87_fchs( cp->func );
- break;
-
- case TGSI_UTIL_SIGN_KEEP:
- break;
-
- case TGSI_UTIL_SIGN_CLEAR:
- x87_fabs( cp->func );
- break;
-
- case TGSI_UTIL_SIGN_SET:
- x87_fabs( cp->func );
- x87_fchs( cp->func );
- break;
-
- default:
- AOS_ERROR(cp, "unsupported sign-mode");
- break;
- }
-}
-
-
-
-
-
-
-/* Used to implement write masking. This and most of the other instructions
- * here would be easier to implement if there had been a translation
- * to a 2 argument format (dst/arg0, arg1) at the shader level before
- * attempting to translate to x86/sse code.
- */
-static void store_dest( struct aos_compilation *cp,
- const struct tgsi_full_dst_register *reg,
- struct x86_reg result )
-{
- struct x86_reg dst;
-
- switch (reg->Register.WriteMask) {
- case 0:
- return;
-
- case TGSI_WRITEMASK_XYZW:
- aos_adopt_xmm_reg(cp,
- get_xmm_writable(cp, result),
- reg->Register.File,
- reg->Register.Index,
- TRUE);
- return;
- default:
- break;
- }
-
- dst = aos_get_shader_reg_xmm(cp,
- reg->Register.File,
- reg->Register.Index);
-
- switch (reg->Register.WriteMask) {
- case TGSI_WRITEMASK_X:
- sse_movss(cp->func, dst, get_xmm(cp, result));
- break;
-
- case TGSI_WRITEMASK_ZW:
- sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
- break;
-
- case TGSI_WRITEMASK_XY:
- result = get_xmm_writable(cp, result);
- sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
- dst = result;
- break;
-
- case TGSI_WRITEMASK_YZW:
- result = get_xmm_writable(cp, result);
- sse_movss(cp->func, result, dst);
- dst = result;
- break;
-
- default:
- mask_write(cp, dst, result, reg->Register.WriteMask);
- break;
- }
-
- aos_adopt_xmm_reg(cp,
- dst,
- reg->Register.File,
- reg->Register.Index,
- TRUE);
-
-}
-
-static void inject_scalar( struct aos_compilation *cp,
- struct x86_reg dst,
- struct x86_reg result,
- ubyte swizzle )
-{
- sse_shufps(cp->func, dst, dst, swizzle);
- sse_movss(cp->func, dst, result);
- sse_shufps(cp->func, dst, dst, swizzle);
-}
-
-
-static void store_scalar_dest( struct aos_compilation *cp,
- const struct tgsi_full_dst_register *reg,
- struct x86_reg result )
-{
- unsigned writemask = reg->Register.WriteMask;
- struct x86_reg dst;
-
- if (writemask != TGSI_WRITEMASK_X &&
- writemask != TGSI_WRITEMASK_Y &&
- writemask != TGSI_WRITEMASK_Z &&
- writemask != TGSI_WRITEMASK_W &&
- writemask != 0)
- {
- result = get_xmm_writable(cp, result); /* already true, right? */
- sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
- store_dest(cp, reg, result);
- return;
- }
-
- result = get_xmm(cp, result);
- dst = aos_get_shader_reg_xmm(cp,
- reg->Register.File,
- reg->Register.Index);
-
-
-
- switch (reg->Register.WriteMask) {
- case TGSI_WRITEMASK_X:
- sse_movss(cp->func, dst, result);
- break;
-
- case TGSI_WRITEMASK_Y:
- inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
- break;
-
- case TGSI_WRITEMASK_Z:
- inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
- break;
-
- case TGSI_WRITEMASK_W:
- inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
- break;
-
- default:
- break;
- }
-
- aos_adopt_xmm_reg(cp,
- dst,
- reg->Register.File,
- reg->Register.Index,
- TRUE);
-}
-
-
-
-static void x87_fst_or_nop( struct x86_function *func,
- unsigned writemask,
- unsigned channel,
- struct x86_reg ptr )
-{
- assert(ptr.file == file_REG32);
- if (writemask & (1<<channel))
- x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
-}
-
-static void x87_fstp_or_pop( struct x86_function *func,
- unsigned writemask,
- unsigned channel,
- struct x86_reg ptr )
-{
- assert(ptr.file == file_REG32);
- if (writemask & (1<<channel))
- x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
- else
- x87_fstp( func, x86_make_reg( file_x87, 0 ));
-}
-
-
-
-/*
- */
-static void x87_fstp_dest4( struct aos_compilation *cp,
- const struct tgsi_full_dst_register *dst )
-{
- struct x86_reg ptr = get_dst_ptr(cp, dst);
- unsigned writemask = dst->Register.WriteMask;
-
- x87_fst_or_nop(cp->func, writemask, 0, ptr);
- x87_fst_or_nop(cp->func, writemask, 1, ptr);
- x87_fst_or_nop(cp->func, writemask, 2, ptr);
- x87_fstp_or_pop(cp->func, writemask, 3, ptr);
-}
-
-/* Save current x87 state and put it into single precision mode.
- */
-static void save_fpu_state( struct aos_compilation *cp )
-{
- x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, fpu_restore)));
-}
-
-static void restore_fpu_state( struct aos_compilation *cp )
-{
- x87_fnclex(cp->func);
- x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, fpu_restore)));
-}
-
-static void set_fpu_round_neg_inf( struct aos_compilation *cp )
-{
- if (cp->fpucntl != FPU_RND_NEG) {
- cp->fpucntl = FPU_RND_NEG;
- x87_fnclex(cp->func);
- x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, fpu_rnd_neg_inf)));
- }
-}
-
-static void set_fpu_round_nearest( struct aos_compilation *cp )
-{
- if (cp->fpucntl != FPU_RND_NEAREST) {
- cp->fpucntl = FPU_RND_NEAREST;
- x87_fnclex(cp->func);
- x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, fpu_rnd_nearest)));
- }
-}
-
-#if 0
-static void x87_emit_ex2( struct aos_compilation *cp )
-{
- struct x86_reg st0 = x86_make_reg(file_x87, 0);
- struct x86_reg st1 = x86_make_reg(file_x87, 1);
- int stack = cp->func->x87_stack;
-
- /* set_fpu_round_neg_inf( cp ); */
-
- x87_fld(cp->func, st0); /* a a */
- x87_fprndint( cp->func ); /* int(a) a*/
- x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */
- x87_fxch(cp->func, st1); /* frc(a) int(a) */
- x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */
- x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */
- x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */
- x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */
- /* 2^a int(a) */
- x87_fstp(cp->func, st1); /* 2^a */
-
- assert( stack == cp->func->x87_stack);
-
-}
-#endif
-
-#if 0
-static void PIPE_CDECL print_reg( const char *msg,
- const float *reg )
-{
- debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
-}
-#endif
-
-#if 0
-static void emit_print( struct aos_compilation *cp,
- const char *message, /* must point to a static string! */
- unsigned file,
- unsigned idx )
-{
- struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
- struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
- unsigned i;
-
- /* There shouldn't be anything on the x87 stack. Can add this
- * capacity later if need be.
- */
- assert(cp->func->x87_stack == 0);
-
- /* For absolute correctness, need to spill/invalidate all XMM regs
- * too. We're obviously not concerned about performance on this
- * debug path, so here goes:
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].dirty)
- spill(cp, i);
-
- aos_release_xmm_reg(cp, i);
- }
-
- /* Push caller-save (ie scratch) regs.
- */
- x86_cdecl_caller_push_regs( cp->func );
-
-
- /* Push the arguments:
- */
- x86_lea( cp->func, ecx, arg );
- x86_push( cp->func, ecx );
- x86_push_imm32( cp->func, (int)message );
-
- /* Call the helper. Could call debug_printf directly, but
- * print_reg is a nice place to put a breakpoint if need be.
- */
- x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
- x86_call( cp->func, ecx );
- x86_pop( cp->func, ecx );
- x86_pop( cp->func, ecx );
-
- /* Pop caller-save regs
- */
- x86_cdecl_caller_pop_regs( cp->func );
-
- /* Done...
- */
-}
-#endif
-
-/**
- * The traditional instructions. All operate on internal registers
- * and ignore write masks and swizzling issues.
- */
-
-static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- sse_movaps(cp->func, tmp, arg0);
- sse_mulps(cp->func, tmp, neg);
- sse_maxps(cp->func, tmp, arg0);
-
- store_dest(cp, &op->Dst[0], tmp);
- return TRUE;
-}
-
-static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_addps(cp->func, dst, arg1);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- x87_fld_src(cp, &op->Src[0], 0);
- x87_fcos(cp->func);
- x87_fstp_dest4(cp, &op->Dst[0]);
- return TRUE;
-}
-
-/* The dotproduct instructions don't really do that well in sse:
- * XXX: produces wrong results -- disabled.
- */
-static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_mulps(cp->func, dst, arg1);
- /* Now the hard bit: sum the first 3 values:
- */
- sse_movhlps(cp->func, tmp, dst);
- sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
- emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
- sse_addss(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- store_scalar_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_mulps(cp->func, dst, arg1);
-
- /* Now the hard bit: sum the values:
- */
- sse_movhlps(cp->func, tmp, dst);
- sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
- emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
- sse_addss(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- store_scalar_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_mulps(cp->func, dst, arg1);
-
- /* Now the hard bit: sum the values (from DP3):
- */
- sse_movhlps(cp->func, tmp, dst);
- sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
- emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
- sse_addss(cp->func, dst, tmp);
- emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
- sse_addss(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- store_scalar_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = aos_get_xmm_reg(cp);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
-
-/* dst[0] = 1.0 * 1.0F; */
-/* dst[1] = arg0[1] * arg1[1]; */
-/* dst[2] = arg0[2] * 1.0; */
-/* dst[3] = 1.0 * arg1[3]; */
-
- emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
- emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
- sse_mulps(cp->func, dst, tmp);
-
- aos_release_xmm_reg(cp, tmp.idx);
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- x87_fld1(cp->func); /* 1 */
- x87_fld_src(cp, &op->Src[0], 0); /* a0 1 */
- x87_fyl2x(cp->func); /* log2(a0) */
- x87_fstp_dest4(cp, &op->Dst[0]);
- return TRUE;
-}
-
-#if 0
-static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- x87_fld_src(cp, &op->Src[0], 0);
- x87_emit_ex2(cp);
- x87_fstp_dest4(cp, &op->Dst[0]);
- return TRUE;
-}
-#endif
-
-
-static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
- unsigned writemask = op->Dst[0].Register.WriteMask;
- int i;
-
- set_fpu_round_neg_inf( cp );
-
- /* Load all sources first to avoid aliasing
- */
- for (i = 3; i >= 0; i--) {
- if (writemask & (1<<i)) {
- x87_fld_src(cp, &op->Src[0], i);
- }
- }
-
- for (i = 0; i < 4; i++) {
- if (writemask & (1<<i)) {
- x87_fprndint( cp->func );
- x87_fstp(cp->func, x86_make_disp(dst, i*4));
- }
- }
-
- return TRUE;
-}
-
-
-static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
- unsigned writemask = op->Dst[0].Register.WriteMask;
- int i;
-
- set_fpu_round_nearest( cp );
-
- /* Load all sources first to avoid aliasing
- */
- for (i = 3; i >= 0; i--) {
- if (writemask & (1<<i)) {
- x87_fld_src(cp, &op->Src[0], i);
- }
- }
-
- for (i = 0; i < 4; i++) {
- if (writemask & (1<<i)) {
- x87_fprndint( cp->func );
- x87_fstp(cp->func, x86_make_disp(dst, i*4));
- }
- }
-
- return TRUE;
-}
-
-
-static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
- struct x86_reg st0 = x86_make_reg(file_x87, 0);
- struct x86_reg st1 = x86_make_reg(file_x87, 1);
- unsigned writemask = op->Dst[0].Register.WriteMask;
- int i;
-
- set_fpu_round_neg_inf( cp );
-
- /* suck all the source values onto the stack before writing out any
- * dst, which may alias...
- */
- for (i = 3; i >= 0; i--) {
- if (writemask & (1<<i)) {
- x87_fld_src(cp, &op->Src[0], i);
- }
- }
-
- for (i = 0; i < 4; i++) {
- if (writemask & (1<<i)) {
- x87_fld(cp->func, st0); /* a a */
- x87_fprndint( cp->func ); /* flr(a) a */
- x87_fsubp(cp->func, st1); /* frc(a) */
- x87_fstp(cp->func, x86_make_disp(dst, i*4));
- }
- }
-
- return TRUE;
-}
-
-
-
-
-
-
-static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
- unsigned writemask = op->Dst[0].Register.WriteMask;
- unsigned lit_count = cp->lit_count++;
- struct x86_reg result, arg0;
- unsigned i;
-
-#if 1
- /* For absolute correctness, need to spill/invalidate all XMM regs
- * too.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].dirty)
- spill(cp, i);
- aos_release_xmm_reg(cp, i);
- }
-#endif
-
- if (writemask != TGSI_WRITEMASK_XYZW)
- result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
- else
- result = get_dst_ptr(cp, &op->Dst[0]);
-
-
- arg0 = fetch_src( cp, &op->Src[0] );
- if (arg0.file == file_XMM) {
- struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, tmp[1]));
- sse_movaps( cp->func, tmp, arg0 );
- arg0 = tmp;
- }
-
-
-
- /* Push caller-save (ie scratch) regs.
- */
- x86_cdecl_caller_push_regs( cp->func );
-
- /* Push the arguments:
- */
- x86_push_imm32( cp->func, lit_count );
-
- x86_lea( cp->func, ecx, arg0 );
- x86_push( cp->func, ecx );
-
- x86_lea( cp->func, ecx, result );
- x86_push( cp->func, ecx );
-
- x86_push( cp->func, cp->machine_EDX );
-
- if (lit_count < MAX_LIT_INFO) {
- x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
- Offset(struct aos_machine, lit_info) +
- lit_count * sizeof(struct lit_info) +
- Offset(struct lit_info, func)));
- }
- else {
- x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
- }
-
- x86_call( cp->func, ecx );
-
- x86_pop( cp->func, ecx ); /* fixme... */
- x86_pop( cp->func, ecx );
- x86_pop( cp->func, ecx );
- x86_pop( cp->func, ecx );
-
- x86_cdecl_caller_pop_regs( cp->func );
-
- if (writemask != TGSI_WRITEMASK_XYZW) {
- store_dest( cp,
- &op->Dst[0],
- get_xmm_writable( cp, result ) );
- }
-
- return TRUE;
-}
-
-#if 0
-static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
- unsigned writemask = op->Dst[0].Register.WriteMask;
-
- if (writemask & TGSI_WRITEMASK_YZ) {
- struct x86_reg st1 = x86_make_reg(file_x87, 1);
- struct x86_reg st2 = x86_make_reg(file_x87, 2);
-
- /* a1' = a1 <= 0 ? 1 : a1;
- */
- x87_fldz(cp->func); /* 1 0 */
-#if 1
- x87_fld1(cp->func); /* 1 0 */
-#else
- /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
- */
- x87_fldz(cp->func); /* 1 0 */
-#endif
- x87_fld_src(cp, &op->Src[0], 1); /* a1 1 0 */
- x87_fcomi(cp->func, st2); /* a1 1 0 */
- x87_fcmovb(cp->func, st1); /* a1' 1 0 */
- x87_fstp(cp->func, st1); /* a1' 0 */
- x87_fstp(cp->func, st1); /* a1' */
-
- x87_fld_src(cp, &op->Src[0], 3); /* a3 a1' */
- x87_fxch(cp->func, st1); /* a1' a3 */
-
-
- /* Compute pow(a1, a3)
- */
- x87_fyl2x(cp->func); /* a3*log2(a1) */
- x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */
-
-
- /* a0' = max2(a0, 0):
- */
- x87_fldz(cp->func); /* 0 r2 */
- x87_fld_src(cp, &op->Src[0], 0); /* a0 0 r2 */
- x87_fcomi(cp->func, st1);
- x87_fcmovb(cp->func, st1); /* a0' 0 r2 */
-
- x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */
-
- x87_fcomi(cp->func, st1); /* a0' 0 r2 */
- x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */
-
- x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
- x87_fpop(cp->func); /* r2 */
- x87_fpop(cp->func);
- }
-
- if (writemask & TGSI_WRITEMASK_XW) {
- x87_fld1(cp->func);
- x87_fst_or_nop(cp->func, writemask, 0, dst);
- x87_fstp_or_pop(cp->func, writemask, 3, dst);
- }
-
- return TRUE;
-}
-#endif
-
-
-
-static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_maxps(cp->func, dst, arg1);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-
-static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_minps(cp->func, dst, arg1);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- /* potentially nothing to do */
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_mulps(cp->func, dst, arg1);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-
-static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg arg2 = fetch_src(cp, &op->Src[2]);
-
- /* If we can't clobber old contents of arg0, get a temporary & copy
- * it there, then clobber it...
- */
- arg0 = get_xmm_writable(cp, arg0);
-
- sse_mulps(cp->func, arg0, arg1);
- sse_addps(cp->func, arg0, arg2);
- store_dest(cp, &op->Dst[0], arg0);
- return TRUE;
-}
-
-
-
-/* A wrapper for powf().
- * Makes sure it is cdecl and operates on floats.
- */
-static float PIPE_CDECL _powerf( float x, float y )
-{
-#if FAST_MATH
- return util_fast_pow(x, y);
-#else
- return powf( x, y );
-#endif
-}
-
-#if FAST_MATH
-static float PIPE_CDECL _exp2(float x)
-{
- return util_fast_exp2(x);
-}
-#endif
-
-
-/* Really not sufficient -- need to check for conditions that could
- * generate inf/nan values, which will slow things down hugely.
- */
-static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-#if 0
- x87_fld_src(cp, &op->Src[1], 0); /* a1.x */
- x87_fld_src(cp, &op->Src[0], 0); /* a0.x a1.x */
- x87_fyl2x(cp->func); /* a1*log2(a0) */
-
- x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */
-
- x87_fstp_dest4(cp, &op->Dst[0]);
-#else
- uint i;
-
- /* For absolute correctness, need to spill/invalidate all XMM regs
- * too.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].dirty)
- spill(cp, i);
- aos_release_xmm_reg(cp, i);
- }
-
- /* Push caller-save (ie scratch) regs.
- */
- x86_cdecl_caller_push_regs( cp->func );
-
- x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );
-
- x87_fld_src( cp, &op->Src[1], 0 );
- x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
- x87_fld_src( cp, &op->Src[0], 0 );
- x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
-
- /* tmp_EAX has been pushed & will be restored below */
- x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
- x86_call( cp->func, cp->tmp_EAX );
-
- x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );
-
- x86_cdecl_caller_pop_regs( cp->func );
-
- /* Note retval on x87 stack:
- */
- cp->func->x87_stack++;
-
- x87_fstp_dest4( cp, &op->Dst[0] );
-#endif
- return TRUE;
-}
-
-
-#if FAST_MATH
-static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- uint i;
-
- /* For absolute correctness, need to spill/invalidate all XMM regs
- * too.
- */
- for (i = 0; i < 8; i++) {
- if (cp->xmm[i].dirty)
- spill(cp, i);
- aos_release_xmm_reg(cp, i);
- }
-
- /* Push caller-save (ie scratch) regs.
- */
- x86_cdecl_caller_push_regs( cp->func );
-
- x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );
-
- x87_fld_src( cp, &op->Src[0], 0 );
- x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
-
- /* tmp_EAX has been pushed & will be restored below */
- x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
- x86_call( cp->func, cp->tmp_EAX );
-
- x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );
-
- x86_cdecl_caller_pop_regs( cp->func );
-
- /* Note retval on x87 stack:
- */
- cp->func->x87_stack++;
-
- x87_fstp_dest4( cp, &op->Dst[0] );
-
- return TRUE;
-}
-#endif
-
-
-static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg dst = aos_get_xmm_reg(cp);
-
- if (cp->have_sse2) {
- sse2_rcpss(cp->func, dst, arg0);
- /* extend precision here...
- */
- }
- else {
- struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
- sse_movss(cp->func, dst, ones);
- sse_divss(cp->func, dst, arg0);
- }
-
- store_scalar_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-
-/* Although rsqrtps() and rcpps() are low precision on some/all SSE
- * implementations, it is possible to improve its precision at
- * fairly low cost, using a newton/raphson step, as below:
- *
- * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
- * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
- * or:
- * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
- *
- *
- * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
- */
-static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- if (0) {
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg r = aos_get_xmm_reg(cp);
- sse_rsqrtss(cp->func, r, arg0);
- store_scalar_dest(cp, &op->Dst[0], r);
- return TRUE;
- }
- else {
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg r = aos_get_xmm_reg(cp);
-
- struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
- struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
- struct x86_reg src = get_xmm_writable( cp, arg0 );
- struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
- struct x86_reg tmp = aos_get_xmm_reg(cp);
-
- sse_movaps(cp->func, tmp, src);
- sse_mulps(cp->func, tmp, neg);
- sse_maxps(cp->func, tmp, src);
-
- sse_rsqrtss( cp->func, r, tmp ); /* rsqrtss(a) */
- sse_mulss( cp->func, tmp, neg_half ); /* -.5 * a */
- sse_mulss( cp->func, tmp, r ); /* -.5 * a * r */
- sse_mulss( cp->func, tmp, r ); /* -.5 * a * r * r */
- sse_addss( cp->func, tmp, one_point_five ); /* 1.5 - .5 * a * r * r */
- sse_mulss( cp->func, r, tmp ); /* r * (1.5 - .5 * a * r * r) */
-
- store_scalar_dest(cp, &op->Dst[0], r);
-
- aos_release_xmm_reg(cp, tmp.idx);
-
- return TRUE;
- }
-}
-
-
-static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
- sse_andps(cp->func, dst, ones);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- x87_fld_src(cp, &op->Src[0], 0);
- x87_fsin(cp->func);
- x87_fstp_dest4(cp, &op->Dst[0]);
- return TRUE;
-}
-
-
-
-static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_cmpps(cp->func, dst, arg1, cc_LessThan);
- sse_andps(cp->func, dst, ones);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg dst = get_xmm_writable(cp, arg0);
-
- sse_subps(cp->func, dst, arg1);
-
- store_dest(cp, &op->Dst[0], dst);
- return TRUE;
-}
-
-static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg tmp0 = aos_get_xmm_reg(cp);
-
- sse2_cvttps2dq(cp->func, tmp0, arg0);
- sse2_cvtdq2ps(cp->func, tmp0, tmp0);
-
- store_dest(cp, &op->Dst[0], tmp0);
- return TRUE;
-}
-
-static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
- struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
- struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
- struct x86_reg tmp0 = aos_get_xmm_reg(cp);
- struct x86_reg tmp1 = aos_get_xmm_reg(cp);
-
- emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
- sse_mulps(cp->func, tmp1, arg0);
- emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
- sse_mulps(cp->func, tmp0, arg1);
- sse_subps(cp->func, tmp1, tmp0);
- sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));
-
-/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
-/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
-/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
-/* dst[3] is undef */
-
-
- aos_release_xmm_reg(cp, tmp0.idx);
- store_dest(cp, &op->Dst[0], tmp1);
- return TRUE;
-}
-
-
-
-static boolean
-emit_instruction( struct aos_compilation *cp,
- struct tgsi_full_instruction *inst )
-{
- x87_assert_stack_empty(cp->func);
-
- switch( inst->Instruction.Opcode ) {
- case TGSI_OPCODE_MOV:
- return emit_MOV( cp, inst );
-
- case TGSI_OPCODE_LIT:
- return emit_LIT(cp, inst);
-
- case TGSI_OPCODE_RCP:
- return emit_RCP(cp, inst);
-
- case TGSI_OPCODE_RSQ:
- return emit_RSQ(cp, inst);
-
- case TGSI_OPCODE_EXP:
- /*return emit_EXP(cp, inst);*/
- return FALSE;
-
- case TGSI_OPCODE_LOG:
- /*return emit_LOG(cp, inst);*/
- return FALSE;
-
- case TGSI_OPCODE_MUL:
- return emit_MUL(cp, inst);
-
- case TGSI_OPCODE_ADD:
- return emit_ADD(cp, inst);
-
- case TGSI_OPCODE_DP3:
- return emit_DP3(cp, inst);
-
- case TGSI_OPCODE_DP4:
- return emit_DP4(cp, inst);
-
- case TGSI_OPCODE_DST:
- return emit_DST(cp, inst);
-
- case TGSI_OPCODE_MIN:
- return emit_MIN(cp, inst);
-
- case TGSI_OPCODE_MAX:
- return emit_MAX(cp, inst);
-
- case TGSI_OPCODE_SLT:
- return emit_SLT(cp, inst);
-
- case TGSI_OPCODE_SGE:
- return emit_SGE(cp, inst);
-
- case TGSI_OPCODE_MAD:
- return emit_MAD(cp, inst);
-
- case TGSI_OPCODE_SUB:
- return emit_SUB(cp, inst);
-
- case TGSI_OPCODE_LRP:
- /*return emit_LERP(cp, inst);*/
- return FALSE;
-
- case TGSI_OPCODE_FRC:
- return emit_FRC(cp, inst);
-
- case TGSI_OPCODE_CLAMP:
- /*return emit_CLAMP(cp, inst);*/
- return FALSE;
-
- case TGSI_OPCODE_FLR:
- return emit_FLR(cp, inst);
-
- case TGSI_OPCODE_ROUND:
- return emit_RND(cp, inst);
-
- case TGSI_OPCODE_EX2:
-#if FAST_MATH
- return emit_EXPBASE2(cp, inst);
-#elif 0
- /* this seems to fail for "larger" exponents.
- * See glean tvertProg1's EX2 test.
- */
- return emit_EX2(cp, inst);
-#else
- return FALSE;
-#endif
-
- case TGSI_OPCODE_LG2:
- return emit_LG2(cp, inst);
-
- case TGSI_OPCODE_POW:
- return emit_POW(cp, inst);
-
- case TGSI_OPCODE_XPD:
- return emit_XPD(cp, inst);
-
- case TGSI_OPCODE_ABS:
- return emit_ABS(cp, inst);
-
- case TGSI_OPCODE_DPH:
- return emit_DPH(cp, inst);
-
- case TGSI_OPCODE_COS:
- return emit_COS(cp, inst);
-
- case TGSI_OPCODE_SIN:
- return emit_SIN(cp, inst);
-
- case TGSI_OPCODE_TRUNC:
- return emit_TRUNC(cp, inst);
-
- case TGSI_OPCODE_END:
- return TRUE;
-
- default:
- return FALSE;
- }
-}
-
-
-static boolean emit_viewport( struct aos_compilation *cp )
-{
- struct x86_reg pos = aos_get_shader_reg_xmm(cp,
- TGSI_FILE_OUTPUT,
- cp->vaos->draw->vs.position_output );
-
- struct x86_reg scale = x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, scale));
-
- struct x86_reg translate = x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, translate));
-
- sse_mulps(cp->func, pos, scale);
- sse_addps(cp->func, pos, translate);
-
- aos_adopt_xmm_reg( cp,
- pos,
- TGSI_FILE_OUTPUT,
- cp->vaos->draw->vs.position_output,
- TRUE );
- return TRUE;
-}
-
-
-/* This is useful to be able to see the results on softpipe. Doesn't
- * do proper clipping, just assumes the backend can do it during
- * rasterization -- for debug only...
- */
-static boolean emit_rhw_viewport( struct aos_compilation *cp )
-{
- struct x86_reg tmp = aos_get_xmm_reg(cp);
- struct x86_reg pos = aos_get_shader_reg_xmm(cp,
- TGSI_FILE_OUTPUT,
- cp->vaos->draw->vs.position_output);
-
- struct x86_reg scale = x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, scale));
-
- struct x86_reg translate = x86_make_disp(cp->machine_EDX,
- Offset(struct aos_machine, translate));
-
-
-
- emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
- sse2_rcpss(cp->func, tmp, tmp);
- sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));
-
- sse_mulps(cp->func, pos, scale);
- sse_mulps(cp->func, pos, tmp);
- sse_addps(cp->func, pos, translate);
-
- /* Set pos[3] = w
- */
- mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);
-
- aos_adopt_xmm_reg( cp,
- pos,
- TGSI_FILE_OUTPUT,
- cp->vaos->draw->vs.position_output,
- TRUE );
- return TRUE;
-}
-
-
-#if 0
-static boolean note_immediate( struct aos_compilation *cp,
- struct tgsi_full_immediate *imm )
-{
- unsigned pos = cp->num_immediates++;
- unsigned j;
-
- assert( imm->Immediate.NrTokens <= 4 + 1 );
- for (j = 0; j < imm->Immediate.NrTokens - 1; j++) {
- cp->vaos->machine->immediate[pos][j] = imm->u[j].Float;
- }
-
- return TRUE;
-}
-#endif
-
-
-
-
-static void find_last_write_outputs( struct aos_compilation *cp )
-{
- struct tgsi_parse_context parse;
- unsigned this_instruction = 0;
- unsigned i;
-
- tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
-
- while (!tgsi_parse_end_of_tokens( &parse )) {
-
- tgsi_parse_token( &parse );
-
- if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
- continue;
-
- for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
- if (parse.FullToken.FullInstruction.Dst[i].Register.File ==
- TGSI_FILE_OUTPUT)
- {
- unsigned idx = parse.FullToken.FullInstruction.Dst[i].Register.Index;
- cp->output_last_write[idx] = this_instruction;
- }
- }
-
- this_instruction++;
- }
-
- tgsi_parse_free( &parse );
-}
-
-
-#define ARG_MACHINE 1
-#define ARG_START_ELTS 2
-#define ARG_COUNT 3
-#define ARG_OUTBUF 4
-
-
-static boolean build_vertex_program( struct draw_vs_variant_aos_sse *variant,
- boolean linear )
-{
- struct tgsi_parse_context parse;
- struct aos_compilation cp;
- unsigned fixup, label;
-
- util_init_math();
-
- tgsi_parse_init( &parse, variant->base.vs->state.tokens );
-
- memset(&cp, 0, sizeof(cp));
-
- cp.insn_counter = 1;
- cp.vaos = variant;
- cp.have_sse2 = 1;
- cp.func = &variant->func[ linear ? 0 : 1 ];
-
- cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
- cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
- cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
- cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
- cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
- cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
- cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );
-
- x86_init_func(cp.func);
-
- find_last_write_outputs(&cp);
-
- x86_push(cp.func, cp.idx_EBX);
- x86_push(cp.func, cp.count_ESI);
- x86_push(cp.func, cp.temp_EBP);
-
-
- /* Load arguments into regs:
- */
- x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
- x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
- x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
- x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));
-
-
- /* Compare count to zero and possibly bail.
- */
- x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
- x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
- fixup = x86_jcc_forward(cp.func, cc_E);
-
-
- save_fpu_state( &cp );
- set_fpu_round_nearest( &cp );
-
- aos_init_inputs( &cp, linear );
-
- cp.x86_reg[0] = 0;
- cp.x86_reg[1] = 0;
-
- /* Note address for loop jump
- */
- label = x86_get_label(cp.func);
- {
- /* Fetch inputs... TODO: fetch lazily...
- */
- if (!aos_fetch_inputs( &cp, linear ))
- goto fail;
-
- /* Emit the shader:
- */
- while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
- {
- tgsi_parse_token( &parse );
-
- switch (parse.FullToken.Token.Type) {
- case TGSI_TOKEN_TYPE_IMMEDIATE:
-#if 0
- if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
- goto fail;
-#endif
- break;
-
- case TGSI_TOKEN_TYPE_INSTRUCTION:
- if (DISASSEM)
- tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );
-
- if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
- goto fail;
- break;
- }
-
- x87_assert_stack_empty(cp.func);
- cp.insn_counter++;
-
- if (DISASSEM)
- debug_printf("\n");
- }
-
-
- {
- unsigned i;
- for (i = 0; i < 8; i++) {
- if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
- cp.xmm[i].file = TGSI_FILE_NULL;
- cp.xmm[i].dirty = 0;
- }
- }
- }
-
- if (cp.error)
- goto fail;
-
- if (cp.vaos->base.key.clip) {
- /* not really handling clipping, just do the rhw so we can
- * see the results...
- */
- emit_rhw_viewport(&cp);
- }
- else if (cp.vaos->base.key.viewport) {
- emit_viewport(&cp);
- }
-
- /* Emit output... TODO: do this eagerly after the last write to a
- * given output.
- */
- if (!aos_emit_outputs( &cp ))
- goto fail;
-
-
- /* Next vertex:
- */
- x86_lea(cp.func,
- cp.outbuf_ECX,
- x86_make_disp(cp.outbuf_ECX,
- cp.vaos->base.key.output_stride));
-
- /* Incr index
- */
- aos_incr_inputs( &cp, linear );
- }
- /* decr count, loop if not zero
- */
- x86_dec(cp.func, cp.count_ESI);
- x86_jcc(cp.func, cc_NZ, label);
-
- restore_fpu_state(&cp);
-
- /* Land forward jump here:
- */
- x86_fixup_fwd_jump(cp.func, fixup);
-
- /* Exit mmx state?
- */
- if (cp.func->need_emms)
- mmx_emms(cp.func);
-
- x86_pop(cp.func, cp.temp_EBP);
- x86_pop(cp.func, cp.count_ESI);
- x86_pop(cp.func, cp.idx_EBX);
-
- x87_assert_stack_empty(cp.func);
- x86_ret(cp.func);
-
- tgsi_parse_free( &parse );
- return !cp.error;
-
- fail:
- tgsi_parse_free( &parse );
- return FALSE;
-}
-
-
-/** cast wrapper */
-static INLINE struct draw_vs_variant_aos_sse *
-draw_vs_variant_aos_sse(struct draw_vs_variant *variant)
-{
- return (struct draw_vs_variant_aos_sse *) variant;
-}
-
-
-static void vaos_set_buffer( struct draw_vs_variant *variant,
- unsigned buf,
- const void *ptr,
- unsigned stride,
- unsigned max_stride)
-{
- struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
-
- if (buf < vaos->nr_vb) {
- vaos->buffer[buf].base_ptr = (char *)ptr;
- vaos->buffer[buf].stride = stride;
- }
-
- if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
-}
-
-
-
-static void PIPE_CDECL vaos_run_elts( struct draw_vs_variant *variant,
- const unsigned *elts,
- unsigned count,
- void *output_buffer )
-{
- struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
- struct aos_machine *machine = vaos->draw->vs.aos_machine;
- unsigned i;
-
- if (0) debug_printf("%s %d\n", __FUNCTION__, count);
-
- machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
- for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
- machine->constants[i] = vaos->draw->vs.aligned_constants[i];
- }
- machine->immediates = vaos->base.vs->immediates;
- machine->buffer = vaos->buffer;
-
- vaos->gen_run_elts( machine,
- elts,
- count,
- output_buffer );
-}
-
-static void PIPE_CDECL vaos_run_linear( struct draw_vs_variant *variant,
- unsigned start,
- unsigned count,
- void *output_buffer )
-{
- struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
- struct aos_machine *machine = vaos->draw->vs.aos_machine;
- unsigned i;
-
- if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
- vaos->base.key.const_vbuffers);
-
- machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
- for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
- machine->constants[i] = vaos->draw->vs.aligned_constants[i];
- }
- machine->immediates = vaos->base.vs->immediates;
- machine->buffer = vaos->buffer;
-
- vaos->gen_run_linear( machine,
- start,
- count,
- output_buffer );
-
- /* Sanity spot checks to make sure we didn't trash our constants */
- assert(machine->internal[IMM_ONES][0] == 1.0f);
- assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
- assert(machine->internal[IMM_NEGS][0] == -1.0f);
-}
-
-
-
-static void vaos_destroy( struct draw_vs_variant *variant )
-{
- struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
-
- FREE( vaos->buffer );
-
- x86_release_func( &vaos->func[0] );
- x86_release_func( &vaos->func[1] );
-
- FREE(vaos);
-}
-
-
-
-static struct draw_vs_variant *variant_aos_sse( struct draw_vertex_shader *vs,
- const struct draw_vs_variant_key *key )
-{
- unsigned i;
- struct draw_vs_variant_aos_sse *vaos = CALLOC_STRUCT(draw_vs_variant_aos_sse);
-
- if (!vaos)
- goto fail;
-
- vaos->base.key = *key;
- vaos->base.vs = vs;
- vaos->base.set_buffer = vaos_set_buffer;
- vaos->base.destroy = vaos_destroy;
- vaos->base.run_linear = vaos_run_linear;
- vaos->base.run_elts = vaos_run_elts;
-
- vaos->draw = vs->draw;
-
- for (i = 0; i < key->nr_inputs; i++)
- vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
-
- vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
- if (!vaos->buffer)
- goto fail;
-
- if (0)
- debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
-
-#if 0
- tgsi_dump(vs->state.tokens, 0);
-#endif
-
- if (!build_vertex_program( vaos, TRUE ))
- goto fail;
-
- if (!build_vertex_program( vaos, FALSE ))
- goto fail;
-
- vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
- if (!vaos->gen_run_linear)
- goto fail;
-
- vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
- if (!vaos->gen_run_elts)
- goto fail;
-
- return &vaos->base;
-
- fail:
- if (vaos && vaos->buffer)
- FREE(vaos->buffer);
-
- if (vaos)
- x86_release_func( &vaos->func[0] );
-
- if (vaos)
- x86_release_func( &vaos->func[1] );
-
- FREE(vaos);
-
- return NULL;
-}
-
-
-struct draw_vs_variant *
-draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs,
- const struct draw_vs_variant_key *key )
-{
- struct draw_vs_variant *variant = variant_aos_sse( vs, key );
-
- if (variant == NULL) {
- variant = draw_vs_create_variant_generic( vs, key );
- }
-
- return variant;
-}
-
-
-
-#endif /* PIPE_ARCH_X86 */
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
deleted file mode 100644
index 55e63d8b9fa..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/* Authors: Keith Whitwell <[email protected]>
- */
-
-#ifndef DRAW_VS_AOS_H
-#define DRAW_VS_AOS_H
-
-#include "pipe/p_config.h"
-#include "tgsi/tgsi_exec.h"
-#include "draw_vs.h"
-
-#ifdef PIPE_ARCH_X86
-
-struct tgsi_token;
-struct x86_function;
-
-#include "pipe/p_state.h"
-#include "rtasm/rtasm_x86sse.h"
-
-
-
-
-
-#define X 0
-#define Y 1
-#define Z 2
-#define W 3
-
-#define MAX_INPUTS PIPE_MAX_ATTRIBS
-#define MAX_OUTPUTS PIPE_MAX_SHADER_OUTPUTS
-#define MAX_TEMPS TGSI_EXEC_NUM_TEMPS
-#define MAX_CONSTANTS 1024 /** only used for sanity checking */
-#define MAX_IMMEDIATES 1024 /** only used for sanity checking */
-#define MAX_INTERNALS 8 /** see IMM_x values below */
-
-#define AOS_FILE_INTERNAL TGSI_FILE_COUNT
-
-#define FPU_RND_NEG 1
-#define FPU_RND_NEAREST 2
-
-struct aos_machine;
-typedef void (PIPE_CDECL *lit_func)( struct aos_machine *,
- float *result,
- const float *in,
- unsigned count );
-
-void PIPE_CDECL aos_do_lit( struct aos_machine *machine,
- float *result,
- const float *in,
- unsigned count );
-
-/* Table of pow() samples for one specular exponent, consulted by the
- * table-driven LIT helper.
- */
-struct shine_tab {
- /* Unclamped exponent the table was built for; compared against in[3]. */
- float exponent;
- /* values[0] is 0; values[i] is powf(i / 256, clamped exponent) for
-  * i = 1..257 (all 1.0 when the clamped exponent is 0).
-  */
- float values[258];
- /* Value of aos_machine::now when the table was last selected. */
- unsigned last_used;
-};
-
-/* Per-LIT-slot dispatch state: which implementation to call and, for the
- * table-driven implementation, which shine table to read.
- */
-struct lit_info {
- /* Current implementation: populate_lut, do_lit_lut or aos_do_lit. */
- lit_func func;
- /* Table selected by populate_lut(); read by do_lit_lut(). */
- struct shine_tab *shine_tab;
-};
-
-#define MAX_SHINE_TAB 4
-#define MAX_LIT_INFO 16
-
-/* Description of one vertex buffer as seen by the generated code. */
-struct aos_buffer {
- /* Start of the buffer; indexed fetches compute stride * elt + base_ptr. */
- const void *base_ptr;
- /* Distance in bytes between consecutive vertices. */
- unsigned stride;
- /* Running fetch pointer for linear runs with several buffers; the
-  * generated code reads it and stores back ptr + stride.
-  */
- void *ptr; /* updated per vertex */
-};
-
-
-
-
-/* This is the temporary storage used by all the aos_sse vs variants.
- * Create one per context and reuse by passing a pointer in at
- * vs_variant creation??
- *
- * Instances are created with 16-byte alignment and zero-filled by
- * draw_vs_aos_machine().
- */
-struct aos_machine {
- float input [MAX_INPUTS ][4];
- float output [MAX_OUTPUTS ][4];
- float temp [MAX_TEMPS ][4];
- /* Built-in constants, indexed by the IMM_* values. */
- float internal [MAX_INTERNALS ][4];
-
- float scale[4]; /* viewport */
- float translate[4]; /* viewport */
-
- float tmp[2][4]; /* scratch space for LIT */
-
- struct shine_tab shine_tab[MAX_SHINE_TAB];
- /* Indexed by the `count` argument passed to the LIT callbacks. */
- struct lit_info lit_info[MAX_LIT_INFO];
- /* Counter advanced whenever constants are set; compared with
-  * shine_tab::last_used to tell whether a table is still in use.
-  */
- unsigned now;
-
-
- /* x87 control words prepared in draw_vs_aos_machine(). */
- ushort fpu_rnd_nearest;
- ushort fpu_rnd_neg_inf;
- ushort fpu_restore;
- ushort fpucntl; /* one of FPU_* above */
-
- const float (*immediates)[4]; /* points to shader data */
- const void *constants[PIPE_MAX_CONSTANT_BUFFERS]; /* points to draw data */
-
- const struct aos_buffer *buffer; /* points to ? */
-};
-
-
-
-
-/* State carried through the translation of one generated function. */
-struct aos_compilation {
- /* Function the instructions are emitted into. */
- struct x86_function *func;
- /* Variant being built; supplies the key (formats, offsets, buffers). */
- struct draw_vs_variant_aos_sse *vaos;
-
- /* Bumped by the emitters after each emitted piece of code. */
- unsigned insn_counter;
- unsigned num_immediates;
- unsigned count;
- unsigned lit_count;
-
- /* Bookkeeping for the eight XMM registers: which shader register
-  * (file, idx) each holds and whether it is dirty.
-  */
- struct {
- unsigned idx:16;
- unsigned file:8;
- unsigned dirty:8;
- unsigned last_used;
- } xmm[8];
-
- unsigned x86_reg[2]; /* one of X86_* */
-
- boolean input_fetched[PIPE_MAX_ATTRIBS];
- unsigned output_last_write[PIPE_MAX_ATTRIBS];
-
- boolean have_sse2;
- /* Set to 1 by AOS_ERROR() when translation fails. */
- boolean error;
- short fpucntl;
-
- /* these are actually known values, but putting them in a struct
- * like this is helpful to keep them in sync across the file.
- */
- struct x86_reg tmp_EAX;
- struct x86_reg idx_EBX; /* either start+i or &elt[i] */
- struct x86_reg outbuf_ECX;
- struct x86_reg machine_EDX;
- struct x86_reg count_ESI; /* decrements to zero */
- struct x86_reg temp_EBP;
- struct x86_reg stack_ESP;
-};
-
-struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp );
-void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx );
-
-void aos_adopt_xmm_reg( struct aos_compilation *cp,
- struct x86_reg reg,
- unsigned file,
- unsigned idx,
- unsigned dirty );
-
-void aos_spill_all( struct aos_compilation *cp );
-
-struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
- unsigned file,
- unsigned idx );
-
-boolean aos_init_inputs( struct aos_compilation *cp, boolean linear );
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear );
-boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear );
-
-boolean aos_emit_outputs( struct aos_compilation *cp );
-
-
-#define IMM_ONES 0 /* 1, 1,1,1 */
-#define IMM_SWZ 1 /* 1,-1,0, 0xffffffff */
-#define IMM_IDENTITY 2 /* 0, 0,0,1 */
-#define IMM_INV_255 3 /* 1/255, 1/255, 1/255, 1/255 */
-#define IMM_255 4 /* 255, 255, 255, 255 */
-#define IMM_NEGS 5 /* -1,-1,-1,-1 */
-#define IMM_RSQ 6 /* -.5,1.5,_,_ */
-#define IMM_PSIZE 7 /* not really an immediate - updated each run */
-
-struct x86_reg aos_get_internal( struct aos_compilation *cp,
- unsigned imm );
-struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
- unsigned imm );
-
-
-/* Flag the compilation `cp` as failed.
- *
- * The debug_printf() call is compiled but disabled by `if (0)`; enable it
- * to see which translation step gave up.  Both macro arguments are
- * parenthesized so that any expression may be passed safely.
- */
-#define AOS_ERROR(cp, msg) \
-do { \
-   if (0) debug_printf("%s: x86 translation failed: %s\n", __FUNCTION__, (msg)); \
-   (cp)->error = 1; \
-} while (0)
-
-
-#define X86_NULL 0
-#define X86_IMMEDIATES 1
-#define X86_CONSTANTS 2
-#define X86_BUFFERS 3
-
-struct x86_reg aos_get_x86( struct aos_compilation *cp,
- unsigned which_reg,
- unsigned value );
-
-
-typedef void (PIPE_CDECL *vaos_run_elts_func)( struct aos_machine *,
- const unsigned *elts,
- unsigned count,
- void *output_buffer);
-
-typedef void (PIPE_CDECL *vaos_run_linear_func)( struct aos_machine *,
- unsigned start,
- unsigned count,
- void *output_buffer);
-
-
-/* A vertex shader variant implemented with generated x86/SSE code. */
-struct draw_vs_variant_aos_sse {
- /* Generic variant interface; must stay first so &vaos->base can be
-  * handed out as the variant pointer.
-  */
- struct draw_vs_variant base;
- struct draw_context *draw;
-
- /* Array of nr_vb vertex buffer descriptions. */
- struct aos_buffer *buffer;
- unsigned nr_vb;
-
- /* Entry points obtained from func[0] and func[1] respectively. */
- vaos_run_linear_func gen_run_linear;
- vaos_run_elts_func gen_run_elts;
-
-
- struct x86_function func[2];
-};
-
-
-#endif
-
-#endif
-
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
deleted file mode 100644
index f1dd4487732..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ /dev/null
@@ -1,460 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include "util/u_memory.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
-#include "draw_vs.h"
-#include "draw_vs_aos.h"
-#include "draw_vertex.h"
-
-#include "rtasm/rtasm_x86sse.h"
-
-#ifdef PIPE_ARCH_X86
-
-/* Note - don't yet have to worry about interacting with the code in
- * draw_vs_aos.c as there is no intermingling of generated code...
- * That may have to change, we'll see.
- */
-/* Emit a movups that loads the four floats at src_ptr into data. */
-static void emit_load_R32G32B32A32( struct aos_compilation *cp,
- struct x86_reg data,
- struct x86_reg src_ptr )
-{
- sse_movups(cp->func, data, src_ptr);
-}
-
-/* Emit code that loads three floats at src_ptr into data as (x, y, z, 1).
- *
- * The enabled (#if 1) sequence fetches z with a scalar load and x,y with
- * a 64-bit load, taking the 0/1 padding from IMM_IDENTITY.  The disabled
- * alternative does a full movups and fixes up w with SSE2 shuffles.
- */
-static void emit_load_R32G32B32( struct aos_compilation *cp,
- struct x86_reg data,
- struct x86_reg src_ptr )
-{
-#if 1
- sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
- /* data = z ? ? ? */
- sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
- /* data = z ? 0 1 */
- sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) );
- /* data = ? 0 z 1 */
- sse_movlps(cp->func, data, src_ptr);
- /* data = x y z 1 */
-#else
- sse_movups(cp->func, data, src_ptr);
- /* data = x y z ? */
- sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) );
- /* data = ? x y z */
- sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) );
- /* data = 1 x y z */
- sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) );
- /* data = x y z 1 */
-#endif
-}
-
-/* Emit code that loads two floats at src_ptr into data as (x, y, 0, 1):
- * start from IMM_IDENTITY (0, 0, 0, 1) and overwrite the low two lanes.
- */
-static void emit_load_R32G32( struct aos_compilation *cp,
- struct x86_reg data,
- struct x86_reg src_ptr )
-{
- sse_movups(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) );
- sse_movlps(cp->func, data, src_ptr);
-}
-
-
-/* Emit code that loads one float at src_ptr and ORs in IMM_IDENTITY
- * (0, 0, 0, 1).  The intended result is (x, 0, 0, 1), which relies on the
- * scalar load leaving the upper three lanes zero.
- */
-static void emit_load_R32( struct aos_compilation *cp,
- struct x86_reg data,
- struct x86_reg src_ptr )
-{
- sse_movss(cp->func, data, src_ptr);
- sse_orps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) );
-}
-
-
-/* Emit code that loads four unsigned bytes at src_ptr and converts them
- * to floats scaled by 1/255.
- *
- * The two punpcklbw steps widen each byte to a dword by interleaving with
- * the low bytes of IMM_IDENTITY, whose first three floats are 0.0.
- */
-static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp,
- struct x86_reg data,
- struct x86_reg src_ptr )
-{
- sse_movss(cp->func, data, src_ptr);
- sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ));
- sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ));
- sse2_cvtdq2ps(cp->func, data, data);
- sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255));
-}
-
-
-
-/* Emit a shufps of src into dest using the immediate `shuffle` (built
- * with SHUF()).  Only plain component shuffles are handled.
- *
- * Extended swizzles? Maybe later.
- */
-static void emit_swizzle( struct aos_compilation *cp,
- struct x86_reg dest,
- struct x86_reg src,
- ubyte shuffle )
-{
- sse_shufps(cp->func, dest, src, shuffle);
-}
-
-
-
-/* Emit code that leaves the address of the current vertex of buffer
- * `buf_idx` in register `ptr`.
- *
- * linear:  ptr is read from the buffer's running `ptr` field, and
- *          ptr + stride is stored back for the next vertex.  `elt` is
- *          used as scratch for that sum, so its previous contents are
- *          overwritten.
- * indexed: ptr = stride * elt + base_ptr.
- *
- * Always returns TRUE.
- */
-static boolean get_buffer_ptr( struct aos_compilation *cp,
- boolean linear,
- unsigned buf_idx,
- struct x86_reg elt,
- struct x86_reg ptr)
-{
- struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
- buf_idx * sizeof(struct aos_buffer));
-
- struct x86_reg buf_stride = x86_make_disp(buf,
- Offset(struct aos_buffer, stride));
- if (linear) {
- struct x86_reg buf_ptr = x86_make_disp(buf,
- Offset(struct aos_buffer, ptr));
-
-
- /* Calculate pointer to current attrib:
- */
- x86_mov(cp->func, ptr, buf_ptr);
- x86_mov(cp->func, elt, buf_stride);
- x86_add(cp->func, elt, ptr);
- /* Prefetch 192 bytes past the next vertex, for the first buffer only. */
- if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192));
- x86_mov(cp->func, buf_ptr, elt);
- }
- else {
- struct x86_reg buf_base_ptr = x86_make_disp(buf,
- Offset(struct aos_buffer, base_ptr));
-
-
- /* Calculate pointer to current attrib:
- */
- x86_mov(cp->func, ptr, buf_stride);
- x86_imul(cp->func, ptr, elt);
- x86_add(cp->func, ptr, buf_base_ptr);
- }
-
- cp->insn_counter++;
-
- return TRUE;
-}
-
-
-/**
- * Emit code that fetches vertex element `idx` through `bufptr` and expands
- * it to four floats in a newly obtained XMM register.  The register is
- * registered as the dirty TGSI_FILE_INPUT register `idx` before the load
- * is emitted.
- *
- * Returns FALSE, with the compile error flag set, for unsupported formats.
- */
-static boolean load_input( struct aos_compilation *cp,
-                           unsigned idx,
-                           struct x86_reg bufptr )
-{
-   unsigned fmt = cp->vaos->base.key.element[idx].in.format;
-   unsigned byte_offset = cp->vaos->base.key.element[idx].in.offset;
-   struct x86_reg xmm = aos_get_xmm_reg(cp);
-
-   /* Address of the attribute inside the current vertex. */
-   struct x86_reg addr = x86_make_disp(bufptr, byte_offset);
-
-   aos_adopt_xmm_reg( cp, xmm, TGSI_FILE_INPUT, idx, TRUE );
-
-   switch (fmt) {
-   case PIPE_FORMAT_R32_FLOAT:
-      emit_load_R32(cp, xmm, addr);
-      return TRUE;
-
-   case PIPE_FORMAT_R32G32_FLOAT:
-      emit_load_R32G32(cp, xmm, addr);
-      return TRUE;
-
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      emit_load_R32G32B32(cp, xmm, addr);
-      return TRUE;
-
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      emit_load_R32G32B32A32(cp, xmm, addr);
-      return TRUE;
-
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      /* Same byte load as RGBA8, followed by a swap of the X and Z lanes. */
-      emit_load_R8G8B8A8_UNORM(cp, xmm, addr);
-      emit_swizzle(cp, xmm, xmm, SHUF(Z,Y,X,W));
-      return TRUE;
-
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      emit_load_R8G8B8A8_UNORM(cp, xmm, addr);
-      return TRUE;
-
-   default:
-      AOS_ERROR(cp, "unhandled input format");
-      return FALSE;
-   }
-}
-
-/**
- * Emit loads for every input element of the key that is sourced from
- * vertex buffer `buffer`, addressing the vertex through `ptr`.
- *
- * Stops and returns FALSE at the first element that cannot be loaded.
- */
-static boolean load_inputs( struct aos_compilation *cp,
-                            unsigned buffer,
-                            struct x86_reg ptr )
-{
-   unsigned elem;
-
-   for (elem = 0; elem < cp->vaos->base.key.nr_inputs; elem++) {
-      if (cp->vaos->base.key.element[elem].in.buffer != buffer)
-         continue;
-
-      if (!load_input( cp, elem, ptr ))
-         return FALSE;
-
-      cp->insn_counter++;
-   }
-
-   return TRUE;
-}
-
-/**
- * Emit the input-fetch prologue that runs once before the vertex loop.
- *
- * For every vertex buffer:
- *  - a constant buffer (bit set in key.const_vbuffers) has its inputs
- *    loaded once and spilled, so the loop body never fetches them again;
- *  - otherwise, in linear mode, the address stride * idx_EBX + base_ptr
- *    is computed and kept in idx_EBX (single buffer) or in the buffer's
- *    `ptr` field (several buffers).
- *
- * Returns FALSE if an input of a constant buffer cannot be translated.
- * That failure used to be ignored here.
- */
-boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
-{
-   unsigned i;
-
-   for (i = 0; i < cp->vaos->nr_vb; i++) {
-      struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
-                                         i * sizeof(struct aos_buffer));
-
-      struct x86_reg buf_base_ptr = x86_make_disp(buf,
-                                                  Offset(struct aos_buffer, base_ptr));
-
-      if (cp->vaos->base.key.const_vbuffers & (1<<i)) {
-         struct x86_reg ptr = cp->tmp_EAX;
-
-         x86_mov(cp->func, ptr, buf_base_ptr);
-
-         /* Load all inputs for this constant vertex buffer
-          */
-         if (!load_inputs( cp, i, x86_deref(ptr) ))
-            return FALSE;
-
-         /* Then just force them out to aos_machine.input[]
-          */
-         aos_spill_all( cp );
-      }
-      else if (linear) {
-         struct x86_reg elt = cp->idx_EBX;
-         struct x86_reg ptr = cp->tmp_EAX;
-
-         struct x86_reg buf_stride = x86_make_disp(buf,
-                                                   Offset(struct aos_buffer, stride));
-
-         struct x86_reg buf_ptr = x86_make_disp(buf,
-                                                Offset(struct aos_buffer, ptr));
-
-         /* Calculate pointer to current attrib:
-          */
-         x86_mov(cp->func, ptr, buf_stride);
-         x86_imul(cp->func, ptr, elt);
-         x86_add(cp->func, ptr, buf_base_ptr);
-
-         /* In the linear case, keep the buffer pointer instead of the
-          * index number.
-          */
-         if (cp->vaos->nr_vb == 1)
-            x86_mov( cp->func, elt, ptr );
-         else
-            x86_mov( cp->func, buf_ptr, ptr );
-
-         cp->insn_counter++;
-      }
-   }
-
-   return TRUE;
-}
-
-/**
- * Emit the per-vertex input fetch for every non-constant vertex buffer.
- *
- * With a single buffer in linear mode, idx_EBX already holds the vertex
- * address and is used directly.  Otherwise the address is produced by
- * get_buffer_ptr() in tmp_EAX, from idx_EBX itself (linear) or from the
- * index it points at (indexed).
- *
- * Returns FALSE as soon as any input cannot be translated.  The result of
- * load_inputs() in the single-buffer linear path used to be ignored.
- */
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
-{
-   unsigned j;
-
-   for (j = 0; j < cp->vaos->nr_vb; j++) {
-      if (cp->vaos->base.key.const_vbuffers & (1<<j)) {
-         /* just retrieve pre-transformed input */
-      }
-      else if (linear && cp->vaos->nr_vb == 1) {
-         if (!load_inputs( cp, 0, cp->idx_EBX ))
-            return FALSE;
-      }
-      else {
-         struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
-         struct x86_reg ptr = cp->tmp_EAX;
-
-         if (!get_buffer_ptr( cp, linear, j, elt, ptr ))
-            return FALSE;
-
-         if (!load_inputs( cp, j, ptr ))
-            return FALSE;
-      }
-   }
-
-   return TRUE;
-}
-
-/**
- * Emit the code that advances idx_EBX to the next vertex.
- *
- *  - indexed: step to the next 4-byte element index;
- *  - linear, one buffer: add buffer 0's stride to the vertex pointer and
- *    prefetch 192 bytes ahead of it;
- *  - linear, several buffers: nothing to emit here.
- *
- * Always returns TRUE.
- */
-boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
-{
-   if (!linear) {
-      x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
-      return TRUE;
-   }
-
-   if (cp->vaos->nr_vb == 1) {
-      struct x86_reg buf0_stride =
-         x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
-                       (0 * sizeof(struct aos_buffer) +
-                        Offset(struct aos_buffer, stride)));
-
-      x86_add(cp->func, cp->idx_EBX, buf0_stride);
-      sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192));
-   }
-
-   /* Linear with several buffers: nothing to do */
-   return TRUE;
-}
-
-
-
-
-
-
-/* Emit a movups that stores all four floats of dataXMM to dst_ptr. */
-static void emit_store_R32G32B32A32( struct aos_compilation *cp,
- struct x86_reg dst_ptr,
- struct x86_reg dataXMM )
-{
- sse_movups(cp->func, dst_ptr, dataXMM);
-}
-
-/* Emit code that stores x,y,z of dataXMM to dst_ptr: x,y with one 64-bit
- * store, then z as a scalar at offset 8.  The shuffle that brings z into
- * lane 0 overwrites dataXMM, so the register no longer holds the original
- * vector afterwards.
- */
-static void emit_store_R32G32B32( struct aos_compilation *cp,
- struct x86_reg dst_ptr,
- struct x86_reg dataXMM )
-{
- sse_movlps(cp->func, dst_ptr, dataXMM);
- sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
- sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM);
-}
-
-/* Emit a movlps that stores the low two floats of dataXMM to dst_ptr. */
-static void emit_store_R32G32( struct aos_compilation *cp,
- struct x86_reg dst_ptr,
- struct x86_reg dataXMM )
-{
- sse_movlps(cp->func, dst_ptr, dataXMM);
-}
-
-/* Emit a movss that stores the lowest float of dataXMM to dst_ptr. */
-static void emit_store_R32( struct aos_compilation *cp,
- struct x86_reg dst_ptr,
- struct x86_reg dataXMM )
-{
- sse_movss(cp->func, dst_ptr, dataXMM);
-}
-
-
-
-/* Emit code that converts dataXMM to four unsigned bytes and stores them
- * at dst_ptr: scale by 255, convert to integers, then narrow dword->word
- * and word->byte with the pack instructions.  dataXMM is overwritten.
- */
-static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp,
- struct x86_reg dst_ptr,
- struct x86_reg dataXMM )
-{
- sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255));
- sse2_cvtps2dq(cp->func, dataXMM, dataXMM);
- sse2_packssdw(cp->func, dataXMM, dataXMM);
- sse2_packuswb(cp->func, dataXMM, dataXMM);
- sse_movss(cp->func, dst_ptr, dataXMM);
-}
-
-
-
-
-
-/**
- * Emit the store of `dataXMM` to `ptr` in the layout named by `format`.
- *
- * Returns FALSE, with the compile error flag set, for formats that have
- * no store routine.
- */
-static boolean emit_output( struct aos_compilation *cp,
-                            struct x86_reg ptr,
-                            struct x86_reg dataXMM,
-                            enum attrib_emit format )
-{
-   switch (format) {
-   case EMIT_1F:
-   case EMIT_1F_PSIZE:
-      emit_store_R32(cp, ptr, dataXMM);
-      return TRUE;
-
-   case EMIT_2F:
-      emit_store_R32G32(cp, ptr, dataXMM);
-      return TRUE;
-
-   case EMIT_3F:
-      emit_store_R32G32B32(cp, ptr, dataXMM);
-      return TRUE;
-
-   case EMIT_4F:
-      emit_store_R32G32B32A32(cp, ptr, dataXMM);
-      return TRUE;
-
-   case EMIT_4UB:
-      emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
-      return TRUE;
-
-   case EMIT_4UB_BGRA:
-      /* Swap the X and Z lanes, then store as plain 4UB. */
-      emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
-      emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
-      return TRUE;
-
-   default:
-      AOS_ERROR(cp, "unhandled output format");
-      return FALSE;
-   }
-}
-
-
-
-/**
- * Emit the stores that write every output element of the key to the
- * output vertex addressed by outbuf_ECX.
- *
- * The value comes from the IMM_PSIZE internal for EMIT_1F_PSIZE and from
- * the shader output register otherwise.  It is first copied into an XMM
- * register if it is not already in one, and that register is released
- * after the store.
- *
- * Returns FALSE if an element has an output format that cannot be emitted.
- */
-boolean aos_emit_outputs( struct aos_compilation *cp )
-{
-   unsigned out;
-
-   for (out = 0; out < cp->vaos->base.key.nr_outputs; out++) {
-      enum attrib_emit emit = cp->vaos->base.key.element[out].out.format;
-      unsigned dst_offset = cp->vaos->base.key.element[out].out.offset;
-      unsigned src_output = cp->vaos->base.key.element[out].out.vs_output;
-
-      struct x86_reg value = (emit == EMIT_1F_PSIZE)
-         ? aos_get_internal_xmm( cp, IMM_PSIZE )
-         : aos_get_shader_reg( cp, TGSI_FILE_OUTPUT, src_output );
-
-      if (value.file != file_XMM) {
-         struct x86_reg xmm = aos_get_xmm_reg( cp );
-         sse_movaps(cp->func, xmm, value);
-         value = xmm;
-      }
-
-      if (!emit_output( cp,
-                        x86_make_disp( cp->outbuf_ECX, dst_offset ),
-                        value,
-                        emit ))
-         return FALSE;
-
-      aos_release_xmm_reg( cp, value.idx );
-
-      cp->insn_counter++;
-   }
-
-   return TRUE;
-}
-
-#endif
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
deleted file mode 100644
index 0eda414ee6a..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
+++ /dev/null
@@ -1,328 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include "pipe/p_config.h"
-
-
-#include "pipe/p_shader_tokens.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
-#include "draw_vs.h"
-#include "draw_vs_aos.h"
-#include "draw_vertex.h"
-
-#ifdef PIPE_ARCH_X86
-
-#include "rtasm/rtasm_x86sse.h"
-
-
-#define X87_CW_EXCEPTION_INV_OP (1<<0)
-#define X87_CW_EXCEPTION_DENORM_OP (1<<1)
-#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
-#define X87_CW_EXCEPTION_OVERFLOW (1<<3)
-#define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
-#define X87_CW_EXCEPTION_PRECISION (1<<5)
-#define X87_CW_PRECISION_SINGLE (0<<8)
-#define X87_CW_PRECISION_RESERVED (1<<8)
-#define X87_CW_PRECISION_DOUBLE (2<<8)
-#define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
-#define X87_CW_PRECISION_MASK (3<<8)
-#define X87_CW_ROUND_NEAREST (0<<10)
-#define X87_CW_ROUND_DOWN (1<<10)
-#define X87_CW_ROUND_UP (2<<10)
-#define X87_CW_ROUND_ZERO (3<<10)
-#define X87_CW_ROUND_MASK (3<<10)
-#define X87_CW_INFINITY (1<<12)
-
-
-/**
- * Reference LIT implementation using powf().
- *
- * result = (1, in[0], in[1]^in[3], 1) when in[0] > 0 and in[1] > 0, with
- * the exponent clamped to +/-(128 - 1/256).  The z term is 0 when
- * in[1] <= 0, and both y and z are 0 when in[0] is not greater than 0.
- * `machine` and `count` are unused; they exist to match lit_func.
- */
-void PIPE_CDECL aos_do_lit( struct aos_machine *machine,
-                            float *result,
-                            const float *in,
-                            unsigned count )
-{
-   if (!(in[0] > 0)) {
-      result[0] = 1.0F;
-      result[1] = 0.0;
-      result[2] = 0.0;
-      result[3] = 1.0F;
-      return;
-   }
-
-   if (in[1] <= 0.0) {
-      result[0] = 1.0F;
-      result[1] = in[0];
-      result[2] = 0.0F;
-      result[3] = 1.0F;
-      return;
-   }
-
-   {
-      const float epsilon = 1.0F / 256.0F;
-      float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon));
-
-      result[0] = 1.0F;
-      result[1] = in[0];
-      result[2] = powf(in[1], exponent);
-      result[3] = 1.0;
-   }
-}
-
-
-/**
- * Table-driven LIT for slot `count`.
- *
- * When the slot's shine table was built for exponent in[3] and in[1] is
- * in (0, 1], the z term is linearly interpolated between neighbouring
- * table entries.  If the exponent no longer matches, the slot is switched
- * to aos_do_lit for later calls.  That case, and in[1] > 1, compute the
- * z term with powf() here.
- */
-static void PIPE_CDECL do_lit_lut( struct aos_machine *machine,
-                                   float *result,
-                                   const float *in,
-                                   unsigned count )
-{
-   if (!(in[0] > 0)) {
-      result[0] = 1.0F;
-      result[1] = 0.0;
-      result[2] = 0.0;
-      result[3] = 1.0F;
-      return;
-   }
-
-   if (in[1] <= 0.0) {
-      result[0] = 1.0F;
-      result[1] = in[0];
-      result[2] = 0.0F;
-      result[3] = 1.0F;
-      return;
-   }
-
-   if (machine->lit_info[count].shine_tab->exponent != in[3]) {
-      /* Table is stale for this exponent: stop using it for this slot. */
-      machine->lit_info[count].func = aos_do_lit;
-   }
-   else if (in[1] <= 1.0) {
-      const float *tab = machine->lit_info[count].shine_tab->values;
-      float f = in[1] * 256;
-      int k = (int)f;
-      float frac = f - (float)k;
-
-      result[0] = 1.0F;
-      result[1] = in[0];
-      result[2] = tab[k] + frac*(tab[k+1]-tab[k]);
-      result[3] = 1.0;
-      return;
-   }
-
-   /* No table result available: compute the power directly. */
-   {
-      const float epsilon = 1.0F / 256.0F;
-      float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon));
-
-      result[0] = 1.0F;
-      result[1] = in[0];
-      result[2] = powf(in[1], exponent);
-      result[3] = 1.0;
-   }
-}
-
-
-/**
- * Fill `tab` for the given exponent.
- *
- * The unclamped exponent is recorded for later comparison.  The samples
- * use the exponent clamped to +/-(128 - 1/256): entry 0 is 0 and entry i
- * is (i / 256) ^ exponent, or exactly 1.0 when the exponent is 0.
- */
-static void do_populate_lut( struct shine_tab *tab,
-                             float unclamped_exponent )
-{
-   const float epsilon = 1.0F / 256.0F;
-   const float limit = 128.0F - epsilon;
-   float exponent = CLAMP(unclamped_exponent, -limit, limit);
-   unsigned entry;
-
-   tab->exponent = unclamped_exponent; /* for later comparison */
-
-   tab->values[0] = 0;
-
-   for (entry = 1; entry < 258; entry++) {
-      if (exponent == 0)
-         tab->values[entry] = 1.0;
-      else
-         tab->values[entry] = powf((float)entry * epsilon, exponent);
-   }
-}
-
-
-
-
-/**
- * First-use LIT handler for slot `count`: choose a shine table for the
- * exponent in[3], install the matching implementation in the slot, and
- * evaluate LIT once through it.
- *
- * A table already built for this exponent is reused.  Otherwise the least
- * recently used table is rebuilt, unless even that one was used during
- * the current `now` period, in which case the slot falls back to
- * aos_do_lit.
- *
- * The table count is MAX_SHINE_TAB, the size of machine->shine_tab,
- * rather than a literal 4, so the loops cannot drift from the array size.
- */
-static void PIPE_CDECL populate_lut( struct aos_machine *machine,
-                                     float *result,
-                                     const float *in,
-                                     unsigned count )
-{
-   unsigned i, tab;
-
-   /* Search for an existing table for this value.  Note that without
-    * static analysis we don't really know if in[3] will be constant,
-    * but it usually is...
-    */
-   for (tab = 0; tab < MAX_SHINE_TAB; tab++) {
-      if (machine->shine_tab[tab].exponent == in[3]) {
-         goto found;
-      }
-   }
-
-   /* Pick the least recently used table as the replacement candidate. */
-   for (tab = 0, i = 1; i < MAX_SHINE_TAB; i++) {
-      if (machine->shine_tab[i].last_used < machine->shine_tab[tab].last_used)
-         tab = i;
-   }
-
-   if (machine->shine_tab[tab].last_used == machine->now) {
-      /* No unused tables (this is not a ffvertex program...).  Just
-       * call pow each time:
-       */
-      machine->lit_info[count].func = aos_do_lit;
-      machine->lit_info[count].func( machine, result, in, count );
-      return;
-   }
-   else {
-      do_populate_lut( &machine->shine_tab[tab], in[3] );
-   }
-
- found:
-   machine->shine_tab[tab].last_used = machine->now;
-   machine->lit_info[count].shine_tab = &machine->shine_tab[tab];
-   machine->lit_info[count].func = do_lit_lut;
-   machine->lit_info[count].func( machine, result, in, count );
-}
-
-
-/**
- * Point constant buffer `slot` of the machine at `constants` and reset
- * every LIT slot to populate_lut, so shine tables are re-selected on the
- * next use.  `now` advances by MAX_LIT_INFO per call.
- */
-void
-draw_vs_aos_machine_constants(struct aos_machine *machine,
-                              unsigned slot,
-                              const void *constants)
-{
-   unsigned lit;
-
-   machine->constants[slot] = constants;
-
-   for (lit = 0; lit < MAX_LIT_INFO; lit++)
-      machine->lit_info[lit].func = populate_lut;
-
-   machine->now += MAX_LIT_INFO;
-}
-
-
-/**
- * Copy the four-component viewport scale and translate vectors into the
- * machine.
- */
-void draw_vs_aos_machine_viewport( struct aos_machine *machine,
-                                   const struct pipe_viewport_state *viewport )
-{
-   /* Both destinations are float[4]. */
-   memcpy(machine->scale, viewport->scale, sizeof(machine->scale));
-   memcpy(machine->translate, viewport->translate, sizeof(machine->translate));
-}
-
-
-
-/* Release a machine obtained from draw_vs_aos_machine(); it was
- * allocated with align_malloc(), hence align_free().
- */
-void draw_vs_aos_machine_destroy( struct aos_machine *machine )
-{
- align_free(machine);
-}
-
-/**
- * Allocate and initialise an aos_machine.
- *
- * The machine is 16-byte aligned and zero-filled.  The IMM_* internal
- * constants and the two x87 control words are then filled in, and every
- * shine table is pre-populated for exponent 1.0.
- *
- * Returns NULL if the allocation fails.  Free the result with
- * draw_vs_aos_machine_destroy().
- */
-struct aos_machine *draw_vs_aos_machine( void )
-{
-   struct aos_machine *machine;
-   unsigned i;
-   float inv = 1.0f/255.0f;
-   float f255 = 255.0f;
-   const unsigned swz_w_bits = 0xffffffff;
-
-   machine = align_malloc(sizeof(struct aos_machine), 16);
-   if (!machine)
-      return NULL;
-
-   memset(machine, 0, sizeof(*machine));
-
-   ASSIGN_4V(machine->internal[IMM_SWZ], 1.0f, -1.0f, 0.0f, 1.0f);
-   /* The w lane of IMM_SWZ is an all-ones bit mask, not a float value.
-    * Copy the bit pattern with memcpy rather than storing through an
-    * unsigned pointer, which breaks the strict-aliasing rules.
-    */
-   memcpy(&machine->internal[IMM_SWZ][3], &swz_w_bits, sizeof(swz_w_bits));
-
-   ASSIGN_4V(machine->internal[IMM_ONES], 1.0f, 1.0f, 1.0f, 1.0f);
-   ASSIGN_4V(machine->internal[IMM_NEGS], -1.0f, -1.0f, -1.0f, -1.0f);
-   ASSIGN_4V(machine->internal[IMM_IDENTITY], 0.0f, 0.0f, 0.0f, 1.0f);
-   ASSIGN_4V(machine->internal[IMM_INV_255], inv, inv, inv, inv);
-   ASSIGN_4V(machine->internal[IMM_255], f255, f255, f255, f255);
-   ASSIGN_4V(machine->internal[IMM_RSQ], -.5f, 1.5f, 0.0f, 0.0f);
-
-
-   machine->fpu_rnd_nearest = (X87_CW_EXCEPTION_INV_OP |
-                               X87_CW_EXCEPTION_DENORM_OP |
-                               X87_CW_EXCEPTION_ZERO_DIVIDE |
-                               X87_CW_EXCEPTION_OVERFLOW |
-                               X87_CW_EXCEPTION_UNDERFLOW |
-                               X87_CW_EXCEPTION_PRECISION |
-                               (1<<6) |
-                               X87_CW_ROUND_NEAREST |
-                               X87_CW_PRECISION_DOUBLE_EXT);
-
-   assert(machine->fpu_rnd_nearest == 0x37f);
-
-   /* Same control word, but rounding towards negative infinity. */
-   machine->fpu_rnd_neg_inf = (X87_CW_EXCEPTION_INV_OP |
-                               X87_CW_EXCEPTION_DENORM_OP |
-                               X87_CW_EXCEPTION_ZERO_DIVIDE |
-                               X87_CW_EXCEPTION_OVERFLOW |
-                               X87_CW_EXCEPTION_UNDERFLOW |
-                               X87_CW_EXCEPTION_PRECISION |
-                               (1<<6) |
-                               X87_CW_ROUND_DOWN |
-                               X87_CW_PRECISION_DOUBLE_EXT);
-
-   for (i = 0; i < MAX_SHINE_TAB; i++)
-      do_populate_lut( &machine->shine_tab[i], 1.0f );
-
-   return machine;
-}
-
-#else
-
-/* Non-x86 build: the machine does not exist, so the entry points below
- * are empty and the constructor returns NULL.
- */
-void draw_vs_aos_machine_viewport( struct aos_machine *machine,
- const struct pipe_viewport_state *viewport )
-{
-}
-
-/* Non-x86 build: no-op. */
-void
-draw_vs_aos_machine_constants(struct aos_machine *machine,
- unsigned slot,
- const void *constants)
-{
-}
-
-/* Non-x86 build: no-op. */
-void draw_vs_aos_machine_destroy( struct aos_machine *machine )
-{
-}
-
-/* Non-x86 build: there is never a machine to hand out. */
-struct aos_machine *draw_vs_aos_machine( void )
-{
- return NULL;
-}
-#endif
-
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index cf894bbe8af..7fb0e0953e2 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -185,12 +185,7 @@ draw_create_vs_ppc(struct draw_context *draw,
tgsi_scan_shader(templ->tokens, &vs->base.info);
vs->base.draw = draw;
-#if 0
- if (1)
- vs->base.create_variant = draw_vs_variant_aos_ppc;
- else
-#endif
- vs->base.create_variant = draw_vs_create_variant_generic;
+ vs->base.create_variant = draw_vs_create_variant_generic;
vs->base.prepare = vs_ppc_prepare;
vs->base.run_linear = vs_ppc_run_linear;
vs->base.delete = vs_ppc_delete;
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
deleted file mode 100644
index d918579bda4..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ /dev/null
@@ -1,225 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
- * Authors:
- * Keith Whitwell <[email protected]>
- * Brian Paul
- */
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "pipe/p_config.h"
-
-#include "draw_vs.h"
-
-#if defined(PIPE_ARCH_X86)
-
-#include "pipe/p_shader_tokens.h"
-
-#include "draw_private.h"
-#include "draw_context.h"
-
-#include "rtasm/rtasm_cpu.h"
-#include "rtasm/rtasm_x86sse.h"
-#include "tgsi/tgsi_sse2.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_exec.h"
-
-#define SSE_MAX_VERTICES 4
-
-
-/* Vertex shader whose TGSI program has been translated to SSE2 code. */
-struct draw_sse_vertex_shader {
- /* Generic shader interface; &vs->base is what callers receive. */
- struct draw_vertex_shader base;
- /* Generated code; released in vs_sse_delete(). */
- struct x86_function sse2_program;
-
- /* Entry point into sse2_program. */
- tgsi_sse2_vs_func func;
-
- /* Borrowed from draw->vs.machine; not freed by this shader. */
- struct tgsi_exec_machine *machine;
-};
-
-
-/**
- * Per-draw setup: hand the draw module's samplers to the exec machine
- * and, when the shader reads the instance id, store the current instance
- * id in the matching system value.
- */
-static void
-vs_sse_prepare( struct draw_vertex_shader *base,
-                struct draw_context *draw )
-{
-   struct tgsi_exec_machine *machine =
-      ((struct draw_sse_vertex_shader *)base)->machine;
-
-   machine->Samplers = draw->vs.samplers;
-
-   if (!base->info.uses_instanceid)
-      return;
-
-   {
-      unsigned slot = machine->SysSemanticToIndex[TGSI_SEMANTIC_INSTANCEID];
-      assert(slot < Elements(machine->SystemValue));
-      machine->SystemValue[slot][0] = base->draw->instance_id;
-   }
-}
-
-
-
-/* Simplified vertex shader interface for the pt paths.
- *
- * Runs the compiled shader over `count` consecutive vertices in batches
- * of up to MAX_TGSI_VERTICES.  `input` and `output` advance by their
- * strides after each batch.  Only constants[0] is passed to the generated
- * code; const_size is not used.
- */
-static void
-vs_sse_run_linear( struct draw_vertex_shader *base,
-                   const float (*input)[4],
-                   float (*output)[4],
-                   const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
-                   const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS],
-                   unsigned count,
-                   unsigned input_stride,
-                   unsigned output_stride )
-{
-   struct draw_sse_vertex_shader *sse = (struct draw_sse_vertex_shader *)base;
-   struct tgsi_exec_machine *mach = sse->machine;
-   unsigned int first;
-
-   /* All channels are enabled up front.  XXX set the mask inside the
-    * loop once shader conditionals/loops are supported.
-    */
-   tgsi_set_exec_mask(mach, 1, 1, 1, 1);
-
-   for (first = 0; first < count; first += MAX_TGSI_VERTICES) {
-      unsigned int batch = MIN2(MAX_TGSI_VERTICES, count - first);
-      const char *next_in;
-      char *next_out;
-
-      if (batch < 4) {
-         /* Partial batch: switch off the channels with no vertex. */
-         tgsi_set_exec_mask(mach,
-                            1,
-                            batch > 1,
-                            batch > 2,
-                            0);
-      }
-
-      /* Execute the generated code for this batch. */
-      sse->func(mach,
-                (const float (*)[4])constants[0],
-                sse->base.immediates,
-                input,
-                base->info.num_inputs,
-                input_stride,
-                output,
-                base->info.num_outputs,
-                output_stride );
-
-      next_in = (const char *)input + input_stride * batch;
-      next_out = (char *)output + output_stride * batch;
-
-      input = (const float (*)[4])next_in;
-      output = (float (*)[4])next_out;
-   }
-}
-
-
-
-
-/**
- * Destroy an SSE vertex shader: release the generated code, the aligned
- * immediates buffer, the private token copy, and finally the shader
- * object itself.
- */
-static void
-vs_sse_delete( struct draw_vertex_shader *base )
-{
-   struct draw_sse_vertex_shader *sse = (struct draw_sse_vertex_shader *)base;
-
-   x86_release_func( &sse->sse2_program );
-
-   /* The casts drop the const qualifier of the stored pointers. */
-   align_free( (void *) sse->base.immediates );
-   FREE( (void*) sse->base.state.tokens );
-
-   FREE( sse );
-}
-
-
-/**
- * Create a vertex shader backed by SSE2 code generated from the TGSI
- * tokens in `templ`.
- *
- * Returns NULL when the CPU lacks SSE2, when an allocation fails, or when
- * the shader cannot be translated.
- *
- * Every failure path releases exactly what has been acquired so far.
- * Previously the token copy and the immediates buffer leaked, the
- * immediates allocation was unchecked, and x86_release_func() could run
- * on a function that had never been initialised.
- */
-struct draw_vertex_shader *
-draw_create_vs_sse(struct draw_context *draw,
-                   const struct pipe_shader_state *templ)
-{
-   struct draw_sse_vertex_shader *vs;
-
-   if (!rtasm_cpu_has_sse2())
-      return NULL;
-
-   vs = CALLOC_STRUCT( draw_sse_vertex_shader );
-   if (vs == NULL)
-      return NULL;
-
-   /* we make a private copy of the tokens */
-   vs->base.state.tokens = tgsi_dup_tokens(templ->tokens);
-   if (!vs->base.state.tokens)
-      goto fail_free_vs;
-
-   tgsi_scan_shader(templ->tokens, &vs->base.info);
-
-   vs->base.draw = draw;
-   if (1)
-      vs->base.create_variant = draw_vs_create_variant_aos_sse;
-   else
-      vs->base.create_variant = draw_vs_create_variant_generic;
-   vs->base.prepare = vs_sse_prepare;
-   vs->base.run_linear = vs_sse_run_linear;
-   vs->base.delete = vs_sse_delete;
-
-   vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
-                                      sizeof(float), 16);
-   if (!vs->base.immediates)
-      goto fail_free_tokens;
-
-   vs->machine = draw->vs.machine;
-
-   x86_init_func( &vs->sse2_program );
-
-   if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens,
-                        &vs->sse2_program,
-                        (float (*)[4])vs->base.immediates,
-                        TRUE ))
-      goto fail_release_func;
-
-   vs->func = (tgsi_sse2_vs_func) x86_get_func( &vs->sse2_program );
-   if (!vs->func)
-      goto fail_release_func;
-
-   return &vs->base;
-
-fail_release_func:
-   if (0)
-      debug_warning("tgsi_emit_sse2() failed, falling back to interpreter\n");
-
-   x86_release_func( &vs->sse2_program );
-   align_free( (void *) vs->base.immediates );
-
-fail_free_tokens:
-   FREE( (void *) vs->base.state.tokens );
-
-fail_free_vs:
-   FREE(vs);
-   return NULL;
-}
-
-
-
-#else
-
-/* Non-x86 build: SSE shaders are unavailable, so creation always
- * returns a null pointer.
- */
-struct draw_vertex_shader *
-draw_create_vs_sse( struct draw_context *draw,
- const struct pipe_shader_state *templ )
-{
- return (void *) 0;
-}
-
-
-#endif
-
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
deleted file mode 100644
index 5614caf63e7..00000000000
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ /dev/null
@@ -1,3106 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * Copyright 2009-2010 VMware, Inc. All rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include "pipe/p_config.h"
-
-#include "tgsi/tgsi_sse2.h"
-
-#if defined(PIPE_ARCH_X86) && 0 /* See FIXME notes below */
-
-#include "util/u_debug.h"
-#include "pipe/p_shader_tokens.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#if defined(PIPE_ARCH_SSE)
-#include "util/u_sse.h"
-#endif
-#include "tgsi/tgsi_info.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_dump.h"
-#include "tgsi/tgsi_exec.h"
-
-#include "rtasm/rtasm_x86sse.h"
-
-/* for 1/sqrt()
- *
- * This costs about 100fps (close to 10%) in gears:
- */
-#define HIGH_PRECISION 1
-
-#define FAST_MATH 1
-
-
-#define FOR_EACH_CHANNEL( CHAN )\
- for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
-
-#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
- ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
-
-#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
- if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
-
-#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
- FOR_EACH_CHANNEL( CHAN )\
- IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
-
-#define CHAN_X 0
-#define CHAN_Y 1
-#define CHAN_Z 2
-#define CHAN_W 3
-
-#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
-#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
-
-#define TEMP_R0 TGSI_EXEC_TEMP_R0
-#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
-#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
-#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
-
-
-/**
- * X86 utility functions.
- */
-
-static struct x86_reg
-make_xmm(
- unsigned xmm )
-{
- return x86_make_reg(
- file_XMM,
- (enum x86_reg_name) xmm );
-}
-
-/**
- * X86 register mapping helpers.
- */
-
-static struct x86_reg
-get_const_base( void )
-{
- return x86_make_reg(
- file_REG32,
- reg_AX );
-}
-
-static struct x86_reg
-get_machine_base( void )
-{
- return x86_make_reg(
- file_REG32,
- reg_CX );
-}
-
-static struct x86_reg
-get_input_base( void )
-{
- /* FIXME: tgsi_exec_machine::Inputs is a pointer now! */
- return x86_make_disp(
- get_machine_base(),
- Offset(struct tgsi_exec_machine, Inputs) );
-}
-
-static struct x86_reg
-get_output_base( void )
-{
- /* FIXME: tgsi_exec_machine::Ouputs is a pointer now! */
- return x86_make_disp(
- get_machine_base(),
- Offset(struct tgsi_exec_machine, Outputs) );
-}
-
-static struct x86_reg
-get_temp_base( void )
-{
- return x86_make_disp(
- get_machine_base(),
- Offset(struct tgsi_exec_machine, Temps) );
-}
-
-static struct x86_reg
-get_coef_base( void )
-{
- return x86_make_reg(
- file_REG32,
- reg_BX );
-}
-
-static struct x86_reg
-get_sampler_base( void )
-{
- return x86_make_reg(
- file_REG32,
- reg_DI );
-}
-
-static struct x86_reg
-get_immediate_base( void )
-{
- return x86_make_reg(
- file_REG32,
- reg_DX );
-}
-
-static struct x86_reg
-get_system_value_base( void )
-{
- return x86_make_disp(
- get_machine_base(),
- Offset(struct tgsi_exec_machine, SystemValue) );
-}
-
-
-/**
- * Data access helpers.
- */
-
-
-static struct x86_reg
-get_immediate(
- unsigned vec,
- unsigned chan )
-{
- return x86_make_disp(
- get_immediate_base(),
- (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_const(
- unsigned vec,
- unsigned chan )
-{
- return x86_make_disp(
- get_const_base(),
- (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_sampler_ptr(
- unsigned unit )
-{
- return x86_make_disp(
- get_sampler_base(),
- unit * sizeof( struct tgsi_sampler * ) );
-}
-
-static struct x86_reg
-get_input(
- unsigned vec,
- unsigned chan )
-{
- return x86_make_disp(
- get_input_base(),
- (vec * 4 + chan) * 16 );
-}
-
-static struct x86_reg
-get_output(
- unsigned vec,
- unsigned chan )
-{
- return x86_make_disp(
- get_output_base(),
- (vec * 4 + chan) * 16 );
-}
-
-static struct x86_reg
-get_temp(
- unsigned vec,
- unsigned chan )
-{
- return x86_make_disp(
- get_temp_base(),
- (vec * 4 + chan) * 16 );
-}
-
-static struct x86_reg
-get_system_value(
- unsigned vec,
- unsigned chan )
-{
- return x86_make_disp(
- get_system_value_base(), /* base */
- (vec * 4 + chan) * 4 ); /* byte offset from base */
-}
-
-static struct x86_reg
-get_coef(
- unsigned vec,
- unsigned chan,
- unsigned member )
-{
- return x86_make_disp(
- get_coef_base(),
- ((vec * 3 + member) * 4 + chan) * 4 );
-}
-
-
-static void
-emit_ret(
- struct x86_function *func )
-{
- x86_ret( func );
-}
-
-
-/**
- * Data fetch helpers.
- */
-
-/**
- * Copy a shader constant to xmm register
- * \param xmm the destination xmm register
- * \param vec the src const buffer index
- * \param chan src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_const(
- struct x86_function *func,
- uint xmm,
- int vec,
- uint chan,
- uint indirect,
- uint indirectFile,
- int indirectIndex )
-{
- if (indirect) {
- /* 'vec' is the offset from the address register's value.
- * We're loading CONST[ADDR+vec] into an xmm register.
- */
- struct x86_reg r0 = get_immediate_base();
- struct x86_reg r1 = get_coef_base();
- uint i;
-
- assert( indirectFile == TGSI_FILE_ADDRESS );
- assert( indirectIndex == 0 );
- assert( r0.mod == mod_REG );
- assert( r1.mod == mod_REG );
-
- x86_push( func, r0 );
- x86_push( func, r1 );
-
- /*
- * Loop over the four pixels or vertices in the quad.
- * Get the value of the address (offset) register for pixel/vertex[i],
- * add it to the src offset and index into the constant buffer.
- * Note that we're working on SOA data.
- * If any of the pixel/vertex execution channels are unused their
- * values will be garbage. It's very important that we don't use
- * those garbage values as indexes into the constant buffer since
- * that'll cause segfaults.
- * The solution is to bitwise-AND the offset with the execution mask
- * register whose values are either 0 or ~0.
- * The caller must setup the execution mask register to indicate
- * which channels are valid/alive before running the shader.
- * The execution mask will also figure into loops and conditionals
- * someday.
- */
- for (i = 0; i < QUAD_SIZE; i++) {
- /* r1 = address register[i] */
- x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
- /* r0 = execution mask[i] */
- x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
- /* r1 = r1 & r0 */
- x86_and( func, r1, r0 );
- /* r0 = 'vec', the offset */
- x86_lea( func, r0, get_const( vec, chan ) );
-
- /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
- */
- x86_add( func, r1, r1 );
- x86_add( func, r1, r1 );
- x86_add( func, r1, r1 );
- x86_add( func, r1, r1 );
-
- x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
- x86_mov( func, r1, x86_deref( r0 ) );
- x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
- }
-
- x86_pop( func, r1 );
- x86_pop( func, r0 );
-
- sse_movaps(
- func,
- make_xmm( xmm ),
- get_temp( TEMP_R0, CHAN_X ) );
- }
- else {
- /* 'vec' is the index into the src register file, such as TEMP[vec] */
- assert( vec >= 0 );
-
- sse_movss(
- func,
- make_xmm( xmm ),
- get_const( vec, chan ) );
- sse_shufps(
- func,
- make_xmm( xmm ),
- make_xmm( xmm ),
- SHUF( 0, 0, 0, 0 ) );
- }
-}
-
-static void
-emit_immediate(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- sse_movss(
- func,
- make_xmm( xmm ),
- get_immediate( vec, chan ) );
- sse_shufps(
- func,
- make_xmm( xmm ),
- make_xmm( xmm ),
- SHUF( 0, 0, 0, 0 ) );
-}
-
-
-/**
- * Copy a shader input to xmm register
- * \param xmm the destination xmm register
- * \param vec the src input attrib
- * \param chan src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_inputf(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- sse_movups(
- func,
- make_xmm( xmm ),
- get_input( vec, chan ) );
-}
-
-/**
- * Store an xmm register to a shader output
- * \param xmm the source xmm register
- * \param vec the dest output attrib
- * \param chan src dest channel to store (X, Y, Z or W)
- */
-static void
-emit_output(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- sse_movups(
- func,
- get_output( vec, chan ),
- make_xmm( xmm ) );
-}
-
-/**
- * Copy a shader temporary to xmm register
- * \param xmm the destination xmm register
- * \param vec the src temp register
- * \param chan src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_tempf(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- sse_movaps(
- func,
- make_xmm( xmm ),
- get_temp( vec, chan ) );
-}
-
-/**
- * Copy a system value to xmm register
- * \param xmm the destination xmm register
- * \param vec the source system value register
- * \param chan src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_system_value(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- sse_movss(
- func,
- make_xmm( xmm ),
- get_system_value( vec, chan ) );
- sse_shufps(
- func,
- make_xmm( xmm ),
- make_xmm( xmm ),
- SHUF( 0, 0, 0, 0 ) );
-}
-
-/**
- * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
- * \param xmm the destination xmm register
- * \param vec the src input/attribute coefficient index
- * \param chan src channel to fetch (X, Y, Z or W)
- * \param member 0=a0, 1=dadx, 2=dady
- */
-static void
-emit_coef(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan,
- unsigned member )
-{
- sse_movss(
- func,
- make_xmm( xmm ),
- get_coef( vec, chan, member ) );
- sse_shufps(
- func,
- make_xmm( xmm ),
- make_xmm( xmm ),
- SHUF( 0, 0, 0, 0 ) );
-}
-
-/**
- * Data store helpers.
- */
-
-static void
-emit_inputs(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- sse_movups(
- func,
- get_input( vec, chan ),
- make_xmm( xmm ) );
-}
-
-static void
-emit_temps(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- sse_movaps(
- func,
- get_temp( vec, chan ),
- make_xmm( xmm ) );
-}
-
-static void
-emit_addrs(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- assert( vec == 0 );
-
- emit_temps(
- func,
- xmm,
- vec + TGSI_EXEC_TEMP_ADDR,
- chan );
-}
-
-/**
- * Coefficent fetch helpers.
- */
-
-static void
-emit_coef_a0(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- emit_coef(
- func,
- xmm,
- vec,
- chan,
- 0 );
-}
-
-static void
-emit_coef_dadx(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- emit_coef(
- func,
- xmm,
- vec,
- chan,
- 1 );
-}
-
-static void
-emit_coef_dady(
- struct x86_function *func,
- unsigned xmm,
- unsigned vec,
- unsigned chan )
-{
- emit_coef(
- func,
- xmm,
- vec,
- chan,
- 2 );
-}
-
-/**
- * Function call helpers.
- */
-
-/**
- * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
- * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
- * that the stack pointer is 16 byte aligned, as expected.
- */
-static void
-emit_func_call(
- struct x86_function *func,
- unsigned xmm_save_mask,
- const struct x86_reg *arg,
- unsigned nr_args,
- void (PIPE_CDECL *code)() )
-{
- struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
- unsigned i, n;
-
- x86_push(
- func,
- x86_make_reg( file_REG32, reg_AX) );
- x86_push(
- func,
- x86_make_reg( file_REG32, reg_CX) );
- x86_push(
- func,
- x86_make_reg( file_REG32, reg_DX) );
-
- /* Store XMM regs to the stack
- */
- for(i = 0, n = 0; i < 8; ++i)
- if(xmm_save_mask & (1 << i))
- ++n;
-
- x86_sub_imm(
- func,
- x86_make_reg( file_REG32, reg_SP ),
- n*16);
-
- for(i = 0, n = 0; i < 8; ++i)
- if(xmm_save_mask & (1 << i)) {
- sse_movups(
- func,
- x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
- make_xmm( i ) );
- ++n;
- }
-
- for (i = 0; i < nr_args; i++) {
- /* Load the address of the buffer we use for passing arguments and
- * receiving results:
- */
- x86_lea(
- func,
- ecx,
- arg[i] );
-
- /* Push actual function arguments (currently just the pointer to
- * the buffer above), and call the function:
- */
- x86_push( func, ecx );
- }
-
- x86_mov_reg_imm( func, ecx, (unsigned long) code );
- x86_call( func, ecx );
-
- /* Pop the arguments (or just add an immediate to esp)
- */
- for (i = 0; i < nr_args; i++) {
- x86_pop(func, ecx );
- }
-
- /* Pop the saved XMM regs:
- */
- for(i = 0, n = 0; i < 8; ++i)
- if(xmm_save_mask & (1 << i)) {
- sse_movups(
- func,
- make_xmm( i ),
- x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
- ++n;
- }
-
- x86_add_imm(
- func,
- x86_make_reg( file_REG32, reg_SP ),
- n*16);
-
- /* Restore GP registers in a reverse order.
- */
- x86_pop(
- func,
- x86_make_reg( file_REG32, reg_DX) );
- x86_pop(
- func,
- x86_make_reg( file_REG32, reg_CX) );
- x86_pop(
- func,
- x86_make_reg( file_REG32, reg_AX) );
-}
-
-static void
-emit_func_call_dst_src1(
- struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst,
- unsigned xmm_src0,
- void (PIPE_CDECL *code)() )
-{
- struct x86_reg store = get_temp( TEMP_R0, 0 );
- unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
-
- /* Store our input parameters (in xmm regs) to the buffer we use
- * for passing arguments. We will pass a pointer to this buffer as
- * the actual function argument.
- */
- sse_movaps(
- func,
- store,
- make_xmm( xmm_src0 ) );
-
- emit_func_call( func,
- xmm_mask,
- &store,
- 1,
- code );
-
- sse_movaps(
- func,
- make_xmm( xmm_dst ),
- store );
-}
-
-
-static void
-emit_func_call_dst_src2(
- struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst,
- unsigned xmm_src0,
- unsigned xmm_src1,
- void (PIPE_CDECL *code)() )
-{
- struct x86_reg store = get_temp( TEMP_R0, 0 );
- unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
-
- /* Store two inputs to parameter buffer.
- */
- sse_movaps(
- func,
- store,
- make_xmm( xmm_src0 ) );
-
- sse_movaps(
- func,
- x86_make_disp( store, 4 * sizeof(float) ),
- make_xmm( xmm_src1 ) );
-
-
- /* Emit the call
- */
- emit_func_call( func,
- xmm_mask,
- &store,
- 1,
- code );
-
- /* Retrieve the results:
- */
- sse_movaps(
- func,
- make_xmm( xmm_dst ),
- store );
-}
-
-
-
-
-
-#if defined(PIPE_ARCH_SSE)
-
-/*
- * Fast SSE2 implementation of special math functions.
- */
-
-#define POLY0(x, c0) _mm_set1_ps(c0)
-#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
-#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
-
-#define EXP_POLY_DEGREE 3
-#define LOG_POLY_DEGREE 5
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-static INLINE __m128
-exp2f4(__m128 x)
-{
- __m128i ipart;
- __m128 fpart, expipart, expfpart;
-
- x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
- x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
-
- /* ipart = int(x - 0.5) */
- ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
-
- /* fpart = x - ipart */
- fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
-
- /* expipart = (float) (1 << ipart) */
- expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
-
- /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
-#if EXP_POLY_DEGREE == 5
- expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
-#elif EXP_POLY_DEGREE == 4
- expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
-#elif EXP_POLY_DEGREE == 3
- expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
-#elif EXP_POLY_DEGREE == 2
- expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
-#else
-#error
-#endif
-
- return _mm_mul_ps(expipart, expfpart);
-}
-
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-static INLINE __m128
-log2f4(__m128 x)
-{
- __m128i expmask = _mm_set1_epi32(0x7f800000);
- __m128i mantmask = _mm_set1_epi32(0x007fffff);
- __m128 one = _mm_set1_ps(1.0f);
-
- __m128i i = _mm_castps_si128(x);
-
- /* exp = (float) exponent(x) */
- __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
-
- /* mant = (float) mantissa(x) */
- __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
-
- __m128 logmant;
-
- /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
- * These coefficients can be generate with
- * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
- */
-#if LOG_POLY_DEGREE == 6
- logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
-#elif LOG_POLY_DEGREE == 5
- logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
-#elif LOG_POLY_DEGREE == 4
- logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
-#elif LOG_POLY_DEGREE == 3
- logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
-#else
-#error
-#endif
-
- /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
- logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
-
- return _mm_add_ps(logmant, exp);
-}
-
-
-static INLINE __m128
-powf4(__m128 x, __m128 y)
-{
- return exp2f4(_mm_mul_ps(log2f4(x), y));
-}
-
-#endif /* PIPE_ARCH_SSE */
-
-
-
-/**
- * Low-level instruction translators.
- */
-
-static void
-emit_abs(
- struct x86_function *func,
- unsigned xmm )
-{
- sse_andps(
- func,
- make_xmm( xmm ),
- get_temp(
- TGSI_EXEC_TEMP_7FFFFFFF_I,
- TGSI_EXEC_TEMP_7FFFFFFF_C ) );
-}
-
-static void
-emit_add(
- struct x86_function *func,
- unsigned xmm_dst,
- unsigned xmm_src )
-{
- sse_addps(
- func,
- make_xmm( xmm_dst ),
- make_xmm( xmm_src ) );
-}
-
-static void PIPE_CDECL
-cos4f(
- float *store )
-{
- store[0] = cosf( store[0] );
- store[1] = cosf( store[1] );
- store[2] = cosf( store[2] );
- store[3] = cosf( store[3] );
-}
-
-static void
-emit_cos(
- struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst )
-{
- emit_func_call_dst_src1(
- func,
- xmm_save,
- xmm_dst,
- xmm_dst,
- cos4f );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
-__attribute__((force_align_arg_pointer))
-#endif
-ex24f(
- float *store )
-{
-#if defined(PIPE_ARCH_SSE)
- _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
-#else
- store[0] = util_fast_exp2( store[0] );
- store[1] = util_fast_exp2( store[1] );
- store[2] = util_fast_exp2( store[2] );
- store[3] = util_fast_exp2( store[3] );
-#endif
-}
-
-static void
-emit_ex2(
- struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst )
-{
- emit_func_call_dst_src1(
- func,
- xmm_save,
- xmm_dst,
- xmm_dst,
- ex24f );
-}
-
-static void
-emit_f2it(
- struct x86_function *func,
- unsigned xmm )
-{
- sse2_cvttps2dq(
- func,
- make_xmm( xmm ),
- make_xmm( xmm ) );
-}
-
-static void
-emit_i2f(
- struct x86_function *func,
- unsigned xmm )
-{
- sse2_cvtdq2ps(
- func,
- make_xmm( xmm ),
- make_xmm( xmm ) );
-}
-
-static void PIPE_CDECL
-flr4f(
- float *store )
-{
- store[0] = floorf( store[0] );
- store[1] = floorf( store[1] );
- store[2] = floorf( store[2] );
- store[3] = floorf( store[3] );
-}
-
-static void
-emit_flr(
- struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst )
-{
- emit_func_call_dst_src1(
- func,
- xmm_save,
- xmm_dst,
- xmm_dst,
- flr4f );
-}
-
-static void PIPE_CDECL
-frc4f(
- float *store )
-{
- store[0] -= floorf( store[0] );
- store[1] -= floorf( store[1] );
- store[2] -= floorf( store[2] );
- store[3] -= floorf( store[3] );
-}
-
-static void
-emit_frc(
- struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst )
-{
- emit_func_call_dst_src1(
- func,
- xmm_save,
- xmm_dst,
- xmm_dst,
- frc4f );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
-__attribute__((force_align_arg_pointer))
-#endif
-lg24f(
- float *store )
-{
-#if defined(PIPE_ARCH_SSE)
- _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
-#else
- store[0] = util_fast_log2( store[0] );
- store[1] = util_fast_log2( store[1] );
- store[2] = util_fast_log2( store[2] );
- store[3] = util_fast_log2( store[3] );
-#endif
-}
-
-static void
-emit_lg2(
- struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst )
-{
- emit_func_call_dst_src1(
- func,
- xmm_save,
- xmm_dst,
- xmm_dst,
- lg24f );
-}
-
-static void
-emit_MOV(
- struct x86_function *func,
- unsigned xmm_dst,
- unsigned xmm_src )
-{
- sse_movups(
- func,
- make_xmm( xmm_dst ),
- make_xmm( xmm_src ) );
-}
-
-static void
-emit_mul (struct x86_function *func,
- unsigned xmm_dst,
- unsigned xmm_src)
-{
- sse_mulps(
- func,
- make_xmm( xmm_dst ),
- make_xmm( xmm_src ) );
-}
-
-static void
-emit_neg(
- struct x86_function *func,
- unsigned xmm )
-{
- sse_xorps(
- func,
- make_xmm( xmm ),
- get_temp(
- TGSI_EXEC_TEMP_80000000_I,
- TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
-__attribute__((force_align_arg_pointer))
-#endif
-pow4f(
- float *store )
-{
-#if defined(PIPE_ARCH_SSE)
- _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
-#else
- store[0] = util_fast_pow( store[0], store[4] );
- store[1] = util_fast_pow( store[1], store[5] );
- store[2] = util_fast_pow( store[2], store[6] );
- store[3] = util_fast_pow( store[3], store[7] );
-#endif
-}
-
-static void
-emit_pow(
- struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst,
- unsigned xmm_src0,
- unsigned xmm_src1 )
-{
- emit_func_call_dst_src2(
- func,
- xmm_save,
- xmm_dst,
- xmm_src0,
- xmm_src1,
- pow4f );
-}
-
-static void
-emit_rcp (
- struct x86_function *func,
- unsigned xmm_dst,
- unsigned xmm_src )
-{
- /* On Intel CPUs at least, this is only accurate to 12 bits -- not
- * good enough. Need to either emit a proper divide or use the
- * iterative technique described below in emit_rsqrt().
- */
- sse2_rcpps(
- func,
- make_xmm( xmm_dst ),
- make_xmm( xmm_src ) );
-}
-
-static void PIPE_CDECL
-rnd4f(
- float *store )
-{
- store[0] = floorf( store[0] + 0.5f );
- store[1] = floorf( store[1] + 0.5f );
- store[2] = floorf( store[2] + 0.5f );
- store[3] = floorf( store[3] + 0.5f );
-}
-
-static void
-emit_rnd(
- struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst )
-{
- emit_func_call_dst_src1(
- func,
- xmm_save,
- xmm_dst,
- xmm_dst,
- rnd4f );
-}
-
-static void
-emit_rsqrt(
- struct x86_function *func,
- unsigned xmm_dst,
- unsigned xmm_src )
-{
-#if HIGH_PRECISION
- /* Although rsqrtps() and rcpps() are low precision on some/all SSE
- * implementations, it is possible to improve its precision at
- * fairly low cost, using a newton/raphson step, as below:
- *
- * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
- * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
- *
- * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
- */
- {
- struct x86_reg dst = make_xmm( xmm_dst );
- struct x86_reg src = make_xmm( xmm_src );
- struct x86_reg tmp0 = make_xmm( 2 );
- struct x86_reg tmp1 = make_xmm( 3 );
-
- assert( xmm_dst != xmm_src );
- assert( xmm_dst != 2 && xmm_dst != 3 );
- assert( xmm_src != 2 && xmm_src != 3 );
-
- sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
- sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
- sse_rsqrtps( func, tmp1, src );
- sse_mulps( func, src, tmp1 );
- sse_mulps( func, dst, tmp1 );
- sse_mulps( func, src, tmp1 );
- sse_subps( func, tmp0, src );
- sse_mulps( func, dst, tmp0 );
- }
-#else
- /* On Intel CPUs at least, this is only accurate to 12 bits -- not
- * good enough.
- */
- sse_rsqrtps(
- func,
- make_xmm( xmm_dst ),
- make_xmm( xmm_src ) );
-#endif
-}
-
-static void
-emit_setsign(
- struct x86_function *func,
- unsigned xmm )
-{
- sse_orps(
- func,
- make_xmm( xmm ),
- get_temp(
- TGSI_EXEC_TEMP_80000000_I,
- TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-sgn4f(
- float *store )
-{
- store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
- store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
- store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
- store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
-}
-
-static void
-emit_sgn(
- struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst )
-{
- emit_func_call_dst_src1(
- func,
- xmm_save,
- xmm_dst,
- xmm_dst,
- sgn4f );
-}
-
-static void PIPE_CDECL
-sin4f(
- float *store )
-{
- store[0] = sinf( store[0] );
- store[1] = sinf( store[1] );
- store[2] = sinf( store[2] );
- store[3] = sinf( store[3] );
-}
-
-static void
-emit_sin (struct x86_function *func,
- unsigned xmm_save,
- unsigned xmm_dst)
-{
- emit_func_call_dst_src1(
- func,
- xmm_save,
- xmm_dst,
- xmm_dst,
- sin4f );
-}
-
-static void
-emit_sub(
- struct x86_function *func,
- unsigned xmm_dst,
- unsigned xmm_src )
-{
- sse_subps(
- func,
- make_xmm( xmm_dst ),
- make_xmm( xmm_src ) );
-}
-
-/**
- * Register fetch.
- */
-static void
-emit_fetch(
- struct x86_function *func,
- unsigned xmm,
- const struct tgsi_full_src_register *reg,
- const unsigned chan_index )
-{
- unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
-
- switch (swizzle) {
- case TGSI_SWIZZLE_X:
- case TGSI_SWIZZLE_Y:
- case TGSI_SWIZZLE_Z:
- case TGSI_SWIZZLE_W:
- switch (reg->Register.File) {
- case TGSI_FILE_CONSTANT:
- emit_const(
- func,
- xmm,
- reg->Register.Index,
- swizzle,
- reg->Register.Indirect,
- reg->Indirect.File,
- reg->Indirect.Index );
- break;
-
- case TGSI_FILE_IMMEDIATE:
- emit_immediate(
- func,
- xmm,
- reg->Register.Index,
- swizzle );
- break;
-
- case TGSI_FILE_SYSTEM_VALUE:
- emit_system_value(
- func,
- xmm,
- reg->Register.Index,
- swizzle );
- break;
-
- case TGSI_FILE_INPUT:
- emit_inputf(
- func,
- xmm,
- reg->Register.Index,
- swizzle );
- break;
-
- case TGSI_FILE_TEMPORARY:
- emit_tempf(
- func,
- xmm,
- reg->Register.Index,
- swizzle );
- break;
-
- default:
- assert( 0 );
- }
- break;
-
- default:
- assert( 0 );
- }
-
- switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
- case TGSI_UTIL_SIGN_CLEAR:
- emit_abs( func, xmm );
- break;
-
- case TGSI_UTIL_SIGN_SET:
- emit_setsign( func, xmm );
- break;
-
- case TGSI_UTIL_SIGN_TOGGLE:
- emit_neg( func, xmm );
- break;
-
- case TGSI_UTIL_SIGN_KEEP:
- break;
- }
-}
-
-#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
- emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
-
-/**
- * Register store.
- */
-static void
-emit_store(
- struct x86_function *func,
- unsigned xmm,
- const struct tgsi_full_dst_register *reg,
- const struct tgsi_full_instruction *inst,
- unsigned chan_index )
-{
- switch( inst->Instruction.Saturate ) {
- case TGSI_SAT_NONE:
- break;
-
- case TGSI_SAT_ZERO_ONE:
- sse_maxps(
- func,
- make_xmm( xmm ),
- get_temp(
- TGSI_EXEC_TEMP_00000000_I,
- TGSI_EXEC_TEMP_00000000_C ) );
-
- sse_minps(
- func,
- make_xmm( xmm ),
- get_temp(
- TGSI_EXEC_TEMP_ONE_I,
- TGSI_EXEC_TEMP_ONE_C ) );
- break;
-
- case TGSI_SAT_MINUS_PLUS_ONE:
- assert( 0 );
- break;
- }
-
-
- switch( reg->Register.File ) {
- case TGSI_FILE_OUTPUT:
- emit_output(
- func,
- xmm,
- reg->Register.Index,
- chan_index );
- break;
-
- case TGSI_FILE_TEMPORARY:
- emit_temps(
- func,
- xmm,
- reg->Register.Index,
- chan_index );
- break;
-
- case TGSI_FILE_ADDRESS:
- emit_addrs(
- func,
- xmm,
- reg->Register.Index,
- chan_index );
- break;
-
- default:
- assert( 0 );
- }
-}
-
-#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
- emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )
-
-
-static void PIPE_CDECL
-fetch_texel( struct tgsi_sampler **sampler,
- float *store )
-{
-#if 0
- uint j;
-
- debug_printf("%s sampler: %p (%p) store: %p\n",
- __FUNCTION__,
- sampler, *sampler,
- store );
-
- for (j = 0; j < 4; j++)
- debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
- j,
- store[0+j],
- store[4+j],
- store[8 + j],
- store[12 + j]);
-#endif
-
- {
- float rgba[NUM_CHANNELS][QUAD_SIZE];
- (*sampler)->get_samples(*sampler,
- &store[0], /* s */
- &store[4], /* t */
- &store[8], /* r */
- &store[12], /* lodbias */
- tgsi_sampler_lod_bias,
- rgba); /* results */
-
- memcpy( store, rgba, 16 * sizeof(float));
- }
-
-#if 0
- for (j = 0; j < 4; j++)
- debug_printf("sample %d result %f %f %f %f\n",
- j,
- store[0+j],
- store[4+j],
- store[8+j],
- store[12+j]);
-#endif
-}
-
-/**
- * High-level instruction translators.
- */
-/**
- * Emit code for a texture sampling instruction.
- *
- * The texture coordinates and the lod bias are staged in the TEMP_R0
- * scratch vector, fetch_texel() is called with a pointer to that buffer
- * and the sampler pointer, and the enabled destination channels are then
- * copied back out of TEMP_R0.
- *
- * \param func       the x86 function being assembled
- * \param inst       the texture instruction
- * \param lodbias    if set, the bias is fetched from src[0].w, otherwise
- *                   the all-zero constant is used
- * \param projected  if set, every coordinate is multiplied by 1 / src[0].w
- */
-static void
-emit_tex( struct x86_function *func,
-          const struct tgsi_full_instruction *inst,
-          boolean lodbias,
-          boolean projected)
-{
-   struct x86_reg call_args[2];
-   unsigned num_coords;
-   unsigned chan;
-
-   assert(inst->Instruction.Texture);
-
-   /* How many coordinate channels does this texture target consume? */
-   switch (inst->Texture.Texture) {
-   case TGSI_TEXTURE_1D:
-      num_coords = 1;
-      break;
-
-   case TGSI_TEXTURE_2D:
-   case TGSI_TEXTURE_RECT:
-   case TGSI_TEXTURE_1D_ARRAY:
-      num_coords = 2;
-      break;
-
-   case TGSI_TEXTURE_SHADOW1D:
-   case TGSI_TEXTURE_SHADOW2D:
-   case TGSI_TEXTURE_SHADOWRECT:
-   case TGSI_TEXTURE_3D:
-   case TGSI_TEXTURE_CUBE:
-   case TGSI_TEXTURE_2D_ARRAY:
-   case TGSI_TEXTURE_SHADOW1D_ARRAY:
-      num_coords = 3;
-      break;
-
-   case TGSI_TEXTURE_SHADOW2D_ARRAY:
-      num_coords = 4;
-      break;
-
-   default:
-      assert(0);
-      return;
-   }
-
-   /* xmm3 = lod bias, or zero when no bias was requested */
-   if (!lodbias) {
-      emit_tempf( func, 3, TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C );
-   }
-   else {
-      FETCH( func, *inst, 3, 0, 3 );
-   }
-
-   /* The bias slot is always written because fetch_texel currently
-    * honours it unconditionally.
-    */
-   sse_movaps( func, get_temp( TEMP_R0, 3 ), make_xmm( 3 ) );
-
-   /* xmm3 = 1 / src[0].w, used to scale the coordinates below */
-   if (projected) {
-      FETCH( func, *inst, 3, 0, 3 );
-      emit_rcp( func, 3, 3 );
-   }
-
-   /* Fetch each coordinate, optionally project it, and place it in the
-    * argument buffer.
-    */
-   for (chan = 0; chan < num_coords; chan++) {
-      FETCH( func, *inst, chan, 0, chan );
-
-      if (projected) {
-         sse_mulps( func, make_xmm( chan ), make_xmm( 3 ) );
-      }
-
-      sse_movaps( func, get_temp( TEMP_R0, chan ), make_xmm( chan ) );
-   }
-
-   call_args[0] = get_temp( TEMP_R0, 0 );
-   call_args[1] = get_sampler_ptr( inst->Src[1].Register.Index );
-
-   emit_func_call( func, 0, call_args, Elements(call_args), fetch_texel );
-
-   /* Possible optimisation: when all four channels are written, a pointer
-    * to dst[0].x might be usable instead of TEMP_R0 for the result.
-    */
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan ) {
-      sse_movaps( func, make_xmm( 0 ), get_temp( TEMP_R0, chan ) );
-      STORE( func, *inst, 0, 0, chan );
-   }
-}
-
-
-/**
- * Emit code for the conditional kill instruction.
- *
- * Every distinct swizzled component of \p reg is compared against zero;
- * the resulting "less than zero" sign masks are OR'ed together and then
- * OR'ed into the kill-mask temporary.  EAX and EDX are used as scratch
- * and are saved/restored around the sequence.
- *
- * \param func  the x86 function being assembled
- * \param reg   the source operand to test
- */
-static void
-emit_kil(
-   struct x86_function *func,
-   const struct tgsi_full_src_register *reg )
-{
-   const struct x86_reg eax = x86_make_reg( file_REG32, reg_AX );
-   const struct x86_reg edx = x86_make_reg( file_REG32, reg_DX );
-   unsigned tested = 0;      /* bit per source component already fetched */
-   unsigned num_fetched = 0; /* number of xmm registers holding components */
-   unsigned chan_index;
-   unsigned n;
-
-   /* Fetch each distinct component once.  Only "value < 0" is tested,
-    * so the constants 1.0 and 0.0 would never need testing.
-    */
-   FOR_EACH_CHANNEL( chan_index ) {
-      /* resolve the swizzle for this channel */
-      const unsigned swizzle =
-         tgsi_util_get_full_src_register_swizzle( reg, chan_index );
-      const unsigned bit = 1 << swizzle;
-
-      if( (tested & bit) == 0 ) {
-         tested |= bit;
-
-         /* next free xmm register receives this component */
-         emit_fetch( func, num_fetched, reg, chan_index );
-         num_fetched++;
-      }
-   }
-
-   x86_push( func, eax );
-   x86_push( func, edx );
-
-   for (n = 0; n < num_fetched; n++) {
-      const struct x86_reg data = make_xmm( n );
-
-      sse_cmpps(
-         func,
-         data,
-         get_temp( TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C ),
-         cc_LessThan );
-
-      /* The first mask lands directly in EAX; later ones go through EDX
-       * and are accumulated into EAX.
-       */
-      sse_movmskps( func, (n == 0) ? eax : edx, data );
-      if( n != 0 ) {
-         x86_or( func, eax, edx );
-      }
-   }
-
-   x86_or(
-      func,
-      get_temp( TGSI_EXEC_TEMP_KILMASK_I, TGSI_EXEC_TEMP_KILMASK_C ),
-      eax );
-
-   x86_pop( func, edx );
-   x86_pop( func, eax );
-}
-
-
-/**
- * Placeholder for the predicated kill instruction.
- *
- * Not implemented: the body is empty, so no code is appended to \p func.
- */
-static void
-emit_kilp(
- struct x86_function *func )
-{
- /* XXX todo / fix me */
-}
-
-
-/**
- * Emit a "set on condition" instruction (SLT, SGE, SEQ, ...).
- *
- * For each enabled destination channel the two sources are compared with
- * condition \p cc and the resulting mask is AND'ed with the TEMP_ONE
- * constant before being stored.
- *
- * \param func  the x86 function being assembled
- * \param inst  the instruction being translated
- * \param cc    SSE comparison predicate to apply
- */
-static void
-emit_setcc(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst,
-   enum sse_cc cc )
-{
-   unsigned chan;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan ) {
-      /* xmm0 = src[0].chan, xmm1 = src[1].chan */
-      FETCH( func, *inst, 0, 0, chan );
-      FETCH( func, *inst, 1, 1, chan );
-
-      /* xmm0 = comparison mask */
-      sse_cmpps( func, make_xmm( 0 ), make_xmm( 1 ), cc );
-
-      /* xmm0 = mask & ONE */
-      sse_andps( func, make_xmm( 0 ), get_temp( TEMP_ONE_I, TEMP_ONE_C ) );
-
-      STORE( func, *inst, 0, 0, chan );
-   }
-}
-
-/**
- * Emit the CMP instruction: per enabled channel,
- * dst = (src[0] < 0) ? src[1] : src[2], implemented with a bit mask
- * select.
- *
- * \param func  the x86 function being assembled
- * \param inst  the instruction being translated
- */
-static void
-emit_cmp(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst )
-{
-   unsigned chan;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan ) {
-      FETCH( func, *inst, 0, 0, chan );
-      FETCH( func, *inst, 1, 1, chan );
-      FETCH( func, *inst, 2, 2, chan );
-
-      /* xmm0 = mask of lanes where src[0] < 0 */
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         get_temp( TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C ),
-         cc_LessThan );
-
-      /* xmm1 = src[1] & mask */
-      sse_andps( func, make_xmm( 1 ), make_xmm( 0 ) );
-      /* xmm0 = src[2] & ~mask */
-      sse_andnps( func, make_xmm( 0 ), make_xmm( 2 ) );
-      /* xmm0 = combined selection */
-      sse_orps( func, make_xmm( 0 ), make_xmm( 1 ) );
-
-      STORE( func, *inst, 0, 0, chan );
-   }
-}
-
-
-/**
- * Report whether any source or destination operand of \p inst is
- * indirectly addressed in the temporary, input or output register file.
- *
- * \return TRUE if such an operand exists, FALSE otherwise
- */
-static boolean
-indirect_reg_reference(const struct tgsi_full_instruction *inst)
-{
-   uint n;
-
-   /* source operands */
-   for (n = 0; n < inst->Instruction.NumSrcRegs; n++) {
-      const struct tgsi_full_src_register *src = &inst->Src[n];
-
-      if (!src->Register.Indirect)
-         continue;
-
-      switch (src->Register.File) {
-      case TGSI_FILE_TEMPORARY:
-      case TGSI_FILE_INPUT:
-      case TGSI_FILE_OUTPUT:
-         return TRUE;
-      default:
-         break;
-      }
-   }
-
-   /* destination operands */
-   for (n = 0; n < inst->Instruction.NumDstRegs; n++) {
-      const struct tgsi_full_dst_register *dst = &inst->Dst[n];
-
-      if (!dst->Register.Indirect)
-         continue;
-
-      switch (dst->Register.File) {
-      case TGSI_FILE_TEMPORARY:
-      case TGSI_FILE_INPUT:
-      case TGSI_FILE_OUTPUT:
-         return TRUE;
-      default:
-         break;
-      }
-   }
-
-   return FALSE;
-}
-
-
-/**
- * Emit SSE code for a single TGSI instruction.
- *
- * Conventions used below (as the inline register comments show):
- * FETCH( func, *inst, x, s, c ) loads channel c of source operand s into
- * xmm[x], and STORE( func, *inst, x, d, c ) writes xmm[x] to channel c of
- * destination operand d.
- *
- * \param func  the x86 function being assembled
- * \param inst  the instruction to translate
- * \return 1 if the instruction was handled, 0 (FALSE) if the opcode or
- *         its addressing mode is not supported by this code generator
- */
-static int
-emit_instruction(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst )
-{
-   unsigned chan_index;
-
-   /* we can't handle indirect addressing into temp register file yet */
-   if (indirect_reg_reference(inst))
-      return FALSE;
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ARL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr(func, 0, 0);
-         emit_f2it( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MOV:
-      /* fetch everything first, then store, so that src/dst aliasing
-       * cannot clobber a channel that is still to be read
-       */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 4 + chan_index, 0, chan_index );
-      }
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 4 + chan_index, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LIT:
-      /* dst.x = dst.w = 1.0 */
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
-            STORE( func, *inst, 0, 0, CHAN_X );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-            STORE( func, *inst, 0, 0, CHAN_W );
-         }
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         /* dst.y = max(src.x, 0) */
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_maxps(
-               func,
-               make_xmm( 0 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            STORE( func, *inst, 0, 0, CHAN_Y );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-            /* XMM[1] = SrcReg[0].yyyy */
-            FETCH( func, *inst, 1, 0, CHAN_Y );
-            /* XMM[1] = max(XMM[1], 0) */
-            sse_maxps(
-               func,
-               make_xmm( 1 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            /* XMM[2] = SrcReg[0].wwww */
-            FETCH( func, *inst, 2, 0, CHAN_W );
-            /* XMM[2] = min(XMM[2], 128.0) */
-            sse_minps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_128_I,
-                  TGSI_EXEC_TEMP_128_C ) );
-            /* XMM[2] = max(XMM[2], -128.0) */
-            sse_maxps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_MINUS_128_I,
-                  TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 3, 1, 1, 2 );
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            /* XMM[2] = (0 < src.x) ? XMM[1] : 0 */
-            sse_xorps( func, make_xmm( 2 ), make_xmm( 2 ) );
-            sse_cmpps( func, make_xmm( 2 ), make_xmm( 0 ), cc_LessThan );
-            sse_andps( func, make_xmm( 2 ), make_xmm( 1 ) );
-            STORE( func, *inst, 2, 0, CHAN_Z );
-         }
-      }
-      break;
-
-   case TGSI_OPCODE_RCP:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rcp( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RSQ:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_abs( func, 0 );
-      emit_rsqrt( func, 1, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 1, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_EXP:
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_MOV( func, 1, 0 );
-            emit_flr( func, 2, 1 );
-            /* dst.x = ex2(floor(src.x)) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
-               emit_MOV( func, 2, 1 );
-               emit_ex2( func, 3, 2 );
-               STORE( func, *inst, 2, 0, CHAN_X );
-            }
-            /* dst.y = src.x - floor(src.x) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_MOV( func, 2, 0 );
-               emit_sub( func, 2, 1 );
-               STORE( func, *inst, 2, 0, CHAN_Y );
-            }
-         }
-         /* dst.z = ex2(src.x) */
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            emit_ex2( func, 3, 0 );
-            STORE( func, *inst, 0, 0, CHAN_Z );
-         }
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_LOG:
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_abs( func, 0 );
-         emit_MOV( func, 1, 0 );
-         emit_lg2( func, 2, 1 );
-         /* dst.z = lg2(abs(src.x)) */
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            STORE( func, *inst, 1, 0, CHAN_Z );
-         }
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_flr( func, 2, 1 );
-            /* dst.x = floor(lg2(abs(src.x))) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
-               STORE( func, *inst, 1, 0, CHAN_X );
-            }
-            /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_ex2( func, 2, 1 );
-               emit_rcp( func, 1, 1 );
-               emit_mul( func, 0, 1 );
-               STORE( func, *inst, 0, 0, CHAN_Y );
-            }
-         }
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MUL:
-      /* do all fetches and multiplies, storing results in temp regs */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         int r = chan_index + 1;
-         FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
-         FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
-         emit_mul( func, r, 0 );          /* xmm[r] = xmm[r] * xmm[0] */
-      }
-      /* do all stores of the temp regs */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         int r = chan_index + 1;
-         STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
-      }
-      break;
-
-   case TGSI_OPCODE_ADD:
-      /* do all fetches and adds, storing results in temp regs */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         int r = chan_index + 1;
-         FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
-         FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
-         emit_add( func, r, 0 );          /* xmm[r] = xmm[r] + xmm[0] */
-      }
-      /* do all stores of the temp regs */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         int r = chan_index + 1;
-         STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
-      }
-      break;
-
-   case TGSI_OPCODE_DP3:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP4:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul(func, 1, 2 );
-      emit_add(func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_W );
-      FETCH( func, *inst, 2, 1, CHAN_W );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DST:
-      /* dst.x = 1.0 */
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      /* dst.y = src[0].y * src[1].y */
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 1, 1, CHAN_Y );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      /* dst.z = src[0].z */
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         FETCH( func, *inst, 0, 0, CHAN_Z );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      /* dst.w = src[1].w */
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-         FETCH( func, *inst, 0, 1, CHAN_W );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MIN:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_minps( func, make_xmm( 0 ), make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MAX:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_maxps( func, make_xmm( 0 ), make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLT:
-      emit_setcc( func, inst, cc_LessThan );
-      break;
-
-   case TGSI_OPCODE_SGE:
-      emit_setcc( func, inst, cc_NotLessThan );
-      break;
-
-   case TGSI_OPCODE_MAD:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SUB:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_sub( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LRP:
-      /* dst = src0 * (src1 - src2) + src2 */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_sub( func, 1, 2 );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP2A:
-      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
-      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
-      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
-      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
-      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
-      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
-      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
-      FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
-      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
-      }
-      break;
-
-   case TGSI_OPCODE_FRC:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_FLR:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ROUND:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_rnd( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_EX2:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LG2:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_POW:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 0, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_XPD:
-      /* Note: we do all stores after all operands have been fetched
-       * to avoid src/dst register aliasing issues for an instruction
-       * such as:  XPD TEMP[2].xyz, TEMP[0], TEMP[2];
-       */
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
-         FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
-         FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_MOV( func, 7, 0 );  /* xmm[7] = xmm[0] */
-         emit_mul( func, 7, 1 );  /* xmm[7] = xmm[7] * xmm[1] */
-         emit_MOV( func, 5, 3 );  /* xmm[5] = xmm[3] */
-         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
-         emit_sub( func, 7, 5 );  /* xmm[7] = xmm[7] - xmm[5] */
-         /* store xmm[7] in dst.x below */
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
-         FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         emit_mul( func, 3, 2 );  /* xmm[3] = xmm[3] * xmm[2] */
-         emit_mul( func, 1, 5 );  /* xmm[1] = xmm[1] * xmm[5] */
-         emit_sub( func, 3, 1 );  /* xmm[3] = xmm[3] - xmm[1] */
-         /* store xmm[3] in dst.y below */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
-         emit_mul( func, 0, 2 );  /* xmm[0] = xmm[0] * xmm[2] */
-         emit_sub( func, 5, 0 );  /* xmm[5] = xmm[5] - xmm[0] */
-         STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
-      }
-      /* dst.w = 1.0 */
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_ABS:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_abs( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DPH:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 1, CHAN_W );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_COS:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_KILP:
-      /* predicated kill */
-      emit_kilp( func );
-      return 0; /* XXX fix me */
-
-   case TGSI_OPCODE_KIL:
-      /* conditional kill */
-      emit_kil( func, &inst->Src[0] );
-      break;
-
-   case TGSI_OPCODE_SEQ:
-      emit_setcc( func, inst, cc_Equal );
-      break;
-
-   case TGSI_OPCODE_SGT:
-      emit_setcc( func, inst, cc_NotLessThanEqual );
-      break;
-
-   case TGSI_OPCODE_SIN:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLE:
-      emit_setcc( func, inst, cc_LessThanEqual );
-      break;
-
-   case TGSI_OPCODE_SNE:
-      emit_setcc( func, inst, cc_NotEqual );
-      break;
-
-   case TGSI_OPCODE_TEX:
-      emit_tex( func, inst, FALSE, FALSE );
-      break;
-
-   case TGSI_OPCODE_ARR:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_rnd( func, 0, 0 );
-         emit_f2it( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RET:
-      emit_ret( func );
-      break;
-
-   case TGSI_OPCODE_END:
-      break;
-
-   case TGSI_OPCODE_SSG:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_sgn( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CMP:
-      emit_cmp (func, inst);
-      break;
-
-   case TGSI_OPCODE_SCS:
-      /* dst.x = cos(src.x) */
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0, 0 );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      /* dst.y = sin(src.x) */
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0, 0 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      /* dst.z = 0.0 */
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         emit_tempf(
-            func,
-            0,
-            TGSI_EXEC_TEMP_00000000_I,
-            TGSI_EXEC_TEMP_00000000_C );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      /* dst.w = 1.0 */
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_TXB:
-      emit_tex( func, inst, TRUE, FALSE );
-      break;
-
-   case TGSI_OPCODE_NRM:
-      /* fall-through */
-   case TGSI_OPCODE_NRM4:
-      /* 3 or 4-component normalization */
-      {
-         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
-
-         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
-             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
-             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
-             (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
-
-            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
-
-            /* xmm4 = src.x */
-            /* xmm0 = src.x * src.x */
-            FETCH(func, *inst, 0, 0, CHAN_X);
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
-               emit_MOV(func, 4, 0);
-            }
-            emit_mul(func, 0, 0);
-
-            /* xmm5 = src.y */
-            /* xmm0 = xmm0 + src.y * src.y */
-            FETCH(func, *inst, 1, 0, CHAN_Y);
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
-               emit_MOV(func, 5, 1);
-            }
-            emit_mul(func, 1, 1);
-            emit_add(func, 0, 1);
-
-            /* xmm6 = src.z */
-            /* xmm0 = xmm0 + src.z * src.z */
-            FETCH(func, *inst, 1, 0, CHAN_Z);
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
-               emit_MOV(func, 6, 1);
-            }
-            emit_mul(func, 1, 1);
-            emit_add(func, 0, 1);
-
-            if (dims == 4) {
-               /* xmm7 = src.w */
-               /* xmm0 = xmm0 + src.w * src.w */
-               FETCH(func, *inst, 1, 0, CHAN_W);
-               if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
-                  emit_MOV(func, 7, 1);
-               }
-               emit_mul(func, 1, 1);
-               emit_add(func, 0, 1);
-            }
-
-            /* xmm1 = 1 / sqrt(xmm0) */
-            emit_rsqrt(func, 1, 0);
-
-            /* dst.x = xmm1 * src.x */
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
-               emit_mul(func, 4, 1);
-               STORE(func, *inst, 4, 0, CHAN_X);
-            }
-
-            /* dst.y = xmm1 * src.y */
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
-               emit_mul(func, 5, 1);
-               STORE(func, *inst, 5, 0, CHAN_Y);
-            }
-
-            /* dst.z = xmm1 * src.z */
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
-               emit_mul(func, 6, 1);
-               STORE(func, *inst, 6, 0, CHAN_Z);
-            }
-
-            /* dst.w = xmm1 * src.w
-             * The test must be on CHAN_W: xmm7 is only loaded above when
-             * the W channel is enabled, and W is the channel written here.
-             */
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4) {
-               emit_mul(func, 7, 1);
-               STORE(func, *inst, 7, 0, CHAN_W);
-            }
-         }
-
-         /* dst0.w = 1.0 */
-         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
-            emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
-            STORE(func, *inst, 0, 0, CHAN_W);
-         }
-      }
-      break;
-
-   case TGSI_OPCODE_DP2:
-      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
-      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
-      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
-      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
-      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
-      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
-      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
-      }
-      break;
-
-   case TGSI_OPCODE_TXP:
-      emit_tex( func, inst, FALSE, TRUE );
-      break;
-
-   case TGSI_OPCODE_TRUNC:
-      /* truncate by converting to integer and back */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_f2it( func, 0 );
-         emit_i2f( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   /* Opcodes that are known but not implemented by this code generator. */
-   case TGSI_OPCODE_CND:
-   case TGSI_OPCODE_CLAMP:
-   case TGSI_OPCODE_RCC:
-   case TGSI_OPCODE_DDX:
-   case TGSI_OPCODE_DDY:
-   case TGSI_OPCODE_PK2H:
-   case TGSI_OPCODE_PK2US:
-   case TGSI_OPCODE_PK4B:
-   case TGSI_OPCODE_PK4UB:
-   case TGSI_OPCODE_RFL:
-   case TGSI_OPCODE_SFL:
-   case TGSI_OPCODE_STR:
-   case TGSI_OPCODE_TXD:
-   case TGSI_OPCODE_UP2H:
-   case TGSI_OPCODE_UP2US:
-   case TGSI_OPCODE_UP4B:
-   case TGSI_OPCODE_UP4UB:
-   case TGSI_OPCODE_X2D:
-   case TGSI_OPCODE_ARA:
-   case TGSI_OPCODE_BRA:
-   case TGSI_OPCODE_CAL:
-   case TGSI_OPCODE_DIV:
-   case TGSI_OPCODE_TXL:
-   case TGSI_OPCODE_BRK:
-   case TGSI_OPCODE_IF:
-   case TGSI_OPCODE_ELSE:
-   case TGSI_OPCODE_ENDIF:
-   case TGSI_OPCODE_PUSHA:
-   case TGSI_OPCODE_POPA:
-   case TGSI_OPCODE_CEIL:
-   case TGSI_OPCODE_I2F:
-   case TGSI_OPCODE_NOT:
-   case TGSI_OPCODE_SHL:
-   case TGSI_OPCODE_ISHR:
-   case TGSI_OPCODE_AND:
-   case TGSI_OPCODE_OR:
-   case TGSI_OPCODE_MOD:
-   case TGSI_OPCODE_XOR:
-   case TGSI_OPCODE_SAD:
-   case TGSI_OPCODE_TXF:
-   case TGSI_OPCODE_TXQ:
-   case TGSI_OPCODE_CONT:
-   case TGSI_OPCODE_EMIT:
-   case TGSI_OPCODE_ENDPRIM:
-      return 0;
-
-   default:
-      return 0;
-   }
-
-   return 1;
-}
-
-/**
- * Emit interpolation code for an input declaration.
- *
- * Declarations for any other register file emit nothing.  For every
- * register in the declared range and every channel in the usage mask,
- * the attribute is evaluated from its a0/dadx/dady coefficients according
- * to the declared interpolation mode and written with emit_inputs().
- *
- * \param func  the x86 function being assembled
- * \param decl  the declaration to translate
- */
-static void
-emit_declaration(
-   struct x86_function *func,
-   struct tgsi_full_declaration *decl )
-{
-   unsigned range_first, range_last, usage;
-   unsigned attrib, chan;
-
-   if( decl->Declaration.File != TGSI_FILE_INPUT )
-      return;
-
-   range_first = decl->Range.First;
-   range_last = decl->Range.Last;
-   usage = decl->Declaration.UsageMask;
-
-   for( attrib = range_first; attrib <= range_last; attrib++ ) {
-      for( chan = 0; chan < NUM_CHANNELS; chan++ ) {
-         if( (usage & (1 << chan)) == 0 )
-            continue;
-
-         switch( decl->Declaration.Interpolate ) {
-         case TGSI_INTERPOLATE_CONSTANT:
-            /* value = a0 */
-            emit_coef_a0( func, 0, attrib, chan );
-            emit_inputs( func, 0, attrib, chan );
-            break;
-
-         case TGSI_INTERPOLATE_LINEAR:
-            /* value = x * dadx + y * dady + a0 */
-            emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
-            emit_coef_dadx( func, 1, attrib, chan );
-            emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
-            emit_coef_dady( func, 3, attrib, chan );
-            emit_mul( func, 0, 1 );    /* x * dadx */
-            emit_coef_a0( func, 4, attrib, chan );
-            emit_mul( func, 2, 3 );    /* y * dady */
-            emit_add( func, 0, 4 );    /* x * dadx + a0 */
-            emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
-            emit_inputs( func, 0, attrib, chan );
-            break;
-
-         case TGSI_INTERPOLATE_PERSPECTIVE:
-            /* value = (x * dadx + y * dady + a0) / w */
-            emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
-            emit_coef_dadx( func, 1, attrib, chan );
-            emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
-            emit_coef_dady( func, 3, attrib, chan );
-            emit_mul( func, 0, 1 );    /* x * dadx */
-            emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
-            emit_coef_a0( func, 5, attrib, chan );
-            emit_rcp( func, 4, 4 );    /* 1.0 / w */
-            emit_mul( func, 2, 3 );    /* y * dady */
-            emit_add( func, 0, 5 );    /* x * dadx + a0 */
-            emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
-            emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
-            emit_inputs( func, 0, attrib, chan );
-            break;
-
-         default:
-            assert( 0 );
-            break;
-         }
-      }
-   }
-}
-
-/**
- * Emit code that transposes input data from array-of-structures layout
- * into the machine's structure-of-arrays input storage.
- *
- * For each input the generated loop reads 16 bytes from each of four
- * consecutive records (spaced "stride" bytes apart), transposes them
- * and writes the resulting four vectors (64 bytes) to the machine input
- * storage, then advances 16 bytes in the source record and 64 bytes in
- * the destination.  A count of zero executes no iterations.
- *
- * \param func         the x86 function being assembled
- * \param arg_aos      index of the function argument holding the AoS pointer
- * \param arg_machine  index of the argument holding the tgsi_exec_machine
- * \param arg_num      index of the argument holding the number of inputs
- * \param arg_stride   index of the argument holding the record stride
- */
-static void aos_to_soa( struct x86_function *func,
-                        uint arg_aos,
-                        uint arg_machine,
-                        uint arg_num,
-                        uint arg_stride )
-{
-   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
-   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
-   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
-   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
-   int loop_top, loop_exit_fixup;
-
-   /* Save EBX */
-   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
-
-   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
-   x86_mov( func, soa_input,  x86_fn_arg( func, arg_machine ) );
-   /* tgsi_exec_machine::Inputs is a pointer, so load its value (mov)
-    * rather than taking the address of the member (lea).
-    */
-   x86_mov( func, soa_input,
-            x86_make_disp( soa_input,
-                           Offset(struct tgsi_exec_machine, Inputs) ) );
-   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
-   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
-
-   /* while (num_inputs != 0) */
-   loop_top = x86_get_label( func );
-   x86_cmp_imm( func, num_inputs, 0 );
-   loop_exit_fixup = x86_jcc_forward( func, cc_E );
-
-   {
-      /* Gather: xmm0/xmm1 receive the first 8 bytes of records 0..3,
-       * xmm3/xmm4 the second 8 bytes.  aos_input is preserved across
-       * the walk over the four records.
-       */
-      x86_push( func, aos_input );
-      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_pop( func, aos_input );
-
-      /* Transpose: 0x88 picks the even elements, 0xdd the odd ones. */
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
-      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
-      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
-      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
-
-      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
-      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
-      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
-      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
-
-      /* Advance to next input */
-      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
-      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
-   }
-   /* --num_inputs */
-   x86_dec( func, num_inputs );
-   x86_jmp( func, loop_top );
-   x86_fixup_fwd_jump( func, loop_exit_fixup );
-
-   /* Restore EBX */
-   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
-}
-
-/**
- * Emit code that transposes the machine's structure-of-arrays output
- * storage back into array-of-structures layout.
- *
- * For each output the generated loop reads four vectors (64 bytes) from
- * the machine output storage, interleaves them and writes 16 bytes to
- * each of four consecutive records (spaced "stride" bytes apart), then
- * advances 16 bytes in the destination record and 64 bytes in the source.
- * A count of zero executes no iterations.
- *
- * \param func         the x86 function being assembled
- * \param arg_aos      index of the function argument holding the AoS pointer
- * \param arg_machine  index of the argument holding the tgsi_exec_machine
- * \param arg_num      index of the argument holding the number of outputs
- * \param arg_stride   index of the argument holding the record stride
- */
-static void soa_to_aos( struct x86_function *func,
-                        uint arg_aos,
-                        uint arg_machine,
-                        uint arg_num,
-                        uint arg_stride )
-{
-   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
-   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
-   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
-   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
-   int loop_top, loop_exit_fixup;
-
-   /* Save EBX */
-   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
-
-   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
-   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
-   /* tgsi_exec_machine::Outputs is a pointer, so load its value (mov)
-    * rather than taking the address of the member (lea).
-    */
-   x86_mov( func, soa_output,
-            x86_make_disp( soa_output,
-                           Offset(struct tgsi_exec_machine, Outputs) ) );
-   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
-
-   /* while (num_outputs != 0)
-    * The count is tested before the body so that a count of zero does
-    * not wrap around on the decrement and overrun both buffers.
-    */
-   loop_top = x86_get_label( func );
-   x86_cmp_imm( func, num_outputs, 0 );
-   loop_exit_fixup = x86_jcc_forward( func, cc_E );
-
-   {
-      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
-      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
-      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
-      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
-
-      /* Interleave the first pair of vectors into xmm0/xmm2 and the
-       * second pair into xmm3/xmm5.
-       */
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
-      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
-      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
-      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
-
-      /* Scatter to four records; aos_output is preserved across the
-       * walk over the records.
-       */
-      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
-      x86_push( func, aos_output );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_pop( func, aos_output );
-
-      /* Advance to next output */
-      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
-      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
-   }
-   /* --num_outputs */
-   x86_dec( func, num_outputs );
-   x86_jmp( func, loop_top );
-   x86_fixup_fwd_jump( func, loop_exit_fixup );
-
-   /* Restore EBX */
-   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
-}
-
-
-/**
- * Check if the instructions dst register is the same as any src
- * register and warn if there's a posible SOA dependency.
- */
-static boolean
-check_soa_dependencies(const struct tgsi_full_instruction *inst)
-{
- uint opcode = inst->Instruction.Opcode;
-
- /* XXX: we only handle src/dst aliasing in a few opcodes currently.
- * Need to use an additional temporay to hold the result in the
- * cases where the code is too opaque to fix.
- */
-
- switch (opcode) {
- case TGSI_OPCODE_ADD:
- case TGSI_OPCODE_MOV:
- case TGSI_OPCODE_MUL:
- case TGSI_OPCODE_RCP:
- case TGSI_OPCODE_RSQ:
- case TGSI_OPCODE_EXP:
- case TGSI_OPCODE_LOG:
- case TGSI_OPCODE_DP3:
- case TGSI_OPCODE_DP4:
- case TGSI_OPCODE_DP2A:
- case TGSI_OPCODE_EX2:
- case TGSI_OPCODE_LG2:
- case TGSI_OPCODE_POW:
- case TGSI_OPCODE_XPD:
- case TGSI_OPCODE_DPH:
- case TGSI_OPCODE_COS:
- case TGSI_OPCODE_SIN:
- case TGSI_OPCODE_TEX:
- case TGSI_OPCODE_TXB:
- case TGSI_OPCODE_TXP:
- case TGSI_OPCODE_NRM:
- case TGSI_OPCODE_NRM4:
- case TGSI_OPCODE_DP2:
- /* OK - these opcodes correctly handle SOA dependencies */
- return TRUE;
- default:
- if (!tgsi_check_soa_dependencies(inst))
- return TRUE;
-
- debug_printf("Warning: src/dst aliasing in instruction"
- " is not handled:\n");
- debug_printf("Warning: ");
- tgsi_dump_instruction(inst, 1);
-
- return FALSE;
- }
-}
-
-
-/**
- * Translate a TGSI vertex/fragment shader to SSE2 code.
- * Slightly different things are done for vertex vs. fragment shaders.
- *
- * \param tokens the TGSI input shader
- * \param func the output SSE code/function
- * \param immediates buffer to place immediates, later passed to SSE func
- * \param return 1 for success, 0 if translation failed
- */
-unsigned
-tgsi_emit_sse2(
- const struct tgsi_token *tokens,
- struct x86_function *func,
- float (*immediates)[4],
- boolean do_swizzles )
-{
- struct tgsi_parse_context parse;
- unsigned ok = 1;
- uint num_immediates = 0;
-
- util_init_math();
-
- func->csr = func->store;
-
- tgsi_parse_init( &parse, tokens );
-
- /* Can't just use EDI, EBX without save/restoring them:
- */
- x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
- x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
-
- /*
- * Different function args for vertex/fragment shaders:
- */
- if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
- if (do_swizzles)
- aos_to_soa( func,
- 4, /* aos_input */
- 1, /* machine */
- 5, /* num_inputs */
- 6 ); /* input_stride */
- }
-
- x86_mov(
- func,
- get_machine_base(),
- x86_fn_arg( func, 1 ) );
- x86_mov(
- func,
- get_const_base(),
- x86_fn_arg( func, 2 ) );
- x86_mov(
- func,
- get_immediate_base(),
- x86_fn_arg( func, 3 ) );
-
- if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
- x86_mov(
- func,
- get_coef_base(),
- x86_fn_arg( func, 4 ) );
- }
-
- x86_mov(
- func,
- get_sampler_base(),
- x86_make_disp( get_machine_base(),
- Offset( struct tgsi_exec_machine, Samplers ) ) );
-
- while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
- tgsi_parse_token( &parse );
-
- switch( parse.FullToken.Token.Type ) {
- case TGSI_TOKEN_TYPE_DECLARATION:
- if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
- emit_declaration(
- func,
- &parse.FullToken.FullDeclaration );
- }
- break;
-
- case TGSI_TOKEN_TYPE_INSTRUCTION:
- ok = emit_instruction(
- func,
- &parse.FullToken.FullInstruction );
-
- if (!ok) {
- uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
- uint proc = parse.FullHeader.Processor.Processor;
- debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
- opcode,
- tgsi_get_opcode_name(opcode),
- tgsi_get_processor_name(proc));
- }
-
- if (ok)
- ok = check_soa_dependencies(&parse.FullToken.FullInstruction);
- break;
-
- case TGSI_TOKEN_TYPE_IMMEDIATE:
- /* simply copy the immediate values into the next immediates[] slot */
- {
- const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
- uint i;
- assert(size <= 4);
- assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
- for( i = 0; i < size; i++ ) {
- immediates[num_immediates][i] =
- parse.FullToken.FullImmediate.u[i].Float;
- }
-#if 0
- debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
- num_immediates,
- immediates[num_immediates][0],
- immediates[num_immediates][1],
- immediates[num_immediates][2],
- immediates[num_immediates][3]);
-#endif
- num_immediates++;
- }
- break;
- case TGSI_TOKEN_TYPE_PROPERTY:
- /* we just ignore them for now */
- break;
-
- default:
- ok = 0;
- assert( 0 );
- }
- }
-
- if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
- if (do_swizzles)
- soa_to_aos( func,
- 7, /* aos_output */
- 1, /* machine */
- 8, /* num_outputs */
- 9 ); /* output_stride */
- }
-
- /* Can't just use EBX, EDI without save/restoring them:
- */
- x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
- x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
-
- emit_ret( func );
-
- tgsi_parse_free( &parse );
-
- return ok;
-}
-
-#else /* !PIPE_ARCH_X86 */
-
-unsigned
-tgsi_emit_sse2(
- const struct tgsi_token *tokens,
- struct x86_function *func,
- float (*immediates)[4],
- boolean do_swizzles )
-{
- return 0;
-}
-
-#endif /* !PIPE_ARCH_X86 */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
deleted file mode 100644
index 00aa8b84fe9..00000000000
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef TGSI_SSE2_H
-#define TGSI_SSE2_H
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-#include "pipe/p_compiler.h"
-
-struct tgsi_exec_machine;
-struct tgsi_interp_coef;
-struct tgsi_token;
-struct x86_function;
-
-unsigned
-tgsi_emit_sse2(
- const struct tgsi_token *tokens,
- struct x86_function *function,
- float (*immediates)[4],
- boolean do_swizzles );
-
-
-/* This is the function prototype generated when do_swizzles is false
- * -- effectively for fragment shaders.
- */
-typedef void (PIPE_CDECL *tgsi_sse2_fs_function) (
- struct tgsi_exec_machine *machine, /* 1 */
- const float (*constant)[4], /* 2 */
- const float (*immediate)[4], /* 3 */
- const struct tgsi_interp_coef *coef /* 4 */
- );
-
-
-/* This is the function prototype generated when do_swizzles is true
- * -- effectively for vertex shaders.
- */
-typedef void (PIPE_CDECL *tgsi_sse2_vs_func) (
- struct tgsi_exec_machine *machine, /* 1 */
- const float (*constant)[4], /* 2 */
- const float (*immediate)[4], /* 3 */
- const float (*aos_input)[4], /* 4 */
- uint num_inputs, /* 5 */
- uint input_stride, /* 6 */
- float (*aos_output)[4], /* 7 */
- uint num_outputs, /* 8 */
- uint output_stride ); /* 9 */
-
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_SSE2_H */
diff --git a/src/gallium/drivers/softpipe/Android.mk b/src/gallium/drivers/softpipe/Android.mk
index d198fa5d0f2..6a125a5d412 100644
--- a/src/gallium/drivers/softpipe/Android.mk
+++ b/src/gallium/drivers/softpipe/Android.mk
@@ -26,7 +26,6 @@ LOCAL_PATH := $(call my-dir)
# from Makefile
C_SOURCES = \
sp_fs_exec.c \
- sp_fs_sse.c \
sp_clear.c \
sp_fence.c \
sp_flush.c \
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index 9403e6cf0b8..27b5d991a75 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -5,7 +5,6 @@ LIBNAME = softpipe
C_SOURCES = \
sp_fs_exec.c \
- sp_fs_sse.c \
sp_clear.c \
sp_fence.c \
sp_flush.c \
diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript
index ea10e8a9f98..da2c93ee5fa 100644
--- a/src/gallium/drivers/softpipe/SConscript
+++ b/src/gallium/drivers/softpipe/SConscript
@@ -6,7 +6,6 @@ softpipe = env.ConvenienceLibrary(
target = 'softpipe',
source = [
'sp_fs_exec.c',
- 'sp_fs_sse.c',
'sp_clear.c',
'sp_context.c',
'sp_draw_arrays.c',
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index c97b0333035..3a83e5870dc 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -235,12 +235,6 @@ softpipe_create_context( struct pipe_screen *screen,
util_init_math();
-#ifdef PIPE_ARCH_X86
- softpipe->use_sse = !debug_get_bool_option( "GALLIUM_NOSSE", FALSE );
-#else
- softpipe->use_sse = FALSE;
-#endif
-
softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE );
softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index d51ce9fe333..5442aba9019 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -190,7 +190,6 @@ struct softpipe_context {
struct softpipe_tex_tile_cache *vertex_tex_cache[PIPE_MAX_VERTEX_SAMPLERS];
struct softpipe_tex_tile_cache *geometry_tex_cache[PIPE_MAX_GEOMETRY_SAMPLERS];
- unsigned use_sse : 1;
unsigned dump_fs : 1;
unsigned dump_gs : 1;
unsigned no_rast : 1;
diff --git a/src/gallium/drivers/softpipe/sp_fs.h b/src/gallium/drivers/softpipe/sp_fs.h
index d46d7d5a657..db689b82bd5 100644
--- a/src/gallium/drivers/softpipe/sp_fs.h
+++ b/src/gallium/drivers/softpipe/sp_fs.h
@@ -36,10 +36,6 @@ struct sp_fragment_shader_variant *
softpipe_create_fs_variant_exec(struct softpipe_context *softpipe,
const struct pipe_shader_state *templ);
-struct sp_fragment_shader_variant *
-softpipe_create_fs_variant_sse(struct softpipe_context *softpipe,
- const struct pipe_shader_state *templ);
-
struct tgsi_interp_coef;
struct tgsi_exec_vector;
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
deleted file mode 100644
index c873af125bd..00000000000
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * Execute fragment shader using runtime SSE code generation.
- */
-
-#include "sp_context.h"
-#include "sp_state.h"
-#include "sp_fs.h"
-#include "sp_quad.h"
-
-#include "pipe/p_state.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "tgsi/tgsi_exec.h"
-#include "tgsi/tgsi_sse2.h"
-
-
-#if defined(PIPE_ARCH_X86)
-
-#include "rtasm/rtasm_x86sse.h"
-
-
-
-/**
- * Subclass of sp_fragment_shader_variant
- */
-struct sp_sse_fragment_shader
-{
- struct sp_fragment_shader_variant base;
- struct x86_function sse2_program;
- tgsi_sse2_fs_function func;
- float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];
-};
-
-
-/** cast wrapper */
-static INLINE struct sp_sse_fragment_shader *
-sp_sse_fragment_shader(const struct sp_fragment_shader_variant *base)
-{
- return (struct sp_sse_fragment_shader *) base;
-}
-
-
-static void
-fs_sse_prepare( const struct sp_fragment_shader_variant *base,
- struct tgsi_exec_machine *machine,
- struct tgsi_sampler **samplers )
-{
- machine->Samplers = samplers;
-}
-
-
-
-/**
- * Compute quad X,Y,Z,W for the four fragments in a quad.
- *
- * This should really be part of the compiled shader.
- */
-static void
-setup_pos_vector(const struct tgsi_interp_coef *coef,
- float x, float y,
- struct tgsi_exec_vector *quadpos)
-{
- uint chan;
- /* do X */
- quadpos->xyzw[0].f[0] = x;
- quadpos->xyzw[0].f[1] = x + 1;
- quadpos->xyzw[0].f[2] = x;
- quadpos->xyzw[0].f[3] = x + 1;
-
- /* do Y */
- quadpos->xyzw[1].f[0] = y;
- quadpos->xyzw[1].f[1] = y;
- quadpos->xyzw[1].f[2] = y + 1;
- quadpos->xyzw[1].f[3] = y + 1;
-
- /* do Z and W for all fragments in the quad */
- for (chan = 2; chan < 4; chan++) {
- const float dadx = coef->dadx[chan];
- const float dady = coef->dady[chan];
- const float a0 = coef->a0[chan] + dadx * x + dady * y;
- quadpos->xyzw[chan].f[0] = a0;
- quadpos->xyzw[chan].f[1] = a0 + dadx;
- quadpos->xyzw[chan].f[2] = a0 + dady;
- quadpos->xyzw[chan].f[3] = a0 + dadx + dady;
- }
-}
-
-
-/* TODO: codegenerate the whole run function, skip this wrapper.
- * TODO: break dependency on tgsi_exec_machine struct
- * TODO: push Position calculation into the generated shader
- * TODO: process >1 quad at a time
- */
-static unsigned
-fs_sse_run( const struct sp_fragment_shader_variant *base,
- struct tgsi_exec_machine *machine,
- struct quad_header *quad )
-{
- struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base);
-
- /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */
- setup_pos_vector(quad->posCoef,
- (float)quad->input.x0, (float)quad->input.y0,
- machine->Temps);
-
- /* init kill mask */
- tgsi_set_kill_mask(machine, 0x0);
- tgsi_set_exec_mask(machine, 1, 1, 1, 1);
-
- shader->func( machine,
- (const float (*)[4])machine->Consts[0],
- (const float (*)[4])shader->immediates,
- machine->InterpCoefs
- /*, &machine->QuadPos*/
- );
-
- quad->inout.mask &= ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
- if (quad->inout.mask == 0)
- return FALSE;
-
- /* store outputs */
- {
- const ubyte *sem_name = base->info.output_semantic_name;
- const ubyte *sem_index = base->info.output_semantic_index;
- const uint n = base->info.num_outputs;
- uint i;
- for (i = 0; i < n; i++) {
- switch (sem_name[i]) {
- case TGSI_SEMANTIC_COLOR:
- {
- uint cbuf = sem_index[i];
-
- assert(sizeof(quad->output.color[cbuf]) ==
- sizeof(machine->Outputs[i]));
-
- /* copy float[4][4] result */
- memcpy(quad->output.color[cbuf],
- &machine->Outputs[i],
- sizeof(quad->output.color[0]) );
- }
- break;
- case TGSI_SEMANTIC_POSITION:
- {
- uint j;
- for (j = 0; j < 4; j++)
- quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
- }
- break;
- case TGSI_SEMANTIC_STENCIL:
- {
- uint j;
- for (j = 0; j < 4; j++)
- quad->output.stencil[j] = machine->Outputs[i].xyzw[1].f[j];
- }
- break;
- }
- }
- }
-
- return TRUE;
-}
-
-
-static void
-fs_sse_delete( struct sp_fragment_shader_variant *base )
-{
- struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base);
-
- x86_release_func( &shader->sse2_program );
- FREE(shader);
-}
-
-
-struct sp_fragment_shader_variant *
-softpipe_create_fs_variant_sse(struct softpipe_context *softpipe,
- const struct pipe_shader_state *templ)
-{
- struct sp_sse_fragment_shader *shader;
-
- if (!softpipe->use_sse)
- return NULL;
-
- shader = CALLOC_STRUCT(sp_sse_fragment_shader);
- if (!shader)
- return NULL;
-
- x86_init_func( &shader->sse2_program );
-
- if (!tgsi_emit_sse2( templ->tokens, &shader->sse2_program,
- shader->immediates, FALSE )) {
- FREE(shader);
- return NULL;
- }
-
- shader->func = (tgsi_sse2_fs_function) x86_get_func( &shader->sse2_program );
- if (!shader->func) {
- x86_release_func( &shader->sse2_program );
- FREE(shader);
- return NULL;
- }
-
- shader->base.prepare = fs_sse_prepare;
- shader->base.run = fs_sse_run;
- shader->base.delete = fs_sse_delete;
-
- return &shader->base;
-}
-
-
-#else
-
-/* Maybe put this variant in the header file.
- */
-struct sp_fragment_shader_variant *
-softpipe_create_fs_variant_sse(struct softpipe_context *softpipe,
- const struct pipe_shader_state *templ)
-{
- return NULL;
-}
-
-#endif
diff --git a/src/gallium/drivers/softpipe/sp_state_shader.c b/src/gallium/drivers/softpipe/sp_state_shader.c
index 612dcb38eb4..6acb57b3fe6 100644
--- a/src/gallium/drivers/softpipe/sp_state_shader.c
+++ b/src/gallium/drivers/softpipe/sp_state_shader.c
@@ -65,10 +65,7 @@ create_fs_variant(struct softpipe_context *softpipe,
#endif
/* codegen, create variant object */
- var = softpipe_create_fs_variant_sse(softpipe, curfs);
- if (!var) {
- var = softpipe_create_fs_variant_exec(softpipe, curfs);
- }
+ var = softpipe_create_fs_variant_exec(softpipe, curfs);
if (var) {
var->key = *key;