diff options
Diffstat (limited to 'src/gallium/auxiliary/tgsi/tgsi_sse2.c')
-rw-r--r-- | src/gallium/auxiliary/tgsi/tgsi_sse2.c | 3106 |
1 files changed, 0 insertions, 3106 deletions
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c deleted file mode 100644 index 5614caf63e7..00000000000 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ /dev/null @@ -1,3106 +0,0 @@ -/************************************************************************** - * - * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * Copyright 2009-2010 VMware, Inc. All rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#include "pipe/p_config.h" - -#include "tgsi/tgsi_sse2.h" - -#if defined(PIPE_ARCH_X86) && 0 /* See FIXME notes below */ - -#include "util/u_debug.h" -#include "pipe/p_shader_tokens.h" -#include "util/u_math.h" -#include "util/u_memory.h" -#if defined(PIPE_ARCH_SSE) -#include "util/u_sse.h" -#endif -#include "tgsi/tgsi_info.h" -#include "tgsi/tgsi_parse.h" -#include "tgsi/tgsi_util.h" -#include "tgsi/tgsi_dump.h" -#include "tgsi/tgsi_exec.h" - -#include "rtasm/rtasm_x86sse.h" - -/* for 1/sqrt() - * - * This costs about 100fps (close to 10%) in gears: - */ -#define HIGH_PRECISION 1 - -#define FAST_MATH 1 - - -#define FOR_EACH_CHANNEL( CHAN )\ - for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) - -#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ - ((INST).Dst[0].Register.WriteMask & (1 << (CHAN))) - -#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ - if (IS_DST0_CHANNEL_ENABLED( INST, CHAN )) - -#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\ - FOR_EACH_CHANNEL( CHAN )\ - IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN ) - -#define CHAN_X 0 -#define CHAN_Y 1 -#define CHAN_Z 2 -#define CHAN_W 3 - -#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I -#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C - -#define TEMP_R0 TGSI_EXEC_TEMP_R0 -#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR -#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I -#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C - - -/** - * X86 utility functions. - */ - -static struct x86_reg -make_xmm( - unsigned xmm ) -{ - return x86_make_reg( - file_XMM, - (enum x86_reg_name) xmm ); -} - -/** - * X86 register mapping helpers. - */ - -static struct x86_reg -get_const_base( void ) -{ - return x86_make_reg( - file_REG32, - reg_AX ); -} - -static struct x86_reg -get_machine_base( void ) -{ - return x86_make_reg( - file_REG32, - reg_CX ); -} - -static struct x86_reg -get_input_base( void ) -{ - /* FIXME: tgsi_exec_machine::Inputs is a pointer now! */ - return x86_make_disp( - get_machine_base(), - Offset(struct tgsi_exec_machine, Inputs) ); -} - -static struct x86_reg -get_output_base( void ) -{ - /* FIXME: tgsi_exec_machine::Ouputs is a pointer now! */ - return x86_make_disp( - get_machine_base(), - Offset(struct tgsi_exec_machine, Outputs) ); -} - -static struct x86_reg -get_temp_base( void ) -{ - return x86_make_disp( - get_machine_base(), - Offset(struct tgsi_exec_machine, Temps) ); -} - -static struct x86_reg -get_coef_base( void ) -{ - return x86_make_reg( - file_REG32, - reg_BX ); -} - -static struct x86_reg -get_sampler_base( void ) -{ - return x86_make_reg( - file_REG32, - reg_DI ); -} - -static struct x86_reg -get_immediate_base( void ) -{ - return x86_make_reg( - file_REG32, - reg_DX ); -} - -static struct x86_reg -get_system_value_base( void ) -{ - return x86_make_disp( - get_machine_base(), - Offset(struct tgsi_exec_machine, SystemValue) ); -} - - -/** - * Data access helpers. - */ - - -static struct x86_reg -get_immediate( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_immediate_base(), - (vec * 4 + chan) * 4 ); -} - -static struct x86_reg -get_const( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_const_base(), - (vec * 4 + chan) * 4 ); -} - -static struct x86_reg -get_sampler_ptr( - unsigned unit ) -{ - return x86_make_disp( - get_sampler_base(), - unit * sizeof( struct tgsi_sampler * ) ); -} - -static struct x86_reg -get_input( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_input_base(), - (vec * 4 + chan) * 16 ); -} - -static struct x86_reg -get_output( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_output_base(), - (vec * 4 + chan) * 16 ); -} - -static struct x86_reg -get_temp( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_temp_base(), - (vec * 4 + chan) * 16 ); -} - -static struct x86_reg -get_system_value( - unsigned vec, - unsigned chan ) -{ - return x86_make_disp( - get_system_value_base(), /* base */ - (vec * 4 + chan) * 4 ); /* byte offset from base */ -} - -static struct x86_reg -get_coef( - unsigned vec, - unsigned chan, - unsigned member ) -{ - return x86_make_disp( - get_coef_base(), - ((vec * 3 + member) * 4 + chan) * 4 ); -} - - -static void -emit_ret( - struct x86_function *func ) -{ - x86_ret( func ); -} - - -/** - * Data fetch helpers. - */ - -/** - * Copy a shader constant to xmm register - * \param xmm the destination xmm register - * \param vec the src const buffer index - * \param chan src channel to fetch (X, Y, Z or W) - */ -static void -emit_const( - struct x86_function *func, - uint xmm, - int vec, - uint chan, - uint indirect, - uint indirectFile, - int indirectIndex ) -{ - if (indirect) { - /* 'vec' is the offset from the address register's value. - * We're loading CONST[ADDR+vec] into an xmm register. - */ - struct x86_reg r0 = get_immediate_base(); - struct x86_reg r1 = get_coef_base(); - uint i; - - assert( indirectFile == TGSI_FILE_ADDRESS ); - assert( indirectIndex == 0 ); - assert( r0.mod == mod_REG ); - assert( r1.mod == mod_REG ); - - x86_push( func, r0 ); - x86_push( func, r1 ); - - /* - * Loop over the four pixels or vertices in the quad. - * Get the value of the address (offset) register for pixel/vertex[i], - * add it to the src offset and index into the constant buffer. - * Note that we're working on SOA data. - * If any of the pixel/vertex execution channels are unused their - * values will be garbage. It's very important that we don't use - * those garbage values as indexes into the constant buffer since - * that'll cause segfaults. - * The solution is to bitwise-AND the offset with the execution mask - * register whose values are either 0 or ~0. - * The caller must setup the execution mask register to indicate - * which channels are valid/alive before running the shader. - * The execution mask will also figure into loops and conditionals - * someday. - */ - for (i = 0; i < QUAD_SIZE; i++) { - /* r1 = address register[i] */ - x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) ); - /* r0 = execution mask[i] */ - x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) ); - /* r1 = r1 & r0 */ - x86_and( func, r1, r0 ); - /* r0 = 'vec', the offset */ - x86_lea( func, r0, get_const( vec, chan ) ); - - /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm. - */ - x86_add( func, r1, r1 ); - x86_add( func, r1, r1 ); - x86_add( func, r1, r1 ); - x86_add( func, r1, r1 ); - - x86_add( func, r0, r1 ); /* r0 = r0 + r1 */ - x86_mov( func, r1, x86_deref( r0 ) ); - x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 ); - } - - x86_pop( func, r1 ); - x86_pop( func, r0 ); - - sse_movaps( - func, - make_xmm( xmm ), - get_temp( TEMP_R0, CHAN_X ) ); - } - else { - /* 'vec' is the index into the src register file, such as TEMP[vec] */ - assert( vec >= 0 ); - - sse_movss( - func, - make_xmm( xmm ), - get_const( vec, chan ) ); - sse_shufps( - func, - make_xmm( xmm ), - make_xmm( xmm ), - SHUF( 0, 0, 0, 0 ) ); - } -} - -static void -emit_immediate( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movss( - func, - make_xmm( xmm ), - get_immediate( vec, chan ) ); - sse_shufps( - func, - make_xmm( xmm ), - make_xmm( xmm ), - SHUF( 0, 0, 0, 0 ) ); -} - - -/** - * Copy a shader input to xmm register - * \param xmm the destination xmm register - * \param vec the src input attrib - * \param chan src channel to fetch (X, Y, Z or W) - */ -static void -emit_inputf( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movups( - func, - make_xmm( xmm ), - get_input( vec, chan ) ); -} - -/** - * Store an xmm register to a shader output - * \param xmm the source xmm register - * \param vec the dest output attrib - * \param chan src dest channel to store (X, Y, Z or W) - */ -static void -emit_output( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movups( - func, - get_output( vec, chan ), - make_xmm( xmm ) ); -} - -/** - * Copy a shader temporary to xmm register - * \param xmm the destination xmm register - * \param vec the src temp register - * \param chan src channel to fetch (X, Y, Z or W) - */ -static void -emit_tempf( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movaps( - func, - make_xmm( xmm ), - get_temp( vec, chan ) ); -} - -/** - * Copy a system value to xmm register - * \param xmm the destination xmm register - * \param vec the source system value register - * \param chan src channel to fetch (X, Y, Z or W) - */ -static void -emit_system_value( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movss( - func, - make_xmm( xmm ), - get_system_value( vec, chan ) ); - sse_shufps( - func, - make_xmm( xmm ), - make_xmm( xmm ), - SHUF( 0, 0, 0, 0 ) ); -} - -/** - * Load an xmm register with an input attrib coefficient (a0, dadx or dady) - * \param xmm the destination xmm register - * \param vec the src input/attribute coefficient index - * \param chan src channel to fetch (X, Y, Z or W) - * \param member 0=a0, 1=dadx, 2=dady - */ -static void -emit_coef( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan, - unsigned member ) -{ - sse_movss( - func, - make_xmm( xmm ), - get_coef( vec, chan, member ) ); - sse_shufps( - func, - make_xmm( xmm ), - make_xmm( xmm ), - SHUF( 0, 0, 0, 0 ) ); -} - -/** - * Data store helpers. - */ - -static void -emit_inputs( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movups( - func, - get_input( vec, chan ), - make_xmm( xmm ) ); -} - -static void -emit_temps( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - sse_movaps( - func, - get_temp( vec, chan ), - make_xmm( xmm ) ); -} - -static void -emit_addrs( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - assert( vec == 0 ); - - emit_temps( - func, - xmm, - vec + TGSI_EXEC_TEMP_ADDR, - chan ); -} - -/** - * Coefficent fetch helpers. - */ - -static void -emit_coef_a0( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - emit_coef( - func, - xmm, - vec, - chan, - 0 ); -} - -static void -emit_coef_dadx( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - emit_coef( - func, - xmm, - vec, - chan, - 1 ); -} - -static void -emit_coef_dady( - struct x86_function *func, - unsigned xmm, - unsigned vec, - unsigned chan ) -{ - emit_coef( - func, - xmm, - vec, - chan, - 2 ); -} - -/** - * Function call helpers. - */ - -/** - * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be - * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee - * that the stack pointer is 16 byte aligned, as expected. - */ -static void -emit_func_call( - struct x86_function *func, - unsigned xmm_save_mask, - const struct x86_reg *arg, - unsigned nr_args, - void (PIPE_CDECL *code)() ) -{ - struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - unsigned i, n; - - x86_push( - func, - x86_make_reg( file_REG32, reg_AX) ); - x86_push( - func, - x86_make_reg( file_REG32, reg_CX) ); - x86_push( - func, - x86_make_reg( file_REG32, reg_DX) ); - - /* Store XMM regs to the stack - */ - for(i = 0, n = 0; i < 8; ++i) - if(xmm_save_mask & (1 << i)) - ++n; - - x86_sub_imm( - func, - x86_make_reg( file_REG32, reg_SP ), - n*16); - - for(i = 0, n = 0; i < 8; ++i) - if(xmm_save_mask & (1 << i)) { - sse_movups( - func, - x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ), - make_xmm( i ) ); - ++n; - } - - for (i = 0; i < nr_args; i++) { - /* Load the address of the buffer we use for passing arguments and - * receiving results: - */ - x86_lea( - func, - ecx, - arg[i] ); - - /* Push actual function arguments (currently just the pointer to - * the buffer above), and call the function: - */ - x86_push( func, ecx ); - } - - x86_mov_reg_imm( func, ecx, (unsigned long) code ); - x86_call( func, ecx ); - - /* Pop the arguments (or just add an immediate to esp) - */ - for (i = 0; i < nr_args; i++) { - x86_pop(func, ecx ); - } - - /* Pop the saved XMM regs: - */ - for(i = 0, n = 0; i < 8; ++i) - if(xmm_save_mask & (1 << i)) { - sse_movups( - func, - make_xmm( i ), - x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) ); - ++n; - } - - x86_add_imm( - func, - x86_make_reg( file_REG32, reg_SP ), - n*16); - - /* Restore GP registers in a reverse order. - */ - x86_pop( - func, - x86_make_reg( file_REG32, reg_DX) ); - x86_pop( - func, - x86_make_reg( file_REG32, reg_CX) ); - x86_pop( - func, - x86_make_reg( file_REG32, reg_AX) ); -} - -static void -emit_func_call_dst_src1( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst, - unsigned xmm_src0, - void (PIPE_CDECL *code)() ) -{ - struct x86_reg store = get_temp( TEMP_R0, 0 ); - unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst); - - /* Store our input parameters (in xmm regs) to the buffer we use - * for passing arguments. We will pass a pointer to this buffer as - * the actual function argument. - */ - sse_movaps( - func, - store, - make_xmm( xmm_src0 ) ); - - emit_func_call( func, - xmm_mask, - &store, - 1, - code ); - - sse_movaps( - func, - make_xmm( xmm_dst ), - store ); -} - - -static void -emit_func_call_dst_src2( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst, - unsigned xmm_src0, - unsigned xmm_src1, - void (PIPE_CDECL *code)() ) -{ - struct x86_reg store = get_temp( TEMP_R0, 0 ); - unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst); - - /* Store two inputs to parameter buffer. - */ - sse_movaps( - func, - store, - make_xmm( xmm_src0 ) ); - - sse_movaps( - func, - x86_make_disp( store, 4 * sizeof(float) ), - make_xmm( xmm_src1 ) ); - - - /* Emit the call - */ - emit_func_call( func, - xmm_mask, - &store, - 1, - code ); - - /* Retrieve the results: - */ - sse_movaps( - func, - make_xmm( xmm_dst ), - store ); -} - - - - - -#if defined(PIPE_ARCH_SSE) - -/* - * Fast SSE2 implementation of special math functions. - */ - -#define POLY0(x, c0) _mm_set1_ps(c0) -#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) -#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) -#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) -#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) -#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) - -#define EXP_POLY_DEGREE 3 -#define LOG_POLY_DEGREE 5 - -/** - * See http://www.devmaster.net/forums/showthread.php?p=43580 - */ -static INLINE __m128 -exp2f4(__m128 x) -{ - __m128i ipart; - __m128 fpart, expipart, expfpart; - - x = _mm_min_ps(x, _mm_set1_ps( 129.00000f)); - x = _mm_max_ps(x, _mm_set1_ps(-126.99999f)); - - /* ipart = int(x - 0.5) */ - ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); - - /* fpart = x - ipart */ - fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); - - /* expipart = (float) (1 << ipart) */ - expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); - - /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */ -#if EXP_POLY_DEGREE == 5 - expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); -#elif EXP_POLY_DEGREE == 4 - expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f); -#elif EXP_POLY_DEGREE == 3 - expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f); -#elif EXP_POLY_DEGREE == 2 - expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f); -#else -#error -#endif - - return _mm_mul_ps(expipart, expfpart); -} - - -/** - * See http://www.devmaster.net/forums/showthread.php?p=43580 - */ -static INLINE __m128 -log2f4(__m128 x) -{ - __m128i expmask = _mm_set1_epi32(0x7f800000); - __m128i mantmask = _mm_set1_epi32(0x007fffff); - __m128 one = _mm_set1_ps(1.0f); - - __m128i i = _mm_castps_si128(x); - - /* exp = (float) exponent(x) */ - __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127))); - - /* mant = (float) mantissa(x) */ - __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one); - - __m128 logmant; - - /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ - * These coefficients can be generate with - * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html - */ -#if LOG_POLY_DEGREE == 6 - logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f); -#elif LOG_POLY_DEGREE == 5 - logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); -#elif LOG_POLY_DEGREE == 4 - logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); -#elif LOG_POLY_DEGREE == 3 - logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); -#else -#error -#endif - - /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ - logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one)); - - return _mm_add_ps(logmant, exp); -} - - -static INLINE __m128 -powf4(__m128 x, __m128 y) -{ - return exp2f4(_mm_mul_ps(log2f4(x), y)); -} - -#endif /* PIPE_ARCH_SSE */ - - - -/** - * Low-level instruction translators. - */ - -static void -emit_abs( - struct x86_function *func, - unsigned xmm ) -{ - sse_andps( - func, - make_xmm( xmm ), - get_temp( - TGSI_EXEC_TEMP_7FFFFFFF_I, - TGSI_EXEC_TEMP_7FFFFFFF_C ) ); -} - -static void -emit_add( - struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src ) -{ - sse_addps( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -} - -static void PIPE_CDECL -cos4f( - float *store ) -{ - store[0] = cosf( store[0] ); - store[1] = cosf( store[1] ); - store[2] = cosf( store[2] ); - store[3] = cosf( store[3] ); -} - -static void -emit_cos( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - cos4f ); -} - -static void PIPE_CDECL -#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) -__attribute__((force_align_arg_pointer)) -#endif -ex24f( - float *store ) -{ -#if defined(PIPE_ARCH_SSE) - _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) )); -#else - store[0] = util_fast_exp2( store[0] ); - store[1] = util_fast_exp2( store[1] ); - store[2] = util_fast_exp2( store[2] ); - store[3] = util_fast_exp2( store[3] ); -#endif -} - -static void -emit_ex2( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - ex24f ); -} - -static void -emit_f2it( - struct x86_function *func, - unsigned xmm ) -{ - sse2_cvttps2dq( - func, - make_xmm( xmm ), - make_xmm( xmm ) ); -} - -static void -emit_i2f( - struct x86_function *func, - unsigned xmm ) -{ - sse2_cvtdq2ps( - func, - make_xmm( xmm ), - make_xmm( xmm ) ); -} - -static void PIPE_CDECL -flr4f( - float *store ) -{ - store[0] = floorf( store[0] ); - store[1] = floorf( store[1] ); - store[2] = floorf( store[2] ); - store[3] = floorf( store[3] ); -} - -static void -emit_flr( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - flr4f ); -} - -static void PIPE_CDECL -frc4f( - float *store ) -{ - store[0] -= floorf( store[0] ); - store[1] -= floorf( store[1] ); - store[2] -= floorf( store[2] ); - store[3] -= floorf( store[3] ); -} - -static void -emit_frc( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - frc4f ); -} - -static void PIPE_CDECL -#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) -__attribute__((force_align_arg_pointer)) -#endif -lg24f( - float *store ) -{ -#if defined(PIPE_ARCH_SSE) - _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) )); -#else - store[0] = util_fast_log2( store[0] ); - store[1] = util_fast_log2( store[1] ); - store[2] = util_fast_log2( store[2] ); - store[3] = util_fast_log2( store[3] ); -#endif -} - -static void -emit_lg2( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - lg24f ); -} - -static void -emit_MOV( - struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src ) -{ - sse_movups( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -} - -static void -emit_mul (struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src) -{ - sse_mulps( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -} - -static void -emit_neg( - struct x86_function *func, - unsigned xmm ) -{ - sse_xorps( - func, - make_xmm( xmm ), - get_temp( - TGSI_EXEC_TEMP_80000000_I, - TGSI_EXEC_TEMP_80000000_C ) ); -} - -static void PIPE_CDECL -#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) -__attribute__((force_align_arg_pointer)) -#endif -pow4f( - float *store ) -{ -#if defined(PIPE_ARCH_SSE) - _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) )); -#else - store[0] = util_fast_pow( store[0], store[4] ); - store[1] = util_fast_pow( store[1], store[5] ); - store[2] = util_fast_pow( store[2], store[6] ); - store[3] = util_fast_pow( store[3], store[7] ); -#endif -} - -static void -emit_pow( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst, - unsigned xmm_src0, - unsigned xmm_src1 ) -{ - emit_func_call_dst_src2( - func, - xmm_save, - xmm_dst, - xmm_src0, - xmm_src1, - pow4f ); -} - -static void -emit_rcp ( - struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src ) -{ - /* On Intel CPUs at least, this is only accurate to 12 bits -- not - * good enough. Need to either emit a proper divide or use the - * iterative technique described below in emit_rsqrt(). - */ - sse2_rcpps( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -} - -static void PIPE_CDECL -rnd4f( - float *store ) -{ - store[0] = floorf( store[0] + 0.5f ); - store[1] = floorf( store[1] + 0.5f ); - store[2] = floorf( store[2] + 0.5f ); - store[3] = floorf( store[3] + 0.5f ); -} - -static void -emit_rnd( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - rnd4f ); -} - -static void -emit_rsqrt( - struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src ) -{ -#if HIGH_PRECISION - /* Although rsqrtps() and rcpps() are low precision on some/all SSE - * implementations, it is possible to improve its precision at - * fairly low cost, using a newton/raphson step, as below: - * - * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) - * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] - * - * See: http://softwarecommunity.intel.com/articles/eng/1818.htm - */ - { - struct x86_reg dst = make_xmm( xmm_dst ); - struct x86_reg src = make_xmm( xmm_src ); - struct x86_reg tmp0 = make_xmm( 2 ); - struct x86_reg tmp1 = make_xmm( 3 ); - - assert( xmm_dst != xmm_src ); - assert( xmm_dst != 2 && xmm_dst != 3 ); - assert( xmm_src != 2 && xmm_src != 3 ); - - sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) ); - sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); - sse_rsqrtps( func, tmp1, src ); - sse_mulps( func, src, tmp1 ); - sse_mulps( func, dst, tmp1 ); - sse_mulps( func, src, tmp1 ); - sse_subps( func, tmp0, src ); - sse_mulps( func, dst, tmp0 ); - } -#else - /* On Intel CPUs at least, this is only accurate to 12 bits -- not - * good enough. - */ - sse_rsqrtps( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -#endif -} - -static void -emit_setsign( - struct x86_function *func, - unsigned xmm ) -{ - sse_orps( - func, - make_xmm( xmm ), - get_temp( - TGSI_EXEC_TEMP_80000000_I, - TGSI_EXEC_TEMP_80000000_C ) ); -} - -static void PIPE_CDECL -sgn4f( - float *store ) -{ - store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f; - store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f; - store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f; - store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f; -} - -static void -emit_sgn( - struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst ) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - sgn4f ); -} - -static void PIPE_CDECL -sin4f( - float *store ) -{ - store[0] = sinf( store[0] ); - store[1] = sinf( store[1] ); - store[2] = sinf( store[2] ); - store[3] = sinf( store[3] ); -} - -static void -emit_sin (struct x86_function *func, - unsigned xmm_save, - unsigned xmm_dst) -{ - emit_func_call_dst_src1( - func, - xmm_save, - xmm_dst, - xmm_dst, - sin4f ); -} - -static void -emit_sub( - struct x86_function *func, - unsigned xmm_dst, - unsigned xmm_src ) -{ - sse_subps( - func, - make_xmm( xmm_dst ), - make_xmm( xmm_src ) ); -} - -/** - * Register fetch. - */ -static void -emit_fetch( - struct x86_function *func, - unsigned xmm, - const struct tgsi_full_src_register *reg, - const unsigned chan_index ) -{ - unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index ); - - switch (swizzle) { - case TGSI_SWIZZLE_X: - case TGSI_SWIZZLE_Y: - case TGSI_SWIZZLE_Z: - case TGSI_SWIZZLE_W: - switch (reg->Register.File) { - case TGSI_FILE_CONSTANT: - emit_const( - func, - xmm, - reg->Register.Index, - swizzle, - reg->Register.Indirect, - reg->Indirect.File, - reg->Indirect.Index ); - break; - - case TGSI_FILE_IMMEDIATE: - emit_immediate( - func, - xmm, - reg->Register.Index, - swizzle ); - break; - - case TGSI_FILE_SYSTEM_VALUE: - emit_system_value( - func, - xmm, - reg->Register.Index, - swizzle ); - break; - - case TGSI_FILE_INPUT: - emit_inputf( - func, - xmm, - reg->Register.Index, - swizzle ); - break; - - case TGSI_FILE_TEMPORARY: - emit_tempf( - func, - xmm, - reg->Register.Index, - swizzle ); - break; - - default: - assert( 0 ); - } - break; - - default: - assert( 0 ); - } - - switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) { - case TGSI_UTIL_SIGN_CLEAR: - emit_abs( func, xmm ); - break; - - case TGSI_UTIL_SIGN_SET: - emit_setsign( func, xmm ); - break; - - case TGSI_UTIL_SIGN_TOGGLE: - emit_neg( func, xmm ); - break; - - case TGSI_UTIL_SIGN_KEEP: - break; - } -} - -#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\ - emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN ) - -/** - * Register store. - */ -static void -emit_store( - struct x86_function *func, - unsigned xmm, - const struct tgsi_full_dst_register *reg, - const struct tgsi_full_instruction *inst, - unsigned chan_index ) -{ - switch( inst->Instruction.Saturate ) { - case TGSI_SAT_NONE: - break; - - case TGSI_SAT_ZERO_ONE: - sse_maxps( - func, - make_xmm( xmm ), - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ) ); - - sse_minps( - func, - make_xmm( xmm ), - get_temp( - TGSI_EXEC_TEMP_ONE_I, - TGSI_EXEC_TEMP_ONE_C ) ); - break; - - case TGSI_SAT_MINUS_PLUS_ONE: - assert( 0 ); - break; - } - - - switch( reg->Register.File ) { - case TGSI_FILE_OUTPUT: - emit_output( - func, - xmm, - reg->Register.Index, - chan_index ); - break; - - case TGSI_FILE_TEMPORARY: - emit_temps( - func, - xmm, - reg->Register.Index, - chan_index ); - break; - - case TGSI_FILE_ADDRESS: - emit_addrs( - func, - xmm, - reg->Register.Index, - chan_index ); - break; - - default: - assert( 0 ); - } -} - -#define STORE( FUNC, INST, XMM, INDEX, CHAN )\ - emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN ) - - -static void PIPE_CDECL -fetch_texel( struct tgsi_sampler **sampler, - float *store ) -{ -#if 0 - uint j; - - debug_printf("%s sampler: %p (%p) store: %p\n", - __FUNCTION__, - sampler, *sampler, - store ); - - for (j = 0; j < 4; j++) - debug_printf("sample %d texcoord %f %f %f lodbias %f\n", - j, - store[0+j], - store[4+j], - store[8 + j], - store[12 + j]); -#endif - - { - float rgba[NUM_CHANNELS][QUAD_SIZE]; - (*sampler)->get_samples(*sampler, - &store[0], /* s */ - &store[4], /* t */ - &store[8], /* r */ - &store[12], /* lodbias */ - tgsi_sampler_lod_bias, - rgba); /* results */ - - memcpy( store, rgba, 16 * sizeof(float)); - } - -#if 0 - for (j = 0; j < 4; j++) - debug_printf("sample %d result %f %f %f %f\n", - j, - store[0+j], - store[4+j], - store[8+j], - store[12+j]); -#endif -} - -/** - * High-level instruction translators. - */ -static void -emit_tex( struct x86_function *func, - const struct tgsi_full_instruction *inst, - boolean lodbias, - boolean projected) -{ - const uint unit = inst->Src[1].Register.Index; - struct x86_reg args[2]; - unsigned count; - unsigned i; - - assert(inst->Instruction.Texture); - switch (inst->Texture.Texture) { - case TGSI_TEXTURE_1D: - count = 1; - break; - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_RECT: - case TGSI_TEXTURE_1D_ARRAY: - count = 2; - break; - case TGSI_TEXTURE_SHADOW1D: - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_SHADOWRECT: - case TGSI_TEXTURE_3D: - case TGSI_TEXTURE_CUBE: - case TGSI_TEXTURE_2D_ARRAY: - case TGSI_TEXTURE_SHADOW1D_ARRAY: - count = 3; - break; - case TGSI_TEXTURE_SHADOW2D_ARRAY: - count = 4; - break; - default: - assert(0); - return; - } - - if (lodbias) { - FETCH( func, *inst, 3, 0, 3 ); - } - else { - emit_tempf( - func, - 3, - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ); - - } - - /* store lodbias whether enabled or not -- fetch_texel currently - * respects it always. - */ - sse_movaps( func, - get_temp( TEMP_R0, 3 ), - make_xmm( 3 ) ); - - if (projected) { - FETCH( func, *inst, 3, 0, 3 ); - - emit_rcp( func, 3, 3 ); - } - - for (i = 0; i < count; i++) { - FETCH( func, *inst, i, 0, i ); - - if (projected) { - sse_mulps( - func, - make_xmm( i ), - make_xmm( 3 ) ); - } - - /* Store in the argument buffer: - */ - sse_movaps( - func, - get_temp( TEMP_R0, i ), - make_xmm( i ) ); - } - - args[0] = get_temp( TEMP_R0, 0 ); - args[1] = get_sampler_ptr( unit ); - - emit_func_call( func, - 0, - args, - Elements(args), - fetch_texel ); - - /* If all four channels are enabled, could use a pointer to - * dst[0].x instead of TEMP_R0 for store? - */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) { - - sse_movaps( - func, - make_xmm( 0 ), - get_temp( TEMP_R0, i ) ); - - STORE( func, *inst, 0, 0, i ); - } -} - - -static void -emit_kil( - struct x86_function *func, - const struct tgsi_full_src_register *reg ) -{ - unsigned uniquemask; - unsigned unique_count = 0; - unsigned chan_index; - unsigned i; - - /* This mask stores component bits that were already tested. Note that - * we test if the value is less than zero, so 1.0 and 0.0 need not to be - * tested. - */ - uniquemask = 0; - - FOR_EACH_CHANNEL( chan_index ) { - unsigned swizzle; - - /* unswizzle channel */ - swizzle = tgsi_util_get_full_src_register_swizzle( - reg, - chan_index ); - - /* check if the component has not been already tested */ - if( !(uniquemask & (1 << swizzle)) ) { - uniquemask |= 1 << swizzle; - - /* allocate register */ - emit_fetch( - func, - unique_count++, - reg, - chan_index ); - } - } - - x86_push( - func, - x86_make_reg( file_REG32, reg_AX ) ); - x86_push( - func, - x86_make_reg( file_REG32, reg_DX ) ); - - for (i = 0 ; i < unique_count; i++ ) { - struct x86_reg dataXMM = make_xmm(i); - - sse_cmpps( - func, - dataXMM, - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ), - cc_LessThan ); - - if( i == 0 ) { - sse_movmskps( - func, - x86_make_reg( file_REG32, reg_AX ), - dataXMM ); - } - else { - sse_movmskps( - func, - x86_make_reg( file_REG32, reg_DX ), - dataXMM ); - x86_or( - func, - x86_make_reg( file_REG32, reg_AX ), - x86_make_reg( file_REG32, reg_DX ) ); - } - } - - x86_or( - func, - get_temp( - TGSI_EXEC_TEMP_KILMASK_I, - TGSI_EXEC_TEMP_KILMASK_C ), - x86_make_reg( file_REG32, reg_AX ) ); - - x86_pop( - func, - x86_make_reg( file_REG32, reg_DX ) ); - x86_pop( - func, - x86_make_reg( file_REG32, reg_AX ) ); -} - - -static void -emit_kilp( - struct x86_function *func ) -{ - /* XXX todo / fix me */ -} - - -static void -emit_setcc( - struct x86_function *func, - struct tgsi_full_instruction *inst, - enum sse_cc cc ) -{ - unsigned chan_index; - - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - sse_cmpps( - func, - make_xmm( 0 ), - make_xmm( 1 ), - cc ); - sse_andps( - func, - make_xmm( 0 ), - get_temp( - TEMP_ONE_I, - TEMP_ONE_C ) ); - STORE( func, *inst, 0, 0, chan_index ); - } -} - -static void -emit_cmp( - struct x86_function *func, - struct tgsi_full_instruction *inst ) -{ - unsigned chan_index; - - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - FETCH( func, *inst, 2, 2, chan_index ); - sse_cmpps( - func, - make_xmm( 0 ), - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ), - cc_LessThan ); - sse_andps( - func, - make_xmm( 1 ), - make_xmm( 0 ) ); - sse_andnps( - func, - make_xmm( 0 ), - make_xmm( 2 ) ); - sse_orps( - func, - make_xmm( 0 ), - make_xmm( 1 ) ); - STORE( func, *inst, 0, 0, chan_index ); - } -} - - -/** - * Check if inst src/dest regs use indirect addressing into temporary, - * input or output register files. - */ -static boolean -indirect_reg_reference(const struct tgsi_full_instruction *inst) -{ - uint i; - for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { - const struct tgsi_full_src_register *reg = &inst->Src[i]; - if ((reg->Register.File == TGSI_FILE_TEMPORARY || - reg->Register.File == TGSI_FILE_INPUT || - reg->Register.File == TGSI_FILE_OUTPUT) && - reg->Register.Indirect) - return TRUE; - } - for (i = 0; i < inst->Instruction.NumDstRegs; i++) { - const struct tgsi_full_dst_register *reg = &inst->Dst[i]; - if ((reg->Register.File == TGSI_FILE_TEMPORARY || - reg->Register.File == TGSI_FILE_INPUT || - reg->Register.File == TGSI_FILE_OUTPUT) && - reg->Register.Indirect) - return TRUE; - } - return FALSE; -} - - -static int -emit_instruction( - struct x86_function *func, - struct tgsi_full_instruction *inst ) -{ - unsigned chan_index; - - /* we can't handle indirect addressing into temp register file yet */ - if (indirect_reg_reference(inst)) - return FALSE; - - switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_ARL: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_flr(func, 0, 0); - emit_f2it( func, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_MOV: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 4 + chan_index, 0, chan_index ); - } - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 4 + chan_index, 0, chan_index ); - } - break; - - case TGSI_OPCODE_LIT: - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) { - emit_tempf( - func, - 0, - TEMP_ONE_I, - TEMP_ONE_C); - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) { - STORE( func, *inst, 0, 0, CHAN_X ); - } - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) { - STORE( func, *inst, 0, 0, CHAN_W ); - } - } - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { - FETCH( func, *inst, 0, 0, CHAN_X ); - sse_maxps( - func, - make_xmm( 0 ), - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ) ); - STORE( func, *inst, 0, 0, CHAN_Y ); - } - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { - /* XMM[1] = SrcReg[0].yyyy */ - FETCH( func, *inst, 1, 0, CHAN_Y ); - /* XMM[1] = max(XMM[1], 0) */ - sse_maxps( - func, - make_xmm( 1 ), - get_temp( - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ) ); - /* XMM[2] = SrcReg[0].wwww */ - FETCH( func, *inst, 2, 0, CHAN_W ); - /* XMM[2] = min(XMM[2], 128.0) */ - sse_minps( - func, - make_xmm( 2 ), - get_temp( - TGSI_EXEC_TEMP_128_I, - TGSI_EXEC_TEMP_128_C ) ); - /* XMM[2] = max(XMM[2], -128.0) */ - sse_maxps( - func, - make_xmm( 2 ), - get_temp( - TGSI_EXEC_TEMP_MINUS_128_I, - TGSI_EXEC_TEMP_MINUS_128_C ) ); - emit_pow( func, 3, 1, 1, 2 ); - FETCH( func, *inst, 0, 0, CHAN_X ); - sse_xorps( - func, - make_xmm( 2 ), - make_xmm( 2 ) ); - sse_cmpps( - func, - make_xmm( 2 ), - make_xmm( 0 ), - cc_LessThan ); - sse_andps( - func, - make_xmm( 2 ), - make_xmm( 1 ) ); - STORE( func, *inst, 2, 0, CHAN_Z ); - } - } - break; - - case TGSI_OPCODE_RCP: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_rcp( func, 0, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_RSQ: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_abs( func, 0 ); - emit_rsqrt( func, 1, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 1, 0, chan_index ); - } - break; - - case TGSI_OPCODE_EXP: - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { - FETCH( func, *inst, 0, 0, CHAN_X ); - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_MOV( func, 1, 0 ); - emit_flr( func, 2, 1 ); - /* dst.x = ex2(floor(src.x)) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) { - emit_MOV( func, 2, 1 ); - emit_ex2( func, 3, 2 ); - STORE( func, *inst, 2, 0, CHAN_X ); - } - /* dst.y = src.x - floor(src.x) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_MOV( func, 2, 0 ); - emit_sub( func, 2, 1 ); - STORE( func, *inst, 2, 0, CHAN_Y ); - } - } - /* dst.z = ex2(src.x) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { - emit_ex2( func, 3, 0 ); - STORE( func, *inst, 0, 0, CHAN_Z ); - } - } - /* dst.w = 1.0 */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) { - emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C ); - STORE( func, *inst, 0, 0, CHAN_W ); - } - break; - - case TGSI_OPCODE_LOG: - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_abs( func, 0 ); - emit_MOV( func, 1, 0 ); - emit_lg2( func, 2, 1 ); - /* dst.z = lg2(abs(src.x)) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { - STORE( func, *inst, 1, 0, CHAN_Z ); - } - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_flr( func, 2, 1 ); - /* dst.x = floor(lg2(abs(src.x))) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) { - STORE( func, *inst, 1, 0, CHAN_X ); - } - /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_ex2( func, 2, 1 ); - emit_rcp( func, 1, 1 ); - emit_mul( func, 0, 1 ); - STORE( func, *inst, 0, 0, CHAN_Y ); - } - } - } - /* dst.w = 1.0 */ - if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) { - emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C ); - STORE( func, *inst, 0, 0, CHAN_W ); - } - break; - - case TGSI_OPCODE_MUL: - /* do all fetches and adds, storing results in temp regs */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - int r = chan_index + 1; - FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */ - FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */ - emit_mul( func, r, 0 ); /* xmm[r] = xmm[r] * xmm[0] */ - } - /* do all stores of the temp regs */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - int r = chan_index + 1; - STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */ - } - break; - - case TGSI_OPCODE_ADD: - /* do all fetches and adds, storing results in temp regs */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - int r = chan_index + 1; - FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */ - FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */ - emit_add( func, r, 0 ); /* xmm[r] = xmm[r] + xmm[0] */ - } - /* do all stores of the temp regs */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - int r = chan_index + 1; - STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */ - } - break; - - case TGSI_OPCODE_DP3: - FETCH( func, *inst, 0, 0, CHAN_X ); - FETCH( func, *inst, 1, 1, CHAN_X ); - emit_mul( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Y ); - FETCH( func, *inst, 2, 1, CHAN_Y ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Z ); - FETCH( func, *inst, 2, 1, CHAN_Z ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_DP4: - FETCH( func, *inst, 0, 0, CHAN_X ); - FETCH( func, *inst, 1, 1, CHAN_X ); - emit_mul( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Y ); - FETCH( func, *inst, 2, 1, CHAN_Y ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Z ); - FETCH( func, *inst, 2, 1, CHAN_Z ); - emit_mul(func, 1, 2 ); - emit_add(func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_W ); - FETCH( func, *inst, 2, 1, CHAN_W ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_DST: - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { - emit_tempf( - func, - 0, - TEMP_ONE_I, - TEMP_ONE_C ); - STORE( func, *inst, 0, 0, CHAN_X ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { - FETCH( func, *inst, 0, 0, CHAN_Y ); - FETCH( func, *inst, 1, 1, CHAN_Y ); - emit_mul( func, 0, 1 ); - STORE( func, *inst, 0, 0, CHAN_Y ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { - FETCH( func, *inst, 0, 0, CHAN_Z ); - STORE( func, *inst, 0, 0, CHAN_Z ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { - FETCH( func, *inst, 0, 1, CHAN_W ); - STORE( func, *inst, 0, 0, CHAN_W ); - } - break; - - case TGSI_OPCODE_MIN: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - sse_minps( - func, - make_xmm( 0 ), - make_xmm( 1 ) ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_MAX: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - sse_maxps( - func, - make_xmm( 0 ), - make_xmm( 1 ) ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_SLT: - emit_setcc( func, inst, cc_LessThan ); - break; - - case TGSI_OPCODE_SGE: - emit_setcc( func, inst, cc_NotLessThan ); - break; - - case TGSI_OPCODE_MAD: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - FETCH( func, *inst, 2, 2, chan_index ); - emit_mul( func, 0, 1 ); - emit_add( func, 0, 2 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_SUB: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - emit_sub( func, 0, 1 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_LRP: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - FETCH( func, *inst, 2, 2, chan_index ); - emit_sub( func, 1, 2 ); - emit_mul( func, 0, 1 ); - emit_add( func, 0, 2 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_CND: - return 0; - break; - - case TGSI_OPCODE_DP2A: - FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */ - FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */ - emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */ - FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */ - FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */ - emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */ - emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ - FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */ - emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */ - } - break; - - case TGSI_OPCODE_FRC: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_frc( func, 0, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_CLAMP: - return 0; - break; - - case TGSI_OPCODE_FLR: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_flr( func, 0, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_ROUND: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_rnd( func, 0, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_EX2: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_ex2( func, 0, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_LG2: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_lg2( func, 0, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_POW: - FETCH( func, *inst, 0, 0, CHAN_X ); - FETCH( func, *inst, 1, 1, CHAN_X ); - emit_pow( func, 0, 0, 0, 1 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_XPD: - /* Note: we do all stores after all operands have been fetched - * to avoid src/dst register aliasing issues for an instruction - * such as: XPD TEMP[2].xyz, TEMP[0], TEMP[2]; - */ - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { - FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */ - FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */ - } - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { - FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */ - FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { - emit_MOV( func, 7, 0 ); /* xmm[7] = xmm[0] */ - emit_mul( func, 7, 1 ); /* xmm[7] = xmm[2] * xmm[1] */ - emit_MOV( func, 5, 3 ); /* xmm[5] = xmm[3] */ - emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */ - emit_sub( func, 7, 5 ); /* xmm[7] = xmm[2] - xmm[5] */ - /* store xmm[7] in dst.x below */ - } - if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || - IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { - FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */ - FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { - emit_mul( func, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */ - emit_mul( func, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */ - emit_sub( func, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */ - /* store xmm[3] in dst.y below */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { - emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */ - emit_mul( func, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */ - emit_sub( func, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */ - STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { - STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { - STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */ - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { - emit_tempf( - func, - 0, - TEMP_ONE_I, - TEMP_ONE_C ); - STORE( func, *inst, 0, 0, CHAN_W ); - } - break; - - case TGSI_OPCODE_ABS: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_abs( func, 0) ; - - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_RCC: - return 0; - break; - - case TGSI_OPCODE_DPH: - FETCH( func, *inst, 0, 0, CHAN_X ); - FETCH( func, *inst, 1, 1, CHAN_X ); - emit_mul( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Y ); - FETCH( func, *inst, 2, 1, CHAN_Y ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FETCH( func, *inst, 1, 0, CHAN_Z ); - FETCH( func, *inst, 2, 1, CHAN_Z ); - emit_mul( func, 1, 2 ); - emit_add( func, 0, 1 ); - FETCH( func, *inst, 1, 1, CHAN_W ); - emit_add( func, 0, 1 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_COS: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_cos( func, 0, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_DDX: - return 0; - break; - - case TGSI_OPCODE_DDY: - return 0; - break; - - case TGSI_OPCODE_KILP: - /* predicated kill */ - emit_kilp( func ); - return 0; /* XXX fix me */ - break; - - case TGSI_OPCODE_KIL: - /* conditional kill */ - emit_kil( func, &inst->Src[0] ); - break; - - case TGSI_OPCODE_PK2H: - return 0; - break; - - case TGSI_OPCODE_PK2US: - return 0; - break; - - case TGSI_OPCODE_PK4B: - return 0; - break; - - case TGSI_OPCODE_PK4UB: - return 0; - break; - - case TGSI_OPCODE_RFL: - return 0; - break; - - case TGSI_OPCODE_SEQ: - emit_setcc( func, inst, cc_Equal ); - break; - - case TGSI_OPCODE_SFL: - return 0; - break; - - case TGSI_OPCODE_SGT: - emit_setcc( func, inst, cc_NotLessThanEqual ); - break; - - case TGSI_OPCODE_SIN: - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_sin( func, 0, 0 ); - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_SLE: - emit_setcc( func, inst, cc_LessThanEqual ); - break; - - case TGSI_OPCODE_SNE: - emit_setcc( func, inst, cc_NotEqual ); - break; - - case TGSI_OPCODE_STR: - return 0; - break; - - case TGSI_OPCODE_TEX: - emit_tex( func, inst, FALSE, FALSE ); - break; - - case TGSI_OPCODE_TXD: - return 0; - break; - - case TGSI_OPCODE_UP2H: - return 0; - break; - - case TGSI_OPCODE_UP2US: - return 0; - break; - - case TGSI_OPCODE_UP4B: - return 0; - break; - - case TGSI_OPCODE_UP4UB: - return 0; - break; - - case TGSI_OPCODE_X2D: - return 0; - break; - - case TGSI_OPCODE_ARA: - return 0; - break; - - case TGSI_OPCODE_ARR: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_rnd( func, 0, 0 ); - emit_f2it( func, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_BRA: - return 0; - break; - - case TGSI_OPCODE_CAL: - return 0; - break; - - case TGSI_OPCODE_RET: - emit_ret( func ); - break; - - case TGSI_OPCODE_END: - break; - - case TGSI_OPCODE_SSG: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_sgn( func, 0, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_CMP: - emit_cmp (func, inst); - break; - - case TGSI_OPCODE_SCS: - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_cos( func, 0, 0 ); - STORE( func, *inst, 0, 0, CHAN_X ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { - FETCH( func, *inst, 0, 0, CHAN_X ); - emit_sin( func, 0, 0 ); - STORE( func, *inst, 0, 0, CHAN_Y ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { - emit_tempf( - func, - 0, - TGSI_EXEC_TEMP_00000000_I, - TGSI_EXEC_TEMP_00000000_C ); - STORE( func, *inst, 0, 0, CHAN_Z ); - } - IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { - emit_tempf( - func, - 0, - TEMP_ONE_I, - TEMP_ONE_C ); - STORE( func, *inst, 0, 0, CHAN_W ); - } - break; - - case TGSI_OPCODE_TXB: - emit_tex( func, inst, TRUE, FALSE ); - break; - - case TGSI_OPCODE_NRM: - /* fall-through */ - case TGSI_OPCODE_NRM4: - /* 3 or 4-component normalization */ - { - uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4; - - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) || - IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) || - IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) || - (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) { - - /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */ - - /* xmm4 = src.x */ - /* xmm0 = src.x * src.x */ - FETCH(func, *inst, 0, 0, CHAN_X); - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { - emit_MOV(func, 4, 0); - } - emit_mul(func, 0, 0); - - /* xmm5 = src.y */ - /* xmm0 = xmm0 + src.y * src.y */ - FETCH(func, *inst, 1, 0, CHAN_Y); - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { - emit_MOV(func, 5, 1); - } - emit_mul(func, 1, 1); - emit_add(func, 0, 1); - - /* xmm6 = src.z */ - /* xmm0 = xmm0 + src.z * src.z */ - FETCH(func, *inst, 1, 0, CHAN_Z); - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { - emit_MOV(func, 6, 1); - } - emit_mul(func, 1, 1); - emit_add(func, 0, 1); - - if (dims == 4) { - /* xmm7 = src.w */ - /* xmm0 = xmm0 + src.w * src.w */ - FETCH(func, *inst, 1, 0, CHAN_W); - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { - emit_MOV(func, 7, 1); - } - emit_mul(func, 1, 1); - emit_add(func, 0, 1); - } - - /* xmm1 = 1 / sqrt(xmm0) */ - emit_rsqrt(func, 1, 0); - - /* dst.x = xmm1 * src.x */ - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { - emit_mul(func, 4, 1); - STORE(func, *inst, 4, 0, CHAN_X); - } - - /* dst.y = xmm1 * src.y */ - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { - emit_mul(func, 5, 1); - STORE(func, *inst, 5, 0, CHAN_Y); - } - - /* dst.z = xmm1 * src.z */ - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { - emit_mul(func, 6, 1); - STORE(func, *inst, 6, 0, CHAN_Z); - } - - /* dst.w = xmm1 * src.w */ - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) { - emit_mul(func, 7, 1); - STORE(func, *inst, 7, 0, CHAN_W); - } - } - - /* dst0.w = 1.0 */ - if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) { - emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C); - STORE(func, *inst, 0, 0, CHAN_W); - } - } - break; - - case TGSI_OPCODE_DIV: - return 0; - break; - - case TGSI_OPCODE_DP2: - FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */ - FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */ - emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */ - FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */ - FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */ - emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */ - emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */ - } - break; - - case TGSI_OPCODE_TXL: - return 0; - break; - - case TGSI_OPCODE_TXP: - emit_tex( func, inst, FALSE, TRUE ); - break; - - case TGSI_OPCODE_BRK: - return 0; - break; - - case TGSI_OPCODE_IF: - return 0; - break; - - case TGSI_OPCODE_ELSE: - return 0; - break; - - case TGSI_OPCODE_ENDIF: - return 0; - break; - - case TGSI_OPCODE_PUSHA: - return 0; - break; - - case TGSI_OPCODE_POPA: - return 0; - break; - - case TGSI_OPCODE_CEIL: - return 0; - break; - - case TGSI_OPCODE_I2F: - return 0; - break; - - case TGSI_OPCODE_NOT: - return 0; - break; - - case TGSI_OPCODE_TRUNC: - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - emit_f2it( func, 0 ); - emit_i2f( func, 0 ); - STORE( func, *inst, 0, 0, chan_index ); - } - break; - - case TGSI_OPCODE_SHL: - return 0; - break; - - case TGSI_OPCODE_ISHR: - return 0; - break; - - case TGSI_OPCODE_AND: - return 0; - break; - - case TGSI_OPCODE_OR: - return 0; - break; - - case TGSI_OPCODE_MOD: - return 0; - break; - - case TGSI_OPCODE_XOR: - return 0; - break; - - case TGSI_OPCODE_SAD: - return 0; - break; - - case TGSI_OPCODE_TXF: - return 0; - break; - - case TGSI_OPCODE_TXQ: - return 0; - break; - - case TGSI_OPCODE_CONT: - return 0; - break; - - case TGSI_OPCODE_EMIT: - return 0; - break; - - case TGSI_OPCODE_ENDPRIM: - return 0; - break; - - default: - return 0; - } - - return 1; -} - -static void -emit_declaration( - struct x86_function *func, - struct tgsi_full_declaration *decl ) -{ - if( decl->Declaration.File == TGSI_FILE_INPUT ) { - unsigned first, last, mask; - unsigned i, j; - - first = decl->Range.First; - last = decl->Range.Last; - mask = decl->Declaration.UsageMask; - - for( i = first; i <= last; i++ ) { - for( j = 0; j < NUM_CHANNELS; j++ ) { - if( mask & (1 << j) ) { - switch( decl->Declaration.Interpolate ) { - case TGSI_INTERPOLATE_CONSTANT: - emit_coef_a0( func, 0, i, j ); - emit_inputs( func, 0, i, j ); - break; - - case TGSI_INTERPOLATE_LINEAR: - emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); - emit_coef_dadx( func, 1, i, j ); - emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); - emit_coef_dady( func, 3, i, j ); - emit_mul( func, 0, 1 ); /* x * dadx */ - emit_coef_a0( func, 4, i, j ); - emit_mul( func, 2, 3 ); /* y * dady */ - emit_add( func, 0, 4 ); /* x * dadx + a0 */ - emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ - emit_inputs( func, 0, i, j ); - break; - - case TGSI_INTERPOLATE_PERSPECTIVE: - emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); - emit_coef_dadx( func, 1, i, j ); - emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); - emit_coef_dady( func, 3, i, j ); - emit_mul( func, 0, 1 ); /* x * dadx */ - emit_tempf( func, 4, 0, TGSI_SWIZZLE_W ); - emit_coef_a0( func, 5, i, j ); - emit_rcp( func, 4, 4 ); /* 1.0 / w */ - emit_mul( func, 2, 3 ); /* y * dady */ - emit_add( func, 0, 5 ); /* x * dadx + a0 */ - emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ - emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */ - emit_inputs( func, 0, i, j ); - break; - - default: - assert( 0 ); - break; - } - } - } - } - } -} - -static void aos_to_soa( struct x86_function *func, - uint arg_aos, - uint arg_machine, - uint arg_num, - uint arg_stride ) -{ - struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX ); - struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX ); - struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX ); - struct x86_reg stride = x86_make_reg( file_REG32, reg_DX ); - int loop_top, loop_exit_fixup; - - /* Save EBX */ - x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); - - x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) ); - x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) ); - /* FIXME: tgsi_exec_machine::Inputs is a pointer now! */ - x86_lea( func, soa_input, - x86_make_disp( soa_input, - Offset(struct tgsi_exec_machine, Inputs) ) ); - x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) ); - x86_mov( func, stride, x86_fn_arg( func, arg_stride ) ); - - /* while (num_inputs != 0) */ - loop_top = x86_get_label( func ); - x86_cmp_imm( func, num_inputs, 0 ); - loop_exit_fixup = x86_jcc_forward( func, cc_E ); - - { - x86_push( func, aos_input ); - sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) ); - sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) ); - x86_add( func, aos_input, stride ); - sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) ); - sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) ); - x86_add( func, aos_input, stride ); - sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) ); - sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) ); - x86_add( func, aos_input, stride ); - sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) ); - sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) ); - x86_pop( func, aos_input ); - - sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) ); - sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) ); - sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 ); - sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd ); - sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 ); - sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd ); - - sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) ); - sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) ); - sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) ); - sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) ); - - /* Advance to next input */ - x86_lea( func, aos_input, x86_make_disp(aos_input, 16) ); - x86_lea( func, soa_input, x86_make_disp(soa_input, 64) ); - } - /* --num_inputs */ - x86_dec( func, num_inputs ); - x86_jmp( func, loop_top ); - x86_fixup_fwd_jump( func, loop_exit_fixup ); - - /* Restore EBX */ - x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); -} - -static void soa_to_aos( struct x86_function *func, - uint arg_aos, - uint arg_machine, - uint arg_num, - uint arg_stride ) -{ - struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX ); - struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX ); - struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX ); - struct x86_reg temp = x86_make_reg( file_REG32, reg_DX ); - int inner_loop; - - /* Save EBX */ - x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); - - x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) ); - x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) ); - /* FIXME: tgsi_exec_machine::Ouputs is a pointer now! */ - x86_lea( func, soa_output, - x86_make_disp( soa_output, - Offset(struct tgsi_exec_machine, Outputs) ) ); - x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) ); - - /* do */ - inner_loop = x86_get_label( func ); - { - sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) ); - sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) ); - sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) ); - sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) ); - - sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) ); - sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) ); - sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) ); - sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) ); - sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) ); - sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) ); - - x86_mov( func, temp, x86_fn_arg( func, arg_stride ) ); - x86_push( func, aos_output ); - sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) ); - sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); - x86_add( func, aos_output, temp ); - sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) ); - sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); - x86_add( func, aos_output, temp ); - sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) ); - sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) ); - x86_add( func, aos_output, temp ); - sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) ); - sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) ); - x86_pop( func, aos_output ); - - /* Advance to next output */ - x86_lea( func, aos_output, x86_make_disp(aos_output, 16) ); - x86_lea( func, soa_output, x86_make_disp(soa_output, 64) ); - } - /* while --num_outputs */ - x86_dec( func, num_outputs ); - x86_jcc( func, cc_NE, inner_loop ); - - /* Restore EBX */ - x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); -} - - -/** - * Check if the instructions dst register is the same as any src - * register and warn if there's a posible SOA dependency. - */ -static boolean -check_soa_dependencies(const struct tgsi_full_instruction *inst) -{ - uint opcode = inst->Instruction.Opcode; - - /* XXX: we only handle src/dst aliasing in a few opcodes currently. - * Need to use an additional temporay to hold the result in the - * cases where the code is too opaque to fix. - */ - - switch (opcode) { - case TGSI_OPCODE_ADD: - case TGSI_OPCODE_MOV: - case TGSI_OPCODE_MUL: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: - case TGSI_OPCODE_EXP: - case TGSI_OPCODE_LOG: - case TGSI_OPCODE_DP3: - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_DP2A: - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_XPD: - case TGSI_OPCODE_DPH: - case TGSI_OPCODE_COS: - case TGSI_OPCODE_SIN: - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXP: - case TGSI_OPCODE_NRM: - case TGSI_OPCODE_NRM4: - case TGSI_OPCODE_DP2: - /* OK - these opcodes correctly handle SOA dependencies */ - return TRUE; - default: - if (!tgsi_check_soa_dependencies(inst)) - return TRUE; - - debug_printf("Warning: src/dst aliasing in instruction" - " is not handled:\n"); - debug_printf("Warning: "); - tgsi_dump_instruction(inst, 1); - - return FALSE; - } -} - - -/** - * Translate a TGSI vertex/fragment shader to SSE2 code. - * Slightly different things are done for vertex vs. fragment shaders. - * - * \param tokens the TGSI input shader - * \param func the output SSE code/function - * \param immediates buffer to place immediates, later passed to SSE func - * \param return 1 for success, 0 if translation failed - */ -unsigned -tgsi_emit_sse2( - const struct tgsi_token *tokens, - struct x86_function *func, - float (*immediates)[4], - boolean do_swizzles ) -{ - struct tgsi_parse_context parse; - unsigned ok = 1; - uint num_immediates = 0; - - util_init_math(); - - func->csr = func->store; - - tgsi_parse_init( &parse, tokens ); - - /* Can't just use EDI, EBX without save/restoring them: - */ - x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); - x86_push( func, x86_make_reg( file_REG32, reg_DI ) ); - - /* - * Different function args for vertex/fragment shaders: - */ - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) { - if (do_swizzles) - aos_to_soa( func, - 4, /* aos_input */ - 1, /* machine */ - 5, /* num_inputs */ - 6 ); /* input_stride */ - } - - x86_mov( - func, - get_machine_base(), - x86_fn_arg( func, 1 ) ); - x86_mov( - func, - get_const_base(), - x86_fn_arg( func, 2 ) ); - x86_mov( - func, - get_immediate_base(), - x86_fn_arg( func, 3 ) ); - - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { - x86_mov( - func, - get_coef_base(), - x86_fn_arg( func, 4 ) ); - } - - x86_mov( - func, - get_sampler_base(), - x86_make_disp( get_machine_base(), - Offset( struct tgsi_exec_machine, Samplers ) ) ); - - while( !tgsi_parse_end_of_tokens( &parse ) && ok ) { - tgsi_parse_token( &parse ); - - switch( parse.FullToken.Token.Type ) { - case TGSI_TOKEN_TYPE_DECLARATION: - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { - emit_declaration( - func, - &parse.FullToken.FullDeclaration ); - } - break; - - case TGSI_TOKEN_TYPE_INSTRUCTION: - ok = emit_instruction( - func, - &parse.FullToken.FullInstruction ); - - if (!ok) { - uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode; - uint proc = parse.FullHeader.Processor.Processor; - debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n", - opcode, - tgsi_get_opcode_name(opcode), - tgsi_get_processor_name(proc)); - } - - if (ok) - ok = check_soa_dependencies(&parse.FullToken.FullInstruction); - break; - - case TGSI_TOKEN_TYPE_IMMEDIATE: - /* simply copy the immediate values into the next immediates[] slot */ - { - const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1; - uint i; - assert(size <= 4); - assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES); - for( i = 0; i < size; i++ ) { - immediates[num_immediates][i] = - parse.FullToken.FullImmediate.u[i].Float; - } -#if 0 - debug_printf("SSE FS immediate[%d] = %f %f %f %f\n", - num_immediates, - immediates[num_immediates][0], - immediates[num_immediates][1], - immediates[num_immediates][2], - immediates[num_immediates][3]); -#endif - num_immediates++; - } - break; - case TGSI_TOKEN_TYPE_PROPERTY: - /* we just ignore them for now */ - break; - - default: - ok = 0; - assert( 0 ); - } - } - - if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) { - if (do_swizzles) - soa_to_aos( func, - 7, /* aos_output */ - 1, /* machine */ - 8, /* num_outputs */ - 9 ); /* output_stride */ - } - - /* Can't just use EBX, EDI without save/restoring them: - */ - x86_pop( func, x86_make_reg( file_REG32, reg_DI ) ); - x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); - - emit_ret( func ); - - tgsi_parse_free( &parse ); - - return ok; -} - -#else /* !PIPE_ARCH_X86 */ - -unsigned -tgsi_emit_sse2( - const struct tgsi_token *tokens, - struct x86_function *func, - float (*immediates)[4], - boolean do_swizzles ) -{ - return 0; -} - -#endif /* !PIPE_ARCH_X86 */ |