summaryrefslogtreecommitdiffstats
path: root/src/gallium/auxiliary/tgsi
diff options
context:
space:
mode:
authorKeith Whitwell <[email protected]>2009-07-16 07:50:34 +0100
committerKeith Whitwell <[email protected]>2009-07-16 09:53:08 +0100
commitebc4a9bf2eff7d2c0d89785e865a1df23733e64b (patch)
treec56a007a7378cdf02f0c07c6d5c065f7bf469ad2 /src/gallium/auxiliary/tgsi
parent4e3002b50fcedf3a6db1ac7394077bc3337ccda1 (diff)
tgsi: reduce x86 reg usage in tgsi_sse generated programs
Pass the tgsi_exec_machine struct in directly and just hold a single pointer to this struct, rather than keeping one for each of its internal members.
Diffstat (limited to 'src/gallium/auxiliary/tgsi')
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_sse2.c190
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_sse2.h28
2 files changed, 105 insertions, 113 deletions
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 5084befc4ef..cfe8ef0ecf1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -32,6 +32,7 @@
#include "util/u_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
+#include "util/u_memory.h"
#if defined(PIPE_ARCH_SSE)
#include "util/u_sse.h"
#endif
@@ -104,7 +105,7 @@ get_const_base( void )
}
static struct x86_reg
-get_input_base( void )
+get_machine_base( void )
{
return x86_make_reg(
file_REG32,
@@ -112,25 +113,35 @@ get_input_base( void )
}
static struct x86_reg
+get_input_base( void )
+{
+ return x86_make_disp(
+ get_machine_base(),
+ Offset(struct tgsi_exec_machine, Inputs) );
+}
+
+static struct x86_reg
get_output_base( void )
{
- return x86_make_reg(
- file_REG32,
- reg_DX );
+ return x86_make_disp(
+ get_machine_base(),
+ Offset(struct tgsi_exec_machine, Outputs) );
}
static struct x86_reg
get_temp_base( void )
{
- return x86_make_reg(
- file_REG32,
- reg_BX );
+ return x86_make_disp(
+ get_machine_base(),
+ Offset(struct tgsi_exec_machine, Temps) );
}
static struct x86_reg
get_coef_base( void )
{
- return get_output_base();
+ return x86_make_reg(
+ file_REG32,
+ reg_BX );
}
static struct x86_reg
@@ -138,7 +149,7 @@ get_immediate_base( void )
{
return x86_make_reg(
file_REG32,
- reg_DI );
+ reg_DX );
}
@@ -2551,7 +2562,7 @@ emit_declaration(
static void aos_to_soa( struct x86_function *func,
uint arg_aos,
- uint arg_soa,
+ uint arg_machine,
uint arg_num,
uint arg_stride )
{
@@ -2566,7 +2577,10 @@ static void aos_to_soa( struct x86_function *func,
x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
- x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
+ x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
+ x86_lea( func, soa_input,
+ x86_make_disp( soa_input,
+ Offset(struct tgsi_exec_machine, Inputs) ) );
x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
@@ -2608,28 +2622,30 @@ static void aos_to_soa( struct x86_function *func,
x86_jcc( func, cc_NE, inner_loop );
/* Restore EBX */
- x86_pop( func, aos_input );
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
-static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+static void soa_to_aos( struct x86_function *func,
+ uint arg_aos,
+ uint arg_machine,
+ uint arg_num,
+ uint arg_stride )
{
- struct x86_reg soa_output;
- struct x86_reg aos_output;
- struct x86_reg num_outputs;
- struct x86_reg temp;
+ struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
+ struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
+ struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
+ struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
int inner_loop;
- soa_output = x86_make_reg( file_REG32, reg_AX );
- aos_output = x86_make_reg( file_REG32, reg_BX );
- num_outputs = x86_make_reg( file_REG32, reg_CX );
- temp = x86_make_reg( file_REG32, reg_DX );
-
/* Save EBX */
- x86_push( func, aos_output );
+ x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
- x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
- x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
- x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
+ x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
+ x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
+ x86_lea( func, soa_output,
+ x86_make_disp( soa_output,
+ Offset(struct tgsi_exec_machine, Outputs) ) );
+ x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
/* do */
inner_loop = x86_get_label( func );
@@ -2646,7 +2662,7 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
- x86_mov( func, temp, x86_fn_arg( func, stride ) );
+ x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
x86_push( func, aos_output );
sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
@@ -2670,20 +2686,13 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
x86_jcc( func, cc_NE, inner_loop );
/* Restore EBX */
- x86_pop( func, aos_output );
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
/**
* Translate a TGSI vertex/fragment shader to SSE2 code.
* Slightly different things are done for vertex vs. fragment shaders.
*
- * Note that fragment shaders are responsible for interpolating shader
- * inputs. Because on x86 we have only 4 GP registers, and here we
- * have 5 shader arguments (input, output, const, temp and coef), the
- * code is split into two phases -- DECLARATION and INSTRUCTION phase.
- * GP register holding the output argument is aliased with the coeff
- * argument, as outputs are not needed in the DECLARATION phase.
- *
* \param tokens the TGSI input shader
* \param func the output SSE code/function
* \param immediates buffer to place immediates, later passed to SSE func
@@ -2697,7 +2706,6 @@ tgsi_emit_sse2(
boolean do_swizzles )
{
struct tgsi_parse_context parse;
- boolean instruction_phase = FALSE;
unsigned ok = 1;
uint num_immediates = 0;
@@ -2709,74 +2717,42 @@ tgsi_emit_sse2(
/* Can't just use EDI, EBX without save/restoring them:
*/
- x86_push(
- func,
- get_immediate_base() );
-
- x86_push(
- func,
- get_temp_base() );
-
+ x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+ x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
/*
* Different function args for vertex/fragment shaders:
*/
- if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
- /* DECLARATION phase, do not load output argument. */
- x86_mov(
- func,
- get_input_base(),
- x86_fn_arg( func, 1 ) );
- /* skipping outputs argument here */
- x86_mov(
- func,
- get_const_base(),
- x86_fn_arg( func, 3 ) );
- x86_mov(
- func,
- get_temp_base(),
- x86_fn_arg( func, 4 ) );
- x86_mov(
- func,
- get_coef_base(),
- x86_fn_arg( func, 5 ) );
- x86_mov(
- func,
- get_immediate_base(),
- x86_fn_arg( func, 6 ) );
- }
- else {
- assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
-
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
if (do_swizzles)
aos_to_soa( func,
- 6, /* aos_input */
- 1, /* machine->input */
- 7, /* num_inputs */
- 8 ); /* input_stride */
+ 4, /* aos_input */
+ 1, /* machine */
+ 5, /* num_inputs */
+ 6 ); /* input_stride */
+ }
+ x86_mov(
+ func,
+ get_machine_base(),
+ x86_fn_arg( func, 1 ) );
+ x86_mov(
+ func,
+ get_const_base(),
+ x86_fn_arg( func, 2 ) );
+ x86_mov(
+ func,
+ get_immediate_base(),
+ x86_fn_arg( func, 3 ) );
+
+ if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
x86_mov(
- func,
- get_input_base(),
- x86_fn_arg( func, 1 ) );
- x86_mov(
- func,
- get_output_base(),
- x86_fn_arg( func, 2 ) );
- x86_mov(
- func,
- get_const_base(),
- x86_fn_arg( func, 3 ) );
- x86_mov(
- func,
- get_temp_base(),
- x86_fn_arg( func, 4 ) );
- x86_mov(
- func,
- get_immediate_base(),
- x86_fn_arg( func, 5 ) );
+ func,
+ get_coef_base(),
+ x86_fn_arg( func, 4 ) );
}
+
while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
tgsi_parse_token( &parse );
@@ -2790,17 +2766,6 @@ tgsi_emit_sse2(
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
- if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
- if( !instruction_phase ) {
- /* INSTRUCTION phase, overwrite coeff with output. */
- instruction_phase = TRUE;
- x86_mov(
- func,
- get_output_base(),
- x86_fn_arg( func, 2 ) );
- }
- }
-
ok = emit_instruction(
func,
&parse.FullToken.FullInstruction );
@@ -2844,18 +2809,17 @@ tgsi_emit_sse2(
if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
if (do_swizzles)
- soa_to_aos( func, 9, 2, 10, 11 );
+ soa_to_aos( func,
+ 7, /* aos_output */
+ 1, /* machine */
+ 8, /* num_outputs */
+ 9 ); /* output_stride */
}
/* Can't just use EBX, EDI without save/restoring them:
*/
- x86_pop(
- func,
- get_temp_base() );
-
- x86_pop(
- func,
- get_immediate_base() );
+ x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
+ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
emit_ret( func );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
index af838b2a25b..d81ee3d00ec 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
@@ -34,6 +34,7 @@ extern "C" {
struct tgsi_token;
struct x86_function;
+struct tgsi_interp_coef;
unsigned
tgsi_emit_sse2(
@@ -42,6 +43,33 @@ tgsi_emit_sse2(
float (*immediates)[4],
boolean do_swizzles );
+
+/* This is the function prototype generated when do_swizzles is false
+ * -- effectively for fragment shaders.
+ */
+typedef void (PIPE_CDECL *tgsi_sse2_fs_function) (
+ struct tgsi_exec_machine *machine, /* 1 */
+ const float (*constant)[4], /* 2 */
+ const float (*immediate)[4], /* 3 */
+ const struct tgsi_interp_coef *coef /* 4 */
+ );
+
+
+/* This is the function prototype generated when do_swizzles is true
+ * -- effectively for vertex shaders.
+ */
+typedef void (PIPE_CDECL *tgsi_sse2_vs_func) (
+ struct tgsi_exec_machine *machine, /* 1 */
+ const float (*constant)[4], /* 2 */
+ const float (*immediate)[4], /* 3 */
+ const float (*aos_input)[4], /* 4 */
+ uint num_inputs, /* 5 */
+ uint input_stride, /* 6 */
+ float (*aos_output)[4], /* 7 */
+ uint num_outputs, /* 8 */
+ uint output_stride ); /* 9 */
+
+
#if defined __cplusplus
}
#endif