gallium: Generate SSE code to swizzle and unswizzle vertex shader inputs and outputs.

Set the SSE_SWIZZLES #define in draw_vs_sse.c to 0 to disable it.
author: Michal Krol <michal@ubuntu-vbox.(none)> 2008-04-28 18:50:27 +0200
committer: Michal Krol <michal@ubuntu-vbox.(none)> 2008-04-28 18:50:58 +0200
commit: 58d3dff0d3115ddd5397b7f77b5bcf4f9ca616b6 (patch)
tree: 03211346223ed2171d6269d0d6fccbb39bfa2230 /src/gallium/auxiliary
parent: 7f5e9d3f07f6fbfa699bef4ffff85fe0b557584a (diff)
6 files changed, 203 insertions, 14 deletions
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index f0763dad8d7..4ec20493c4c 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -109,9 +109,10 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
    struct draw_context *draw = fpme->draw;
    struct draw_vertex_shader *shader = draw->vertex_shader;
    unsigned opt = fpme->opt;
+   unsigned alloc_count = align_int( fetch_count, 4 );
 
    struct vertex_header *pipeline_verts = 
-      (struct vertex_header *)MALLOC(fpme->vertex_size * fetch_count);
+      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
 
    if (!pipeline_verts) {
       /* Not much we can do here - just skip the rendering.
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index b1e9f671147..07f85bc448f 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -47,14 +47,29 @@
 #include "tgsi/util/tgsi_parse.h"
 
 #define SSE_MAX_VERTICES 4
+#define SSE_SWIZZLES 1
 
+#if SSE_SWIZZLES
+typedef void (XSTDCALL *codegen_function) (
+   const struct tgsi_exec_vector *input,
+   struct tgsi_exec_vector *output,
+   float (*constant)[4],
+   struct tgsi_exec_vector *temporary,
+   float (*immediates)[4],
+   const float (*aos_input)[4],
+   uint num_inputs,
+   uint input_stride,
+   float (*aos_output)[4],
+   uint num_outputs,
+   uint output_stride );
+#else
 typedef void (XSTDCALL *codegen_function) (
    const struct tgsi_exec_vector *input,
    struct tgsi_exec_vector *output,
    float (*constant)[4],
    struct tgsi_exec_vector *temporary,
    float (*immediates)[4] );
-
+#endif
 
 struct draw_sse_vertex_shader {
    struct draw_vertex_shader base;
@@ -91,12 +106,31 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
 {
    struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
    struct tgsi_exec_machine *machine = shader->machine;
-   unsigned int i, j;
-   unsigned slot;
+   unsigned int i;
 
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
       unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
 
+#if SSE_SWIZZLES
+      /* run compiled shader
+       */
+      shader->func(machine->Inputs,
+		   machine->Outputs,
+		   (float (*)[4])constants,
+		   machine->Temps,
+		   shader->immediates,
+                   input,
+                   base->info.num_inputs,
+                   input_stride,
+                   output,
+                   base->info.num_outputs,
+                   output_stride );
+
+      input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
+      output = (float (*)[4])((char *)output + output_stride * max_vertices);
+#else
+      unsigned int j, slot;
+
       /* Swizzle inputs.  
        */
       for (j = 0; j < max_vertices; j++) {
@@ -105,10 +139,10 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
             machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
             machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
             machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
-         }
+         } 
 
 	 input = (const float (*)[4])((const char *)input + input_stride);
-      } 
+      }
 
       /* run compiled shader
        */
@@ -118,7 +152,6 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
 		   machine->Temps,
 		   shader->immediates);
 
-
       /* Unswizzle all output results.  
        */
       for (j = 0; j < max_vertices; j++) {
@@ -127,10 +160,11 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
             output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
             output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
             output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-         }
+         } 
 
 	 output = (float (*)[4])((char *)output + output_stride);
-      } 
+      }
+#endif
    }
 }
 
@@ -176,7 +210,7 @@ draw_create_vs_sse(struct draw_context *draw,
    x86_init_func( &vs->sse2_program );
 
    if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens,
-			&vs->sse2_program, vs->immediates )) 
+			&vs->sse2_program, vs->immediates, SSE_SWIZZLES )) 
       goto fail;
       
    vs->func = (codegen_function) x86_get_func( &vs->sse2_program );
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index e6cbe9967fa..d7e22305573 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -853,6 +853,20 @@ void sse_shufps( struct x86_function *p,
    emit_1ub(p, shuf); 
 }
 
+void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_2ub( p, X86_TWOB, 0x15 );
+   emit_modrm( p, dst, src );
+}
+
+void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_2ub( p, X86_TWOB, 0x14 );
+   emit_modrm( p, dst, src );
+}
+
 void sse_cmpps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src,
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 1962b07bc5b..ad79b1facf8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -203,6 +203,8 @@ void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src
 void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
                  unsigned char shuf );
+void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
 void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index 9061e00b635..86ca16c246b 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -1788,7 +1788,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_RET:
-   case TGSI_OPCODE_END:
 #ifdef WIN32
       emit_retw( func, 16 );
 #else
@@ -1796,6 +1795,9 @@ emit_instruction(
 #endif
       break;
 
+   case TGSI_OPCODE_END:
+      break;
+
    case TGSI_OPCODE_SSG:
       return 0;
       break;
@@ -2027,6 +2029,127 @@ emit_declaration(
    }
 }
 
+static void aos_to_soa( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{
+   struct x86_reg soa_input;
+   struct x86_reg aos_input;
+   struct x86_reg num_inputs;
+   struct x86_reg temp;
+   unsigned char *inner_loop;
+
+   soa_input = x86_make_reg( file_REG32, reg_AX );
+   aos_input = x86_make_reg( file_REG32, reg_BX );
+   num_inputs = x86_make_reg( file_REG32, reg_CX );
+   temp = x86_make_reg( file_REG32, reg_DX );
+
+   /* Save EBX */
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+   x86_mov( func, soa_input, get_argument( soa + 1 ) );
+   x86_mov( func, aos_input, get_argument( aos + 1 ) );
+   x86_mov( func, num_inputs, get_argument( num + 1 ) );
+
+   inner_loop = x86_get_label( func );
+
+   x86_mov( func, temp, get_argument( stride + 1 ) );
+   x86_push( func, aos_input );
+   sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+   sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+   x86_add( func, aos_input, temp );
+   sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+   sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+   x86_add( func, aos_input, temp );
+   sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+   sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+   x86_add( func, aos_input, temp );
+   sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+   sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+   x86_pop( func, aos_input );
+
+   sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+   sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+   sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
+   sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
+   sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
+   sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
+
+   sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
+   sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
+   sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
+   sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
+
+   /* Advance to next input */
+   x86_mov_reg_imm( func, temp, 16 );
+   x86_add( func, aos_input, temp );
+   x86_mov_reg_imm( func, temp, 64 );
+   x86_add( func, soa_input, temp );
+   x86_dec( func, num_inputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
+}
+
+static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{
+   struct x86_reg soa_output;
+   struct x86_reg aos_output;
+   struct x86_reg num_outputs;
+   struct x86_reg temp;
+   unsigned char *inner_loop;
+
+   soa_output = x86_make_reg( file_REG32, reg_AX );
+   aos_output = x86_make_reg( file_REG32, reg_BX );
+   num_outputs = x86_make_reg( file_REG32, reg_CX );
+   temp = x86_make_reg( file_REG32, reg_DX );
+
+   /* Save EBX */
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+   x86_mov( func, soa_output, get_argument( soa + 1 ) );
+   x86_mov( func, aos_output, get_argument( aos + 1 ) );
+   x86_mov( func, num_outputs, get_argument( num + 1 ) );
+
+   inner_loop = x86_get_label( func );
+
+   sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
+   sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
+   sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
+   sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
+
+   sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+   sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+   sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
+   sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
+   sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
+   sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
+
+   x86_mov( func, temp, get_argument( stride + 1 ) );
+   x86_push( func, aos_output );
+   sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+   sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+   x86_add( func, aos_output, temp );
+   sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+   sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+   x86_add( func, aos_output, temp );
+   sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+   sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+   x86_add( func, aos_output, temp );
+   sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+   sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+   x86_pop( func, aos_output );
+
+   /* Advance to next output */
+   x86_mov_reg_imm( func, temp, 16 );
+   x86_add( func, aos_output, temp );
+   x86_mov_reg_imm( func, temp, 64 );
+   x86_add( func, soa_output, temp );
+   x86_dec( func, num_outputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
+}
 
 /**
  * Translate a TGSI vertex/fragment shader to SSE2 code.
@@ -2048,7 +2171,8 @@ unsigned
 tgsi_emit_sse2(
    const struct tgsi_token *tokens,
    struct x86_function *func,
-   float (*immediates)[4])
+   float (*immediates)[4],
+   boolean do_swizzles )
 {
    struct tgsi_parse_context parse;
    boolean instruction_phase = FALSE;
@@ -2089,6 +2213,9 @@ tgsi_emit_sse2(
    else {
       assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
 
+      if (do_swizzles)
+         aos_to_soa( func, 5, 0, 6, 7 );
+
       x86_mov(
          func,
          get_input_base(),
@@ -2176,6 +2303,17 @@ tgsi_emit_sse2(
       }
    }
 
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
+      if (do_swizzles)
+         soa_to_aos( func, 8, 1, 9, 10 );
+   }
+
+#ifdef WIN32
+   emit_retw( func, 16 );
+#else
+   emit_ret( func );
+#endif
+
    tgsi_parse_free( &parse );
 
    return ok;
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
index 063287dc5e9..e66d1152836 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
@@ -12,8 +12,8 @@ unsigned
 tgsi_emit_sse2(
    const struct tgsi_token *tokens,
    struct x86_function *function,
-   float (*immediates)[4]
- );
+   float (*immediates)[4],
+   boolean do_swizzles );
 
 #if defined __cplusplus
 }
author	Michal Krol <michal@ubuntu-vbox.(none)>	2008-04-28 18:50:27 +0200
committer	Michal Krol <michal@ubuntu-vbox.(none)>	2008-04-28 18:50:58 +0200
commit	58d3dff0d3115ddd5397b7f77b5bcf4f9ca616b6 (patch)
tree	03211346223ed2171d6269d0d6fccbb39bfa2230 /src/gallium/auxiliary
parent	7f5e9d3f07f6fbfa699bef4ffff85fe0b557584a (diff)