From 18fd3b757166c1c63284dc08f6dfd9e2061770be Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Thu, 14 Feb 2008 16:53:05 -0800
Subject: Cell: pass pointers to stored memory values

Several routines use shuffle patterns that are stored in memory.  For
code gen, it is difficult to directly access the data segments, so the
routines now take a pointer to a global table of shuffle patterns
instead of embedding the patterns themselves.

This *should* be the last change to this file before switching over to
code gen.
---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 160 +++++++++++++++++------------
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |   3 +-
 2 files changed, 98 insertions(+), 65 deletions(-)

(limited to 'src/mesa/pipe')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 3bbf9b7be4f..45e3c26c001 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -59,8 +59,59 @@
 
 #define DRAW_DBG 0
 
+static const qword fetch_shuffle_data[] = {
+   /* [0] Shuffle used by CVT_64_FLOAT, both directly and rotated by 8 bytes.
+    * ([N] is the index the fetch routines use to reach each pattern.) */
+   {
+      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+   },
+
+   /* [1] Shuffle used by CVT_8_USCALED and CVT_8_SSCALED (and, through
+    * them, by CVT_8_UNORM and CVT_8_SNORM). */
+   {
+      0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80,
+      0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80,
+   },
+   
+   /* [2] Shuffle used by CVT_16_USCALED and CVT_16_SSCALED (and, through
+    * them, by CVT_16_UNORM and CVT_16_SNORM). */
+   {
+      0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80,
+      0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80,
+   },
+   
+   /* [3] High value shuffle used by trans4x4 (pattern bytes 0x00-0x07 and
+    * 0x10-0x17). */
+   {
+      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17
+   },
+
+   /* [4] Low value shuffle used by trans4x4 (pattern bytes 0x08-0x0F and
+    * 0x18-0x1F). */
+   {
+      0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B,
+      0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F
+   }
+};
+
+
+static INLINE void
+trans4x4(qword row0, qword row1, qword row2, qword row3, qword *out,
+         const qword *shuffle)
+{
+   qword t1 = si_shufb(row0, row2, shuffle[3]);
+   qword t2 = si_shufb(row0, row2, shuffle[4]);
+   qword t3 = si_shufb(row1, row3, shuffle[3]);
+   qword t4 = si_shufb(row1, row3, shuffle[4]);
+
+   out[0] = si_shufb(t1, t3, shuffle[3]);
+   out[1] = si_shufb(t1, t3, shuffle[4]);
+   out[2] = si_shufb(t2, t4, shuffle[3]);
+   out[3] = si_shufb(t2, t4, shuffle[4]);
+}
 
-static const vec_float4 defaults = { 0.0, 0.0, 0.0, 1.0 };
 
 /**
  * Fetch between 1 and 32 bytes from an unaligned address
@@ -100,140 +151,117 @@ fetch_unaligned(qword *dst, unsigned ea, unsigned size)
 }
 
 
-#define CVT_32_FLOAT(q)    (*(q))
+#define CVT_32_FLOAT(q, s)    (*(q))
 
 static INLINE qword
-CVT_64_FLOAT(const qword *qw)
+CVT_64_FLOAT(const qword *qw, const qword *shuffle)
 {
-   qword shuf_first = (qword) {
-      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-   };
-
    qword a = si_frds(qw[0]);
    qword b = si_frds(si_rotqbyi(qw[0], 8));
    qword c = si_frds(qw[1]);
    qword d = si_frds(si_rotqbyi(qw[1], 8));
 
-   qword ab = si_shufb(a, b, shuf_first);
-   qword cd = si_shufb(c, d, si_rotqbyi(shuf_first, 8));
+   qword ab = si_shufb(a, b, shuffle[0]);
+   qword cd = si_shufb(c, d, si_rotqbyi(shuffle[0], 8));
    
    return si_or(ab, cd);
 }
 
 
 static INLINE qword
-CVT_8_USCALED(const qword *qw)
+CVT_8_USCALED(const qword *qw, const qword *shuffle)
 {
-   qword shuffle = (qword) {
-      0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80,
-      0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80,
-   };
-
-   return si_cuflt(si_shufb(*qw, *qw, shuffle), 0);
+   return si_cuflt(si_shufb(*qw, *qw, shuffle[1]), 0);
 }
 
 
 static INLINE qword
-CVT_16_USCALED(const qword *qw)
+CVT_16_USCALED(const qword *qw, const qword *shuffle)
 {
-   qword shuffle = (qword) {
-      0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80,
-      0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80,
-   };
-
-   return si_cuflt(si_shufb(*qw, *qw, shuffle), 0);
+   return si_cuflt(si_shufb(*qw, *qw, shuffle[2]), 0);
 }
 
 
 static INLINE qword
-CVT_32_USCALED(const qword *qw)
+CVT_32_USCALED(const qword *qw, const qword *shuffle)
 {
+   (void) shuffle;
    return si_cuflt(*qw, 0);
 }
 
 static INLINE qword
-CVT_8_SSCALED(const qword *qw)
+CVT_8_SSCALED(const qword *qw, const qword *shuffle)
 {
-   qword shuffle = (qword) {
-      0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80,
-      0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80,
-   };
-
-   return si_csflt(si_shufb(*qw, *qw, shuffle), 0);
+   return si_csflt(si_shufb(*qw, *qw, shuffle[1]), 0);
 }
 
 
 static INLINE qword
-CVT_16_SSCALED(const qword *qw)
+CVT_16_SSCALED(const qword *qw, const qword *shuffle)
 {
-   qword shuffle = (qword) {
-      0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80,
-      0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80,
-   };
-
-   return si_csflt(si_shufb(*qw, *qw, shuffle), 0);
+   return si_csflt(si_shufb(*qw, *qw, shuffle[2]), 0);
 }
 
 
 static INLINE qword
-CVT_32_SSCALED(const qword *qw)
+CVT_32_SSCALED(const qword *qw, const qword *shuffle)
 {
+   (void) shuffle;
    return si_csflt(*qw, 0);
 }
 
 
 static INLINE qword
-CVT_8_UNORM(const qword *qw)
+CVT_8_UNORM(const qword *qw, const qword *shuffle)
 {
    const qword scale = (qword) spu_splats(1.0f / 255.0f);
-   return si_fm(CVT_8_USCALED(qw), scale);
+   return si_fm(CVT_8_USCALED(qw, shuffle), scale);
 }
 
 
 static INLINE qword
-CVT_16_UNORM(const qword *qw)
+CVT_16_UNORM(const qword *qw, const qword *shuffle)
 {
    const qword scale = (qword) spu_splats(1.0f / 65535.0f);
-   return si_fm(CVT_16_USCALED(qw), scale);
+   return si_fm(CVT_16_USCALED(qw, shuffle), scale);
 }
 
 
 static INLINE qword
-CVT_32_UNORM(const qword *qw)
+CVT_32_UNORM(const qword *qw, const qword *shuffle)
 {
    const qword scale = (qword) spu_splats(1.0f / 4294967295.0f);
-   return si_fm(CVT_32_USCALED(qw), scale);
+   return si_fm(CVT_32_USCALED(qw, shuffle), scale);
 }
 
 
 static INLINE qword
-CVT_8_SNORM(const qword *qw)
+CVT_8_SNORM(const qword *qw, const qword *shuffle)
 {
    const qword scale = (qword) spu_splats(1.0f / 127.0f);
-   return si_fm(CVT_8_SSCALED(qw), scale);
+   return si_fm(CVT_8_SSCALED(qw, shuffle), scale);
 }
 
 
 static INLINE qword
-CVT_16_SNORM(const qword *qw)
+CVT_16_SNORM(const qword *qw, const qword *shuffle)
 {
    const qword scale = (qword) spu_splats(1.0f / 32767.0f);
-   return si_fm(CVT_16_SSCALED(qw), scale);
+   return si_fm(CVT_16_SSCALED(qw, shuffle), scale);
 }
 
 
 static INLINE qword
-CVT_32_SNORM(const qword *qw)
+CVT_32_SNORM(const qword *qw, const qword *shuffle)
 {
    const qword scale = (qword) spu_splats(1.0f / 2147483647.0f);
-   return si_fm(CVT_32_SSCALED(qw), scale);
+   return si_fm(CVT_32_SSCALED(qw, shuffle), scale);
 }
 
 #define SZ_4 si_il(0U)
-#define SZ_3 si_rotqmbyi(si_il(~0), -12)
-#define SZ_2 si_rotqmbyi(si_il(~0), -8)
-#define SZ_1 si_rotqmbyi(si_il(~0), -4)
+#define SZ_3 si_fsmbi(0x000f)
+#define SZ_2 si_fsmbi(0x00ff)
+#define SZ_1 si_fsmbi(0x0fff)
 
 /**
  * Fetch a float[4] vertex attribute from memory, doing format/type
@@ -244,17 +272,19 @@ CVT_32_SNORM(const qword *qw)
  */
 #define FETCH_ATTRIB( NAME, SZ, CVT, N )			\
 static void							\
-fetch_##NAME(qword *out, const qword *in)			\
+fetch_##NAME(qword *out, const qword *in, qword defaults, \
+                const qword *shuffle)	\
 {								\
    qword tmp[4];						\
 								\
-   tmp[0] = si_selb(CVT(in + (0 * N)), (qword) defaults, SZ);	\
-   tmp[1] = si_selb(CVT(in + (1 * N)), (qword) defaults, SZ);	\
-   tmp[2] = si_selb(CVT(in + (2 * N)), (qword) defaults, SZ);	\
-   tmp[3] = si_selb(CVT(in + (3 * N)), (qword) defaults, SZ);	\
-   _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) tmp);	\
+   tmp[0] = si_selb(CVT(in + (0 * N), shuffle), defaults, SZ);		\
+   tmp[1] = si_selb(CVT(in + (1 * N), shuffle), defaults, SZ);		\
+   tmp[2] = si_selb(CVT(in + (2 * N), shuffle), defaults, SZ);		\
+   tmp[3] = si_selb(CVT(in + (3 * N), shuffle), defaults, SZ);		\
+   trans4x4(tmp[0], tmp[1], tmp[2], tmp[3], out, shuffle);		\
 }
 
+
 FETCH_ATTRIB( R64G64B64A64_FLOAT,   SZ_4, CVT_64_FLOAT, 2 )
 FETCH_ATTRIB( R64G64B64_FLOAT,      SZ_3, CVT_64_FLOAT, 2 )
 FETCH_ATTRIB( R64G64_FLOAT,         SZ_2, CVT_64_FLOAT, 2 )
@@ -582,6 +612,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
    /* loop over vertex attributes (vertex shader inputs)
     */
    for (attr = 0; attr < nr_attrs; attr++) {
+      const qword default_values = (qword)(vec_float4){ 0.0, 0.0, 0.0, 1.0 };
       const unsigned pitch = draw->vertex_fetch.pitch[attr];
       const uint64_t src = draw->vertex_fetch.src_ptr[attr];
       const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
@@ -602,8 +633,8 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          printf("SPU: fetching = 0x%llx\n", addr);
 #endif
 
-	 fetch_unaligned(& in[idx], addr, bytes_per_entry);
-	 idx += quads_per_entry;
+         fetch_unaligned(& in[idx], addr, bytes_per_entry);
+         idx += quads_per_entry;
       }
 
       /* Be nice and zero out any missing vertices.
@@ -613,7 +644,8 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
       /* Convert all 4 vertices to vectors of float.
        */
-      (*fetch)(&machine->Inputs[attr].xyzw[0].q, in);
+      (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, default_values,
+               fetch_shuffle_data);
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index 8b37a239a47..b5bf31e67db 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -6,7 +6,8 @@
 
 struct spu_vs_context;
 
-typedef void (*spu_fetch_func)(qword *out, const qword *in);
+typedef void (*spu_fetch_func)(qword *out, const qword *in, qword defaults,
+			       const qword *shuffle_data);
 typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
 				     struct spu_exec_machine *machine,
 				     const unsigned *elts,
-- 
cgit v1.2.3