From 6c0fa798578ad247027dff861406a524821ddcdd Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 08:47:45 -0600
Subject: cell: setup fragment program inputs in SOA format

Also remove old code, etc.
---
 src/gallium/drivers/cell/spu/spu_tri.c | 112 ++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 56 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 8b938781920..b7faae6d60c 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -241,6 +241,19 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
 }
 
 
+/**
+ * Like eval_coeff(), but transpose the 4 result vectors into SOA format.
+ * XXX this will all be re-written someday.
+ */
+static INLINE void
+eval_coeff_soa(uint slot, float x, float y, vector float result[4])
+{
+   eval_coeff(slot, x, y, result);
+   _transpose_matrix4x4(result, result); /* in place: dst and src are the same */
+}
+
+
+
 static INLINE vector float
 eval_z(float x, float y)
 {
@@ -267,14 +280,17 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture[0].start) {
-         /* texture mapping */
+         /*
+          * Temporary texture mapping path
+          * This will go away when fragment programs support TEX inst.
+          */
          const uint unit = 0;
+         vector float colors[4];
          vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
 
@@ -311,70 +327,54 @@ emit_quad( int x, int y, mask_t mask )
             colors[3] = spu_mul(colors[3], colors1[3]);
          }
 
-      }
-      else {
-         /* simple shading */
-#if 0
-         eval_coeff(1, (float) x, (float) y, colors);
-
-#else
-         /* XXX new fragment program code */
-
-         if (spu.fragment_program) {
-            vector float inputs[4*4], outputs[2*4];
-
-            /* setup inputs */
-            eval_coeff(1, (float) x, (float) y, inputs);
-
-            /* Execute the current fragment program */
-            spu.fragment_program(inputs, outputs, spu.constants);
-
-            /* Copy outputs */
-            colors[0] = outputs[0*4+0];
-            colors[1] = outputs[0*4+1];
-            colors[2] = outputs[0*4+2];
-            colors[3] = outputs[0*4+3];
-
-            if (0 && spu.init.id==0 && y == 48) {
-               printf("colors[0] = %f %f %f %f\n",
-                      spu_extract(colors[0], 0),
-                      spu_extract(colors[0], 1),
-                      spu_extract(colors[0], 2),
-                      spu_extract(colors[0], 3));
-               printf("colors[1] = %f %f %f %f\n",
-                      spu_extract(colors[1], 0),
-                      spu_extract(colors[1], 1),
-                      spu_extract(colors[1], 2),
-                      spu_extract(colors[1], 3));
-            }
-
+         {
+            /* Convert fragment data from AoS to SoA format.
+             * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+             * This is temporary!
+             */
+            vector float soa_frag[4];
+            _transpose_matrix4x4(soa_frag, colors);
+
+            vector float fragZ = eval_z((float) x, (float) y);
+
+            /* Do all per-fragment/quad operations here, including:
+             * alpha test, z test, stencil test, blend and framebuffer writing.
+             */
+            spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+                             fragZ,
+                             soa_frag[0], soa_frag[1],
+                             soa_frag[2], soa_frag[3],
+                             mask);
          }
-#endif
-      }
-
 
-      {
-         /* Convert fragment data from AoS to SoA format.
-          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-          * This is temporary!
+      }
+      else {
+         /*
+          * Run fragment shader, execute per-fragment ops, update fb/tile.
           */
-         vector float soa_frag[4];
-         _transpose_matrix4x4(soa_frag, colors);
+         vector float inputs[4*4], outputs[2*4];
+         vector float fragZ = eval_z((float) x, (float) y);
 
-         float4 fragZ;
+         /* setup inputs */
+         eval_coeff_soa(1, (float) x, (float) y, inputs);
 
-         fragZ.v = eval_z((float) x, (float) y);
+         ASSERT(spu.fragment_program);
+         ASSERT(spu.fragment_ops);
 
-         /* Do all per-fragment/quad operations here, including:
-          *  alpha test, z test, stencil test, blend and framebuffer writing.
+         /* Execute the current fragment program */
+         spu.fragment_program(inputs, outputs, spu.constants);
+
+         /* Execute per-fragment/quad operations, including:
+          * alpha test, z test, stencil test, blend and framebuffer writing.
           */
          spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
-                          fragZ.v,
-                          soa_frag[0], soa_frag[1],
-                          soa_frag[2], soa_frag[3],
+                          fragZ,
+                          outputs[0*4+0],
+                          outputs[0*4+1],
+                          outputs[0*4+2],
+                          outputs[0*4+3],
                           mask);
       }
-
    }
 }
 
-- 
cgit v1.2.3


From e8b199c6e3386f8858adf43e5b15bf8ca0b8ce84 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 08:48:08 -0600
Subject: cell: implement swizzling for src regs

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 6ffe94eb14a..d7a8846ab3a 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -137,11 +137,12 @@ get_src_reg(struct codegen *gen,
             const struct tgsi_full_src_register *src)
 {
    int reg;
+   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
 
-   /* XXX need to examine src swizzle info here.
-    * That will involve changing the channel var...
-    */
+   assert(swizzle >= 0);
+   assert(swizzle <= 3);
 
+   channel = swizzle;
 
    switch (src->SrcRegister.File) {
    case TGSI_FILE_TEMPORARY:
-- 
cgit v1.2.3


From a449465556d47d83c2314a7ac711ca523378102b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 09:43:11 -0600
Subject: cell: fix non-debug build error

---
 src/gallium/drivers/cell/ppu/cell_context.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 71f1a3049d1..0a5c0baa471 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -85,13 +85,11 @@ cell_draw_create(struct cell_context *cell)
 }
 
 
-#ifdef DEBUG
 static const struct debug_named_value cell_debug_flags[] = {
    {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
    {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
    {NULL, 0}
 };
-#endif
 
 
 struct pipe_context *
-- 
cgit v1.2.3


From 38bacb6f32d8a2cddc1116f7fbe2b21ea5a91a95 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 11:43:37 -0600
Subject: cell: implement colormask on fallback path

Also, some var renaming and additional comments
---
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c | 164 ++++++++++++++-------
 1 file changed, 110 insertions(+), 54 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 03dd547845b..f107764fb28 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -60,9 +60,12 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector unsigned int mask)
 {
    vector float frag_aos[4];
-   unsigned int c0, c1, c2, c3;
+   unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
+   unsigned int fragc0, fragc1, fragc2, fragc3;  /* fragment colors */
 
-   /* do alpha test */
+   /*
+    * Do alpha test
+    */
    if (spu.depth_stencil_alpha.alpha.enabled) {
       vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
       vector unsigned int amask;
@@ -102,7 +105,10 @@ spu_fallback_fragment_ops(uint x, uint y,
       mask = spu_and(mask, amask);
    }
 
-   /* Z and/or stencil testing... */
+
+   /*
+    * Z and/or stencil testing...
+    */
    if (spu.depth_stencil_alpha.depth.enabled ||
        spu.depth_stencil_alpha.stencil[0].enabled) {
 
@@ -178,6 +184,32 @@ spu_fallback_fragment_ops(uint x, uint y,
       }
    }
 
+
+   /*
+    * If we'll need the current framebuffer/tile colors for blending
+    * or logicop or colormask, fetch them now.
+    */
+   if (spu.blend.blend_enable ||
+       spu.blend.logicop_enable ||
+       spu.blend.colormask != 0xf) {
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+      fbc0 = colorTile->ui[y][x*2+0];
+      fbc1 = colorTile->ui[y][x*2+1];
+      fbc2 = colorTile->ui[y][x*2+2];
+      fbc3 = colorTile->ui[y][x*2+3];
+#else
+      fbc0 = colorTile->ui[y+0][x+0];
+      fbc1 = colorTile->ui[y+0][x+1];
+      fbc2 = colorTile->ui[y+1][x+0];
+      fbc3 = colorTile->ui[y+1][x+1];
+#endif
+   }
+
+
+   /*
+    * Do blending
+    */
    if (spu.blend.blend_enable) {
       /* blending terms, misc regs */
       vector float term1r, term1g, term1b, term1a;
@@ -186,39 +218,26 @@ spu_fallback_fragment_ops(uint x, uint y,
 
       vector float fbRGBA[4];  /* current framebuffer colors */
 
-      /* get colors from framebuffer/tile */
+      /* convert framebuffer colors from packed int to vector float */
       {
-         vector float fc[4];
-         uint c0, c1, c2, c3;
-
-#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
-         c0 = colorTile->ui[y][x*2+0];
-         c1 = colorTile->ui[y][x*2+1];
-         c2 = colorTile->ui[y][x*2+2];
-         c3 = colorTile->ui[y][x*2+3];
-#else
-         c0 = colorTile->ui[y+0][x+0];
-         c1 = colorTile->ui[y+0][x+1];
-         c2 = colorTile->ui[y+1][x+0];
-         c3 = colorTile->ui[y+1][x+1];
-#endif
+         vector float temp[4]; /* float colors in AOS form */
          switch (spu.fb.color_format) {
          case PIPE_FORMAT_B8G8R8A8_UNORM:
-            fc[0] = spu_unpack_B8G8R8A8(c0);
-            fc[1] = spu_unpack_B8G8R8A8(c1);
-            fc[2] = spu_unpack_B8G8R8A8(c2);
-            fc[3] = spu_unpack_B8G8R8A8(c3);
+            temp[0] = spu_unpack_B8G8R8A8(fbc0);
+            temp[1] = spu_unpack_B8G8R8A8(fbc1);
+            temp[2] = spu_unpack_B8G8R8A8(fbc2);
+            temp[3] = spu_unpack_B8G8R8A8(fbc3);
             break;
          case PIPE_FORMAT_A8R8G8B8_UNORM:
-            fc[0] = spu_unpack_A8R8G8B8(c0);
-            fc[1] = spu_unpack_A8R8G8B8(c1);
-            fc[2] = spu_unpack_A8R8G8B8(c2);
-            fc[3] = spu_unpack_A8R8G8B8(c3);
+            temp[0] = spu_unpack_A8R8G8B8(fbc0);
+            temp[1] = spu_unpack_A8R8G8B8(fbc1);
+            temp[2] = spu_unpack_A8R8G8B8(fbc2);
+            temp[3] = spu_unpack_A8R8G8B8(fbc3);
             break;
          default:
             ASSERT(0);
          }
-         _transpose_matrix4x4(fbRGBA, fc);
+         _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */
       }
 
       /*
@@ -384,21 +403,20 @@ spu_fallback_fragment_ops(uint x, uint y,
 #endif
 
    /*
-    * Pack float colors into 32-bit RGBA words.
+    * Pack fragment float colors into 32-bit RGBA words.
     */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      c0 = spu_pack_A8R8G8B8(frag_aos[0]);
-      c1 = spu_pack_A8R8G8B8(frag_aos[1]);
-      c2 = spu_pack_A8R8G8B8(frag_aos[2]);
-      c3 = spu_pack_A8R8G8B8(frag_aos[3]);
+      fragc0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      fragc1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      fragc2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      fragc3 = spu_pack_A8R8G8B8(frag_aos[3]);
       break;
-
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      c0 = spu_pack_B8G8R8A8(frag_aos[0]);
-      c1 = spu_pack_B8G8R8A8(frag_aos[1]);
-      c2 = spu_pack_B8G8R8A8(frag_aos[2]);
-      c3 = spu_pack_B8G8R8A8(frag_aos[3]);
+      fragc0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      fragc1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      fragc2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      fragc3 = spu_pack_B8G8R8A8(frag_aos[3]);
       break;
    default:
       fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
@@ -407,20 +425,57 @@ spu_fallback_fragment_ops(uint x, uint y,
 
 
    /*
-    * Color masking
+    * Do color masking
     */
    if (spu.blend.colormask != 0xf) {
-      /* XXX to do */
-      /* apply color mask to 32-bit packed colors */
+      uint cmask = 0x0; /* each byte corresponds to a color channel */
+
+      /* Form bitmask depending on color buffer format and colormask bits */
+      switch (spu.fb.color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         if (spu.blend.colormask & (1<<0))
+            cmask |= 0x00ff0000; /* red */
+         if (spu.blend.colormask & (1<<1))
+            cmask |= 0x0000ff00; /* green */
+         if (spu.blend.colormask & (1<<2))
+            cmask |= 0x000000ff; /* blue */
+         if (spu.blend.colormask & (1<<3))
+            cmask |= 0xff000000; /* alpha */
+         break;
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         if (spu.blend.colormask & (1<<0))
+            cmask |= 0x0000ff00; /* red */
+         if (spu.blend.colormask & (1<<1))
+            cmask |= 0x00ff0000; /* green */
+         if (spu.blend.colormask & (1<<2))
+            cmask |= 0xff000000; /* blue */
+         if (spu.blend.colormask & (1<<3))
+            cmask |= 0x000000ff; /* alpha */
+         break;
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Apply color mask to the 32-bit packed colors.
+       * if (cmask[i])
+       *    frag color[i] = frag color[i];
+       * else
+       *    frag color[i] = framebuffer color[i];
+       */
+      fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask);
+      fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask);
+      fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask);
+      fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask);
    }
 
 
    /*
-    * Logic Ops
+    * Do logic ops
     */
    if (spu.blend.logicop_enable) {
       /* XXX to do */
-      /* apply logicop to 32-bit packed colors */
+      /* apply logicop to 32-bit packed colors (fragcx and fbcx) */
    }
 
 
@@ -431,45 +486,46 @@ spu_fallback_fragment_ops(uint x, uint y,
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
    }
    else {
+      /* write no fragments */
       return;
    }
 
 
    /*
-    * Write new quad colors to the framebuffer/tile.
+    * Write new fragment/quad colors to the framebuffer/tile.
     * Only write pixels where the corresponding mask word is set.
     */
 #if LINEAR_QUAD_LAYOUT
    /*
     * Quad layout:
     *  +--+--+--+--+
-    *  |p0|p1|p2|p3|
+    *  |p0|p1|p2|p3|...
     *  +--+--+--+--+
     */
    if (spu_extract(mask, 0))
-      colorTile->ui[y][x*2] = c0;
+      colorTile->ui[y][x*2] = fragc0;
    if (spu_extract(mask, 1))
-      colorTile->ui[y][x*2+1] = c1;
+      colorTile->ui[y][x*2+1] = fragc1;
    if (spu_extract(mask, 2))
-      colorTile->ui[y][x*2+2] = c2;
+      colorTile->ui[y][x*2+2] = fragc2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y][x*2+3] = c3;
+      colorTile->ui[y][x*2+3] = fragc3;
 #else
    /*
     * Quad layout:
     *  +--+--+
-    *  |p0|p1|
+    *  |p0|p1|...
     *  +--+--+
-    *  |p2|p3|
+    *  |p2|p3|...
     *  +--+--+
     */
    if (spu_extract(mask, 0))
-      colorTile->ui[y+0][x+0] = c0;
+      colorTile->ui[y+0][x+0] = fragc0;
    if (spu_extract(mask, 1))
-      colorTile->ui[y+0][x+1] = c1;
+      colorTile->ui[y+0][x+1] = fragc1;
    if (spu_extract(mask, 2))
-      colorTile->ui[y+1][x+0] = c2;
+      colorTile->ui[y+1][x+0] = fragc2;
    if (spu_extract(mask, 3))
-      colorTile->ui[y+1][x+1] = c3;
+      colorTile->ui[y+1][x+1] = fragc3;
 #endif
 }
-- 
cgit v1.2.3


From d598a5d2301faea810a2449db7a32ff48e80b979 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 15:07:04 -0600
Subject: cell: disable invalid spe_release_func() calls, fixes crash on exit

---
 src/gallium/drivers/cell/ppu/cell_pipe_state.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 475c6ef0ce6..ea820aca744 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -72,7 +72,9 @@ cell_delete_blend_state(struct pipe_context *pipe, void *blend)
 {
    struct cell_blend_state *cb = (struct cell_blend_state *) blend;
 
+#if 0
    spe_release_func(& cb->code);
+#endif
    FREE(cb);
 }
 
@@ -128,7 +130,9 @@ cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
    struct cell_depth_stencil_alpha_state *cdsa =
        (struct cell_depth_stencil_alpha_state *) depth;
 
+#if 0
    spe_release_func(& cdsa->code);
+#endif
    FREE(cdsa);
 }
 
-- 
cgit v1.2.3


From 5ab221549d5cdbf72817ff612464d83256765389 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 16:11:52 -0600
Subject: cell: evaluate multiple fragment inputs

---
 src/gallium/drivers/cell/spu/spu_tri.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index b7faae6d60c..0a8fb56a62c 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -356,8 +356,14 @@ emit_quad( int x, int y, mask_t mask )
          vector float fragZ = eval_z((float) x, (float) y);
 
          /* setup inputs */
+#if 0
          eval_coeff_soa(1, (float) x, (float) y, inputs);
-
+#else
+         uint i;
+         for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+            eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
+         }
+#endif
          ASSERT(spu.fragment_program);
          ASSERT(spu.fragment_ops);
 
-- 
cgit v1.2.3


From af2ca5dc3823269636bfa8377ed971a761096b2e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 16:31:53 -0600
Subject: cell: initial support for IF/ELSE/ENDIF in fragment shader codegen

Only one level of if/else/endif nesting is currently working.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 193 ++++++++++++++++++++++++++---
 1 file changed, 175 insertions(+), 18 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index d7a8846ab3a..8d8dfea0392 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -71,6 +71,11 @@ struct codegen
    int num_itemps;
    int itemps[3];
 
+   /** Current IF/ELSE/ENDIF nesting level */
+   int if_nesting;
+   /** Index of execution mask register */
+   int exec_mask_reg;
+
    struct spe_function *f;
    boolean error;
 };
@@ -112,18 +117,43 @@ get_const_one_reg(struct codegen *gen)
 {
    if (gen->one_reg <= 0) {
       gen->one_reg = spe_allocate_available_register(gen->f);
-   }
 
-   /* one = {1.0, 1.0, 1.0, 1.0} */
-   spe_load_float(gen->f, gen->one_reg, 1.0f);
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(gen->f, gen->one_reg, 1.0f);
 #if DISASSEM
-   printf("il\tr%d, 1.0f\n", gen->one_reg);
+      printf("\til\tr%d, 1.0f\n", gen->one_reg);
 #endif
+   }
 
    return gen->one_reg;
 }
 
 
+/**
+ * Return index of the SPE register holding the pixel execution mask.
+ * The register is allocated and initialized (all bits set) on the first call.
+ *
+ * The pixel execution mask controls which pixels in a quad are
+ * modified, according to surrounding conditionals, loops, etc.
+ */
+static int
+get_exec_mask_reg(struct codegen *gen)
+{
+   if (gen->exec_mask_reg <= 0) {
+      gen->exec_mask_reg = spe_allocate_available_register(gen->f);
+
+      /* exec_mask = {~0, ~0, ~0, ~0} */
+      spe_load_int(gen->f, gen->exec_mask_reg, ~0);
+#if DISASSEM
+      printf("INIT EXEC MASK:\n");
+      printf("\tload\tr%d, 0x%x\n", gen->exec_mask_reg, ~0);
+#endif
+   }
+
+   return gen->exec_mask_reg;
+}
+
+
 /**
  * Return the index of the SPU temporary containing the named TGSI
  * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we
@@ -136,7 +166,7 @@ get_src_reg(struct codegen *gen,
             int channel,
             const struct tgsi_full_src_register *src)
 {
-   int reg;
+   int reg = -1;
    int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
 
    assert(swizzle >= 0);
@@ -156,7 +186,7 @@ get_src_reg(struct codegen *gen,
          /* Load:  reg = memory[(machine_reg) + offset] */
          spe_lqd(gen->f, reg, gen->inputs_reg, offset);
 #if DISASSEM
-         printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
+         printf("\tlqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
 #endif
       }
       break;
@@ -184,11 +214,14 @@ get_dst_reg(struct codegen *gen,
             int channel,
             const struct tgsi_full_dst_register *dest)
 {
-   int reg;
+   int reg = -1;
 
    switch (dest->DstRegister.File) {
    case TGSI_FILE_TEMPORARY:
-      reg = gen->temp_regs[dest->DstRegister.Index][channel];
+      if (gen->if_nesting > 0)
+         reg = get_itemp(gen);
+      else
+         reg = gen->temp_regs[dest->DstRegister.Index][channel];
       break;
    case TGSI_FILE_OUTPUT:
       reg = get_itemp(gen);
@@ -214,17 +247,56 @@ store_dest_reg(struct codegen *gen,
 {
    switch (dest->DstRegister.File) {
    case TGSI_FILE_TEMPORARY:
-      /* no-op */
+      if (gen->if_nesting > 0) {
+         int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
+         int exec_reg = get_exec_mask_reg(gen);
+         /* Mix d with new value according to exec mask:
+          * d[i] = mask_reg[i] ? value_reg : d_reg
+          */
+         spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg);
+#if DISASSEM
+         printf("\tselb\tr%d, r%d, r%d, r%d  # EXEC MASK'ed\n",
+                d_reg, d_reg, value_reg, exec_reg);
+#endif
+      }
+      else {
+         /* we're not inside a condition or loop: do nothing special */
+      }
       break;
    case TGSI_FILE_OUTPUT:
       {
          /* offset is measured in quadwords, not bytes */
          int offset = dest->DstRegister.Index * 4 + channel;
-         /* Store: memory[(machine_reg) + offset] = reg */
-         spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+         if (gen->if_nesting > 0) {
+            int exec_reg = get_exec_mask_reg(gen);
+            int curval_reg = get_itemp(gen);
+            /* First read the current value from memory:
+             * Load:  curval = memory[(machine_reg) + offset]
+             */
+            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset);
+            /* Mix curval with the new value according to exec mask:
+             * curval[i] = exec_mask[i] ? value_reg[i] : curval[i]
+             */
+            spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
+            /* Store: memory[(machine_reg) + offset] = curval */
+            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset);
 #if DISASSEM
-         printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset);
+            printf("\tlqd\tr%d, r%d + %d\n",
+                   curval_reg, gen->outputs_reg, offset);
+            printf("\tselb\tr%d, r%d, r%d, r%d  # EXEC MASK'ed\n",
+                   curval_reg, curval_reg, value_reg, exec_reg);
+            printf("\tstqd\tr%d, r%d + %d\n",
+                   curval_reg, gen->outputs_reg, offset);
 #endif
+         }
+         else {
+            /* Store: memory[(machine_reg) + offset] = reg */
+            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+#if DISASSEM
+            printf("\tstqd\tr%d, r%d + %d\n",
+                   value_reg, gen->outputs_reg, offset);
+#endif
+         }
       }
       break;
    default:
@@ -237,6 +309,9 @@ static boolean
 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+#if DISASSEM
+   printf("MOV:\n");
+#endif
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -244,7 +319,7 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
          /* XXX we don't always need to actually emit a mov instruction here */
          spe_move(gen->f, dst_reg, src_reg);
 #if DISASSEM
-         printf("mov\tr%d, r%d\n", dst_reg, src_reg);
+         printf("\tmov\tr%d, r%d\n", dst_reg, src_reg);
 #endif
          store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -254,6 +329,7 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 
+
 /**
  * Emit addition instructions.  Recall that a single TGSI_OPCODE_ADD
  * becomes (up to) four SPU "fa" instructions because we're doing SOA
@@ -263,6 +339,9 @@ static boolean
 emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+#if DISASSEM
+   printf("ADD:\n");
+#endif
    /* Loop over Red/Green/Blue/Alpha channels */
    for (ch = 0; ch < 4; ch++) {
       /* If the dest R, G, B or A writemask is enabled... */
@@ -275,7 +354,7 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
          /* Emit actual SPE instruction: d = s1 + s2 */
          spe_fa(gen->f, d_reg, s1_reg, s2_reg);
 #if DISASSEM
-         printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+         printf("\tfa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
 #endif
 
          /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
@@ -295,6 +374,9 @@ static boolean
 emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+#if DISASSEM
+   printf("MUL:\n");
+#endif
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -303,7 +385,7 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
          /* d = s1 * s2 */
          spe_fm(gen->f, d_reg, s1_reg, s2_reg);
 #if DISASSEM
-         printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+         printf("\tfm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
 #endif
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -324,6 +406,9 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
 
+#if DISASSEM
+   printf("SGT:\n");
+#endif
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -333,14 +418,14 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
          /* d = (s1 > s2) */
          spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
 #if DISASSEM
-         printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
+         printf("\tfcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
 #endif
 
          /* convert d from 0x0/0xffffffff to 0.0/1.0 */
          /* d = d & one_reg */
          spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
 #if DISASSEM
-         printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen));
+         printf("\tand\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen));
 #endif
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
@@ -352,6 +437,71 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 
+static boolean
+emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int channel = 0;  /* only channel 0 of the source is tested */
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   /* AND the IF condition (source register 0) into the execution mask */
+   int tmp_reg = spe_allocate_available_register(gen->f);
+   int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
+
+   /* tmp = (s1_reg == 0) */
+   spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
+   /* tmp = !tmp, i.e. tmp = (s1_reg != 0) */
+   spe_complement(gen->f, tmp_reg);
+   /* exec_mask = exec_mask & tmp */
+   spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
+#if DISASSEM
+   printf("IF:\n");
+   printf("\tseqi\tr%d, r%d, 0;\n", tmp_reg, s1_reg);
+   printf("\tcomp\tr%d\n", tmp_reg);
+   printf("\tand\tr%d, r%d, r%d\n", exec_reg, exec_reg, tmp_reg);
+#endif
+
+   gen->if_nesting++;  /* while > 0, store_dest_reg() applies the exec mask */
+
+   spe_release_register(gen->f, tmp_reg);
+
+   return true;
+}
+
+
+static boolean
+emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   /* exec_mask = ~exec_mask (whole mask is flipped; OK for one IF level only) */
+   spe_complement(gen->f, exec_reg);
+#if DISASSEM
+   printf("ELSE:\n");
+   printf("\tcomp\tr%d;\n", exec_reg);
+#endif
+   return true;
+}
+
+
+static boolean
+emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const int exec_reg = get_exec_mask_reg(gen);
+
+   /* XXX todo: pop execution mask; for now all pixels are simply re-enabled */
+
+   spe_load_int(gen->f, exec_reg, ~0x0);
+#if DISASSEM
+   printf("ENDIF:\n");
+   printf("\tli\tr%d, ~0x0\n", exec_reg);
+#endif
+
+   gen->if_nesting--;  /* matches the increment in emit_IF() */
+   return true;
+}
+
+
+
 /**
  * Emit END instruction.
  * We just return from the shader function at this point.
@@ -365,7 +515,7 @@ emit_END(struct codegen *gen)
    /* return from function call */
    spe_bi(gen->f, SPE_REG_RA, 0, 0);
 #if DISASSEM
-   printf("bi\trRA\n");
+   printf("\tbi\trRA\n");
 #endif
    return true;
 }
@@ -390,6 +540,13 @@ emit_instruction(struct codegen *gen,
    case TGSI_OPCODE_END:
       return emit_END(gen);
 
+   case TGSI_OPCODE_IF:
+      return emit_IF(gen, inst);
+   case TGSI_OPCODE_ELSE:
+      return emit_ELSE(gen, inst);
+   case TGSI_OPCODE_ENDIF:
+      return emit_ENDIF(gen, inst);
+
    /* XXX lots more cases to do... */
 
    default:
-- 
cgit v1.2.3


From 5f3ec823385b34b8db6013fdf701c5522dc86524 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 17:10:20 -0600
Subject: cell: implement TGSI immediates in SPE code generator

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 75 +++++++++++++++++++++++++-----
 1 file changed, 63 insertions(+), 12 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 8d8dfea0392..33579fc7033 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -52,7 +52,11 @@
 
 
 /** Set to 1 to enable debug/disassembly printfs */
-#define DISASSEM 01
+#define DISASSEM 0
+
+
+#define MAX_TEMPS 16
+#define MAX_IMMED  8
 
 
 /**
@@ -63,7 +67,10 @@ struct codegen
    int inputs_reg;      /**< 1st function parameter */
    int outputs_reg;     /**< 2nd function parameter */
    int constants_reg;   /**< 3rd function parameter */
-   int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */
+   int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */
+   int imm_regs[MAX_IMMED][4];  /**< maps TGSI immediates to SPE registers */
+
+   int num_imm;  /**< number of immediates */
 
    int one_reg;         /**< register containing {1.0, 1.0, 1.0, 1.0} */
 
@@ -191,7 +198,8 @@ get_src_reg(struct codegen *gen,
       }
       break;
    case TGSI_FILE_IMMEDIATE:
-      /* xxx fall-through for now / fix */
+      reg = gen->imm_regs[src->SrcRegister.Index][channel];
+      break;
    case TGSI_FILE_CONSTANT:
       /* xxx fall-through for now / fix */
    default:
@@ -558,12 +566,53 @@ emit_instruction(struct codegen *gen,
 
 
+/**
+ * Emit code for a TGSI immediate value (vector of four floats): allocate an
+ * SPE register per channel, record it in imm_regs[] and load the value.
+ * Returns false if imm_regs[] is full or no SPE register is available.
+ * XXX the loads should be done by a "prepare" stage, not per quad execution!
+ */
+static boolean
+emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
+{
+   int ch;
+
+   assert(gen->num_imm < MAX_TEMPS);
+
+#if DISASSEM
+   printf("IMMEDIATE %d:\n", gen->num_imm);
+#endif
+
+   for (ch = 0; ch < 4; ch++) {
+      float val = immed->u.ImmediateFloat32[ch].Float;
+      int reg = -1;
+      /* imm_regs[] has only MAX_IMMED rows; the assert above is too lax */
+      if (gen->num_imm < MAX_IMMED)
+         reg = spe_allocate_available_register(gen->f);
+      if (reg < 0)
+         return false; /* table full or out of SPE registers */
+      gen->imm_regs[gen->num_imm][ch] = reg;
+
+      /* emit initializer instruction */
+      spe_load_float(gen->f, reg, val);
+#if DISASSEM
+      printf("\tload\tr%d, %f\n", reg, val);
+#endif
+   }
+
+   gen->num_imm++;
+
+   return true;
+}
+
+
+
 /**
  * Emit "code" for a TGSI declaration.
  * We only care about TGSI TEMPORARY register declarations at this time.
  * For each TGSI TEMPORARY we allocate four SPE registers.
  */
-static void
+static boolean
 emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
 {
    int i, ch;
@@ -578,8 +627,11 @@ emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
       for (i = decl->DeclarationRange.First;
            i <= decl->DeclarationRange.Last;
            i++) {
+         assert(i < MAX_TEMPS);
          for (ch = 0; ch < 4; ch++) {
             gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f);
+            if (gen->temp_regs[i][ch] < 0)
+               return false; /* out of regs */
          }
 
          /* XXX if we run out of SPE registers, we need to spill
@@ -598,6 +650,8 @@ emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
    default:
       ; /* ignore */
    }
+
+   return true;
 }
 
 
@@ -642,25 +696,22 @@ cell_gen_fragment_program(struct cell_context *cell,
 
       switch (parse.FullToken.Token.Type) {
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-#if 0
-         if (!note_immediate(&gen, &parse.FullToken.FullImmediate ))
-            goto fail;
-#endif
+         if (!emit_immediate(&gen,  &parse.FullToken.FullImmediate))
+            gen.error = true;
          break;
 
       case TGSI_TOKEN_TYPE_DECLARATION:
-         emit_declaration(&gen, &parse.FullToken.FullDeclaration);
+         if (!emit_declaration(&gen, &parse.FullToken.FullDeclaration))
+            gen.error = true;
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) {
+         if (!emit_instruction(&gen, &parse.FullToken.FullInstruction))
             gen.error = true;
-         }
          break;
 
       default:
          assert(0);
-
       }
    }
 
-- 
cgit v1.2.3


From 6f3eee921327ce76c05620eec714f2ff4f500826 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 21:09:10 -0600
Subject: cell: implement DDX/DDY codegen (untested)

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 37 ++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 33579fc7033..a7b7dd03d37 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -509,6 +509,38 @@ emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 
+/**
+ * Emit DDX or DDY (selected by 'ddx'): per enabled channel, subtract the
+ * quad's upper-left value from its upper-right (DDX) or lower-left (DDY).
+ */
+static boolean
+emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
+             boolean ddx)
+{
+   int ch;
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int t1_reg = get_itemp(gen);
+         int t2_reg = get_itemp(gen);
+
+         spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */
+         /* word 1 = upper-right pixel, word 2 = lower-left pixel */
+         spe_splat_word(gen->f, t2_reg, s_reg, ddx ? 1 : 2);
+         /* d = neighbor - upper-left */
+         spe_fs(gen->f, d_reg, t2_reg, t1_reg);
+         /* commit the result, as every other emitter does */
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+
+
 
 /**
  * Emit END instruction.
@@ -555,6 +587,11 @@ emit_instruction(struct codegen *gen,
    case TGSI_OPCODE_ENDIF:
       return emit_ENDIF(gen, inst);
 
+   case TGSI_OPCODE_DDX:
+      return emit_DDX_DDY(gen, inst, true);
+   case TGSI_OPCODE_DDY:
+      return emit_DDX_DDY(gen, inst, false);
+
    /* XXX lots more cases to do... */
 
    default:
-- 
cgit v1.2.3


From 8d768c51018841b66dbed87ae6b50358e53ad2c4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 21:54:25 -0600
Subject: cell: remove old disassembly/dump code; use dumper code in SPE
 emitter.

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 104 ++++++++---------------------
 1 file changed, 28 insertions(+), 76 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index a7b7dd03d37..0712d05b40b 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -125,11 +125,13 @@ get_const_one_reg(struct codegen *gen)
    if (gen->one_reg <= 0) {
       gen->one_reg = spe_allocate_available_register(gen->f);
 
+      spe_indent(gen->f, 4);
+      spe_comment(gen->f, -4, "INIT CONSTANT 1.0:");
+
       /* one = {1.0, 1.0, 1.0, 1.0} */
       spe_load_float(gen->f, gen->one_reg, 1.0f);
-#if DISASSEM
-      printf("\til\tr%d, 1.0f\n", gen->one_reg);
-#endif
+
+      spe_indent(gen->f, -4);
    }
 
    return gen->one_reg;
@@ -149,12 +151,13 @@ get_exec_mask_reg(struct codegen *gen)
    if (gen->exec_mask_reg <= 0) {
       gen->exec_mask_reg = spe_allocate_available_register(gen->f);
 
+      spe_indent(gen->f, 4);
+      spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:");
+
       /* exec_mask = {~0, ~0, ~0, ~0} */
       spe_load_int(gen->f, gen->exec_mask_reg, ~0);
-#if DISASSEM
-      printf("INIT EXEC MASK:\n");
-      printf("\tload\tr%d, 0x%x\n", gen->exec_mask_reg, ~0);
-#endif
+
+      spe_indent(gen->f, -4);
    }
 
    return gen->exec_mask_reg;
@@ -192,9 +195,6 @@ get_src_reg(struct codegen *gen,
          reg = get_itemp(gen);
          /* Load:  reg = memory[(machine_reg) + offset] */
          spe_lqd(gen->f, reg, gen->inputs_reg, offset);
-#if DISASSEM
-         printf("\tlqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset);
-#endif
       }
       break;
    case TGSI_FILE_IMMEDIATE:
@@ -262,10 +262,6 @@ store_dest_reg(struct codegen *gen,
           * d[i] = mask_reg[i] ? value_reg : d_reg
           */
          spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg);
-#if DISASSEM
-         printf("\tselb\tr%d, r%d, r%d, r%d  # EXEC MASK'ed\n",
-                d_reg, d_reg, value_reg, exec_reg);
-#endif
       }
       else {
          /* we're not inside a condition or loop: do nothing special */
@@ -288,22 +284,10 @@ store_dest_reg(struct codegen *gen,
             spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
             /* Store: memory[(machine_reg) + offset] = curval */
             spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset);
-#if DISASSEM
-            printf("\tlqd\tr%d, r%d + %d\n",
-                   curval_reg, gen->outputs_reg, offset);
-            printf("\tselb\tr%d, r%d, r%d, r%d  # EXEC MASK'ed\n",
-                   curval_reg, curval_reg, value_reg, exec_reg);
-            printf("\tstqd\tr%d, r%d + %d\n",
-                   curval_reg, gen->outputs_reg, offset);
-#endif
          }
          else {
             /* Store: memory[(machine_reg) + offset] = reg */
             spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
-#if DISASSEM
-            printf("\tstqd\tr%d, r%d + %d\n",
-                   value_reg, gen->outputs_reg, offset);
-#endif
          }
       }
       break;
@@ -317,18 +301,13 @@ static boolean
 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
-#if DISASSEM
-   printf("MOV:\n");
-#endif
+   spe_comment(gen->f, -4, "MOV:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
          int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
          /* XXX we don't always need to actually emit a mov instruction here */
          spe_move(gen->f, dst_reg, src_reg);
-#if DISASSEM
-         printf("\tmov\tr%d, r%d\n", dst_reg, src_reg);
-#endif
          store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -347,9 +326,7 @@ static boolean
 emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
-#if DISASSEM
-   printf("ADD:\n");
-#endif
+   spe_comment(gen->f, -4, "ADD:");
    /* Loop over Red/Green/Blue/Alpha channels */
    for (ch = 0; ch < 4; ch++) {
       /* If the dest R, G, B or A writemask is enabled... */
@@ -361,9 +338,6 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
          /* Emit actual SPE instruction: d = s1 + s2 */
          spe_fa(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
-         printf("\tfa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
 
          /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
@@ -382,9 +356,7 @@ static boolean
 emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
-#if DISASSEM
-   printf("MUL:\n");
-#endif
+   spe_comment(gen->f, -4, "MUL:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -392,9 +364,6 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
          /* d = s1 * s2 */
          spe_fm(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
-         printf("\tfm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -414,9 +383,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
 
-#if DISASSEM
-   printf("SGT:\n");
-#endif
+   spe_comment(gen->f, -4, "SGT:");
+
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -425,16 +393,10 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
          /* d = (s1 > s2) */
          spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-#if DISASSEM
-         printf("\tfcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg);
-#endif
 
          /* convert d from 0x0/0xffffffff to 0.0/1.0 */
          /* d = d & one_reg */
          spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
-#if DISASSEM
-         printf("\tand\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen));
-#endif
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -451,6 +413,8 @@ emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
    const int channel = 0;
    const int exec_reg = get_exec_mask_reg(gen);
 
+   spe_comment(gen->f, -4, "IF:");
+
    /* update execution mask with the predicate register */
    int tmp_reg = spe_allocate_available_register(gen->f);
    int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
@@ -461,12 +425,6 @@ emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
    spe_complement(gen->f, tmp_reg);
    /* exec_mask = exec_mask & tmp */
    spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
-#if DISASSEM
-   printf("IF:\n");
-   printf("\tseqi\tr%d, r%d, 0;\n", tmp_reg, s1_reg);
-   printf("\tcomp\tr%d\n", tmp_reg);
-   printf("\tand\tr%d, r%d, r%d\n", exec_reg, exec_reg, tmp_reg);
-#endif
 
    gen->if_nesting++;
 
@@ -481,12 +439,11 @@ emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    const int exec_reg = get_exec_mask_reg(gen);
 
+   spe_comment(gen->f, -4, "ELSE:");
+
    /* exec_mask = !exec_mask */
    spe_complement(gen->f, exec_reg);
-#if DISASSEM
-   printf("ELSE:\n");
-   printf("\tcomp\tr%d;\n", exec_reg);
-#endif
+
    return true;
 }
 
@@ -496,13 +453,11 @@ emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    const int exec_reg = get_exec_mask_reg(gen);
 
+   spe_comment(gen->f, -4, "ENDIF:");
+
    /* XXX todo: pop execution mask */
 
    spe_load_int(gen->f, exec_reg, ~0x0);
-#if DISASSEM
-   printf("ENDIF:\n");
-   printf("\tli\tr%d, ~0x0\n", exec_reg);
-#endif
 
    gen->if_nesting--;
    return true;
@@ -515,6 +470,8 @@ emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
 {
    int ch;
 
+   spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:");
+
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -552,11 +509,9 @@ emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
 static boolean
 emit_END(struct codegen *gen)
 {
+   spe_comment(gen->f, -4, "END:");
    /* return from function call */
    spe_bi(gen->f, SPE_REG_RA, 0, 0);
-#if DISASSEM
-   printf("\tbi\trRA\n");
-#endif
    return true;
 }
 
@@ -616,9 +571,7 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
 
    assert(gen->num_imm < MAX_TEMPS);
 
-#if DISASSEM
-   printf("IMMEDIATE %d:\n", gen->num_imm);
-#endif
+   spe_comment(gen->f, -4, "IMMEDIATE:");
 
    for (ch = 0; ch < 4; ch++) {
       float val = immed->u.ImmediateFloat32[ch].Float;
@@ -632,9 +585,6 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
 
       /* emit initializer instruction */
       spe_load_float(gen->f, reg, val);
-#if DISASSEM
-      printf("\tload\tr%d, %f\n", reg, val);
-#endif
    }
 
    gen->num_imm++;
@@ -722,6 +672,8 @@ cell_gen_fragment_program(struct cell_context *cell,
    spe_allocate_register(f, gen.constants_reg);
 
 #if DISASSEM
+   spe_print_code(f, true);
+   spe_indent(f, 8);
    printf("Begin %s\n", __FUNCTION__);
    tgsi_dump(tokens, 0);
 #endif
-- 
cgit v1.2.3


From 777aca8fc99986dacf043cc3c25911df4252cb42 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Sat, 13 Sep 2008 15:32:46 -0600
Subject: cell: implement negation, absolute value and set-sign for src regs in
 code gen

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 44 ++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 0712d05b40b..8d8c095a7e2 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -178,6 +178,8 @@ get_src_reg(struct codegen *gen,
 {
    int reg = -1;
    int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+   boolean reg_is_itemp = FALSE;
+   uint sign_op;
 
    assert(swizzle >= 0);
    assert(swizzle <= 3);
@@ -193,6 +195,7 @@ get_src_reg(struct codegen *gen,
          /* offset is measured in quadwords, not bytes */
          int offset = src->SrcRegister.Index * 4 + channel;
          reg = get_itemp(gen);
+         reg_is_itemp = TRUE;
          /* Load:  reg = memory[(machine_reg) + offset] */
          spe_lqd(gen->f, reg, gen->inputs_reg, offset);
       }
@@ -206,6 +209,43 @@ get_src_reg(struct codegen *gen,
       assert(0);
    }
 
+   /*
+    * Handle absolute value, negate or set-negative of src register.
+    */
+   sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+   if (sign_op != TGSI_UTIL_SIGN_KEEP) {
+      /*
+       * All sign ops are done by manipulating bit 31, the IEEE float sign bit.
+       */
+      const int bit31mask_reg = get_itemp(gen);
+      int result_reg;
+
+      if (reg_is_itemp) {
+         /* re-use 'reg' for the result */
+         result_reg = reg;
+      }
+      else {
+         /* alloc a new reg for the result */
+         result_reg = get_itemp(gen);
+      }
+
+      /* mask with only bit 31 set (spelled without the UB of 1 << 31) */
+      spe_load_int(gen->f, bit31mask_reg, ~0x7fffffff);
+
+      if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
+         spe_andc(gen->f, result_reg, reg, bit31mask_reg); /* |x| */
+      }
+      else if (sign_op == TGSI_UTIL_SIGN_SET) {
+         spe_or(gen->f, result_reg, reg, bit31mask_reg);   /* -|x| */
+      }
+      else {
+         assert(sign_op == TGSI_UTIL_SIGN_TOGGLE);
+         spe_xor(gen->f, result_reg, reg, bit31mask_reg);  /* -x */
+      }
+
+      reg = result_reg;
+   }
+
    return reg;
 }
 
@@ -416,7 +456,7 @@ emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
    spe_comment(gen->f, -4, "IF:");
 
    /* update execution mask with the predicate register */
-   int tmp_reg = spe_allocate_available_register(gen->f);
+   int tmp_reg = get_itemp(gen);
    int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
 
    /* tmp = (s1_reg == 0) */
@@ -428,7 +468,7 @@ emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    gen->if_nesting++;
 
-   spe_release_register(gen->f, tmp_reg);
+   free_itemps(gen);
 
    return true;
 }
-- 
cgit v1.2.3


From 5a4ab148a76f6c6d33b9784f99531a6bf2d9101b Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Mon, 15 Sep 2008 11:56:51 -0600
Subject: Added support for SUB and MAD instructions

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 61 ++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 3 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 8d8c095a7e2..9eae57bb76b 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -52,7 +52,7 @@
 
 
 /** Set to 1 to enable debug/disassembly printfs */
-#define DISASSEM 0
+#define DISASSEM 1
 
 
 #define MAX_TEMPS 16
@@ -76,7 +76,7 @@ struct codegen
 
    /** Per-instruction temps / intermediate temps */
    int num_itemps;
-   int itemps[3];
+   int itemps[4];
 
    /** Current IF/ELSE/ENDIF nesting level */
    int if_nesting;
@@ -388,6 +388,58 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit subtract: d = s1 - s2 per enabled channel.  Mirrors emit_ADD.
+ */
+static boolean
+emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "SUB:");
+   /* Loop over Red/Green/Blue/Alpha channels */
+   for (ch = 0; ch < 4; ch++) {
+      /* Skip channels that the destination writemask disables */
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* look up the SPE registers for both sources and the destination */
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* Emit actual SPE instruction: d = s1 - s2 */
+         spe_fs(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         /* Free any intermediate temps we allocated */
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit multiply-add: d = s1 * s2 + s3 per enabled channel.  Cf. emit_ADD.
+ */
+static boolean
+emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "MUL:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = s1 * s2 + s3, as a single spe_fma */
+         spe_fma(gen->f, d_reg, s1_reg, s2_reg, s3_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
 
 /**
  * Emit multiply.  See emit_ADD for comments.
@@ -411,7 +463,6 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
-
 /**
  * Emit set-if-greater-than.
  * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
@@ -570,6 +621,10 @@ emit_instruction(struct codegen *gen,
       return emit_MUL(gen, inst);
    case TGSI_OPCODE_ADD:
       return emit_ADD(gen, inst);
+   case TGSI_OPCODE_SUB:
+      return emit_SUB(gen, inst);
+   case TGSI_OPCODE_MAD:
+      return emit_MAD(gen, inst);
    case TGSI_OPCODE_SGT:
       return emit_SGT(gen, inst);
    case TGSI_OPCODE_END:
-- 
cgit v1.2.3


From 0a75773fed3f2d74d697fae5aee9ae8f18298631 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Mon, 15 Sep 2008 12:27:10 -0600
Subject: cell: Added support for ABS instruction

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 31 +++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 9eae57bb76b..33f3c74b569 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -424,7 +424,7 @@ static boolean
 emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
-   spe_comment(gen->f, -4, "MUL:");
+   spe_comment(gen->f, -4, "MAD:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -463,6 +463,33 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit absolute value: d = s1 with the IEEE sign bit (bit 31) cleared.
+ */
+static boolean
+emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "ABS:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         const int bit31mask_reg = get_itemp(gen);
+
+         /* mask with only bit 31 set (spelled without the UB of 1 << 31) */
+         spe_load_int(gen->f, bit31mask_reg, ~0x7fffffff);
+
+         /* d = sign bit cleared in s1 */
+         spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
 /**
  * Emit set-if-greater-than.
  * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
@@ -625,6 +652,8 @@ emit_instruction(struct codegen *gen,
       return emit_SUB(gen, inst);
    case TGSI_OPCODE_MAD:
       return emit_MAD(gen, inst);
+   case TGSI_OPCODE_ABS:
+      return emit_ABS(gen, inst);
    case TGSI_OPCODE_SGT:
       return emit_SGT(gen, inst);
    case TGSI_OPCODE_END:
-- 
cgit v1.2.3


From 81aa90e8837128423e37a776cdfbf63b0604903f Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Mon, 15 Sep 2008 13:45:09 -0600
Subject: cell: Added support for SLT, SEQ and SNE instructions

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 100 +++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 33f3c74b569..c48200d5ccc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -524,6 +524,100 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit set-if-less-than.  See emit_SGT for comments.
+ */
+static boolean
+emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SLT:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 < s2), computed as (s2 > s1) by swapping fcgt operands */
+         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if-equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SEQ:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 == s2), via the fceq compare */
+         spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if-not-equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SNE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 != s2): compare equal, then invert with nor(d, d) */
+         spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+         spe_nor(gen->f, d_reg, d_reg, d_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
 
 static boolean
 emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
@@ -656,6 +750,12 @@ emit_instruction(struct codegen *gen,
       return emit_ABS(gen, inst);
    case TGSI_OPCODE_SGT:
       return emit_SGT(gen, inst);
+   case TGSI_OPCODE_SLT:
+      return emit_SLT(gen, inst);
+   case TGSI_OPCODE_SEQ:
+      return emit_SEQ(gen, inst);
+   case TGSI_OPCODE_SNE:
+      return emit_SNE(gen, inst);
    case TGSI_OPCODE_END:
       return emit_END(gen);
 
-- 
cgit v1.2.3


From e67374b6b2f6fd846c368ec70e80f0f4cf508f97 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Mon, 15 Sep 2008 15:45:51 -0600
Subject: cell: Added LERP instruction

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 92 +++++++++++++++++++++++++++++-
 1 file changed, 91 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index c48200d5ccc..7a672478c5d 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -52,7 +52,7 @@
 
 
 /** Set to 1 to enable debug/disassembly printfs */
-#define DISASSEM 1
+#define DISASSEM 0
 
 
 #define MAX_TEMPS 16
@@ -441,6 +441,31 @@ emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 
+/**
+ * Emit linear interpolate: d = s3 + s1 * (s2 - s3), built up in d_reg.
+ */
+static boolean
+emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "LERP:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = s3 + s1(s2 - s3); NOTE(review): assumes d_reg != s1/s3 reg */
+         spe_fs(gen->f, d_reg, s2_reg, s3_reg);
+         spe_fm(gen->f, d_reg, d_reg, s1_reg);
+         spe_fa(gen->f, d_reg, d_reg, s3_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
 /**
  * Emit multiply.  See emit_ADD for comments.
  */
@@ -618,6 +643,65 @@ emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit max: d = (s1 > s2) ? s1 : s2 for each enabled channel.
+ */
+static boolean
+emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "MAX:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int mask_reg = get_itemp(gen); /* in case d_reg is also a source */
+
+         /* mask = (s1 > s2) ? 0xffffffff : 0x0 */
+         spe_fcgt(gen->f, mask_reg, s1_reg, s2_reg);
+         /* d = mask ? s1 : s2 (selb takes the 2nd source where mask is set) */
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, mask_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit min: d = (s1 < s2) ? s1 : s2 for each enabled channel.
+ */
+static boolean
+emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "MIN:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int mask_reg = get_itemp(gen); /* in case d_reg is also a source */
+
+         /* mask = (s1 < s2) ? 0xffffffff : 0x0, computed as (s2 > s1) */
+         spe_fcgt(gen->f, mask_reg, s2_reg, s1_reg);
+         /* d = mask ? s1 : s2 (selb takes the 2nd source where mask is set) */
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, mask_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
 
 static boolean
 emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
@@ -746,6 +830,8 @@ emit_instruction(struct codegen *gen,
       return emit_SUB(gen, inst);
    case TGSI_OPCODE_MAD:
       return emit_MAD(gen, inst);
+   case TGSI_OPCODE_LERP:
+      return emit_LERP(gen, inst);
    case TGSI_OPCODE_ABS:
       return emit_ABS(gen, inst);
    case TGSI_OPCODE_SGT:
@@ -756,6 +842,10 @@ emit_instruction(struct codegen *gen,
       return emit_SEQ(gen, inst);
    case TGSI_OPCODE_SNE:
       return emit_SNE(gen, inst);
+   case TGSI_OPCODE_MAX:
+      return emit_MAX(gen, inst);
+   case TGSI_OPCODE_MIN:
+      return emit_MIN(gen, inst);
    case TGSI_OPCODE_END:
       return emit_END(gen);
 
-- 
cgit v1.2.3


From 32250eb959b1355b2f6984ea892a86a6ecf9d3c3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 15 Sep 2008 19:38:39 -0600
Subject: cell: export CELL_DEBUG=asm to dump SPU assembly code

---
 src/gallium/drivers/cell/common.h           |  3 +-
 src/gallium/drivers/cell/ppu/cell_context.c |  1 +
 src/gallium/drivers/cell/ppu/cell_gen_fp.c  | 56 ++++++++++++++---------------
 3 files changed, 31 insertions(+), 29 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index cb0631baf52..8f088541173 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -106,7 +106,8 @@
 
 
 #define CELL_DEBUG_CHECKER  (1 << 0)
-#define CELL_DEBUG_SYNC     (1 << 1)
+#define CELL_DEBUG_ASM      (1 << 1)
+#define CELL_DEBUG_SYNC     (1 << 2)
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 0a5c0baa471..b418271dca2 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -87,6 +87,7 @@ cell_draw_create(struct cell_context *cell)
 
 static const struct debug_named_value cell_debug_flags[] = {
    {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
+   {"asm", CELL_DEBUG_ASM},        /**< dump SPU asm code */
    {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
    {NULL, 0}
 };
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 7a672478c5d..98ee5af2790 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -51,10 +51,6 @@
 #include "cell_gen_fp.h"
 
 
-/** Set to 1 to enable debug/disassembly printfs */
-#define DISASSEM 0
-
-
 #define MAX_TEMPS 16
 #define MAX_IMMED  8
 
@@ -864,6 +860,8 @@ emit_instruction(struct codegen *gen,
    /* XXX lots more cases to do... */
 
    default:
+      fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n",
+              inst->Instruction.Opcode);
       return false;
    }
 
@@ -914,17 +912,19 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
  * For each TGSI TEMPORARY we allocate four SPE registers.
  */
 static boolean
-emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
+emit_declaration(struct cell_context *cell,
+                 struct codegen *gen, const struct tgsi_full_declaration *decl)
 {
    int i, ch;
 
    switch (decl->Declaration.File) {
    case TGSI_FILE_TEMPORARY:
-#if DISASSEM
-      printf("Declare temp reg %d .. %d\n",
-             decl->DeclarationRange.First,
-             decl->DeclarationRange.Last);
-#endif
+      if (cell->debug_flags & CELL_DEBUG_ASM) {
+         printf("Declare temp reg %d .. %d\n",
+                decl->DeclarationRange.First,
+                decl->DeclarationRange.Last);
+      }
+
       for (i = decl->DeclarationRange.First;
            i <= decl->DeclarationRange.Last;
            i++) {
@@ -939,13 +939,13 @@ emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl)
           * to SPU memory.  someday...
           */
 
-#if DISASSEM
-         printf("  SPE regs: %d %d %d %d\n",
-                gen->temp_regs[i][0],
-                gen->temp_regs[i][1],
-                gen->temp_regs[i][2],
-                gen->temp_regs[i][3]);
-#endif
+         if (cell->debug_flags & CELL_DEBUG_ASM) {
+            printf("  SPE regs: %d %d %d %d\n",
+                   gen->temp_regs[i][0],
+                   gen->temp_regs[i][1],
+                   gen->temp_regs[i][2],
+                   gen->temp_regs[i][3]);
+         }
       }
       break;
    default:
@@ -985,12 +985,12 @@ cell_gen_fragment_program(struct cell_context *cell,
    spe_allocate_register(f, gen.outputs_reg);
    spe_allocate_register(f, gen.constants_reg);
 
-#if DISASSEM
-   spe_print_code(f, true);
-   spe_indent(f, 8);
-   printf("Begin %s\n", __FUNCTION__);
-   tgsi_dump(tokens, 0);
-#endif
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_print_code(f, true);
+      spe_indent(f, 8);
+      printf("Begin %s\n", __FUNCTION__);
+      tgsi_dump(tokens, 0);
+   }
 
    tgsi_parse_init(&parse, tokens);
 
@@ -1004,7 +1004,7 @@ cell_gen_fragment_program(struct cell_context *cell,
          break;
 
       case TGSI_TOKEN_TYPE_DECLARATION:
-         if (!emit_declaration(&gen, &parse.FullToken.FullDeclaration))
+         if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration))
             gen.error = true;
          break;
 
@@ -1024,10 +1024,10 @@ cell_gen_fragment_program(struct cell_context *cell,
       return emit_END(&gen);
    }
 
-#if DISASSEM
-   printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
-   printf("End %s\n", __FUNCTION__);
-#endif
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst);
+      printf("End %s\n", __FUNCTION__);
+   }
 
    tgsi_parse_free( &parse );
 
-- 
cgit v1.2.3


From 2c54a6ee798ae22f92ef1fc4a1658ec5e701388a Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Tue, 16 Sep 2008 09:36:38 -0600
Subject: cell: Fixed MIN/MAX algorithm

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 98ee5af2790..612749507be 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -657,9 +657,7 @@ emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
          /* d = (s1 > s2) ? s1 : s2 */
          spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-         spe_and(gen->f, d_reg, d_reg, s1_reg);
-         spe_nor(gen->f, d_reg, d_reg, d_reg);
-         spe_and(gen->f, d_reg, d_reg, s2_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -685,11 +683,9 @@ emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
 
-         /* d = (s1 < s2) ? s1 : s2 */
+         /* d = (s2 > s1) ? s1 : s2 */
          spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
-         spe_and(gen->f, d_reg, d_reg, s1_reg);
-         spe_nor(gen->f, d_reg, d_reg, d_reg);
-         spe_and(gen->f, d_reg, d_reg, s2_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
-- 
cgit v1.2.3


From dd75ca89ebce58a69da20c1efbf2a53575b2c96e Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Tue, 16 Sep 2008 09:42:28 -0600
Subject: cell: Optimized LERP with fma

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 612749507be..a80d8ff5d69 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -453,8 +453,7 @@ emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
          /* d = s3 + s1(s2 - s3) */
          spe_fs(gen->f, d_reg, s2_reg, s3_reg);
-         spe_fm(gen->f, d_reg, d_reg, s1_reg);
-         spe_fa(gen->f, d_reg, d_reg, s3_reg);
+         spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg);
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -657,7 +656,7 @@ emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
          /* d = (s1 > s2) ? s1 : s2 */
          spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
-- 
cgit v1.2.3


From fbbaad14a6b6de07631d5a9fd6e4b847a9e9dd5a Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Tue, 16 Sep 2008 13:56:56 -0600
Subject: cell: Added DP3 and DP4 instructions

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 81 ++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index a80d8ff5d69..34d283b51ed 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -54,6 +54,10 @@
 #define MAX_TEMPS 16
 #define MAX_IMMED  8
 
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
 
 /**
  * Context needed during code generation.
@@ -510,6 +514,79 @@ emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit 3 component dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "DP3:");
+
+   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]);
+   /* d = x * x */
+   spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* d = y * y + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* d = z * z + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit 4 component dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "DP3:");
+
+   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]);
+   /* d = x * x */
+   spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* d = y * y + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* d = z * z + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+   /* d = w * w + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
 /**
  * Emit set-if-greater-than.
  * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
@@ -823,6 +900,10 @@ emit_instruction(struct codegen *gen,
       return emit_MAD(gen, inst);
    case TGSI_OPCODE_LERP:
       return emit_LERP(gen, inst);
+   case TGSI_OPCODE_DP3:
+      return emit_DP3(gen, inst);
+   case TGSI_OPCODE_DP4:
+      return emit_DP4(gen, inst);
    case TGSI_OPCODE_ABS:
       return emit_ABS(gen, inst);
    case TGSI_OPCODE_SGT:
-- 
cgit v1.2.3


From a3a797ffa84975330d5632ce7a71c65c9c2ad0d8 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Tue, 16 Sep 2008 16:00:42 -0600
Subject: cell: Added RCP and RSQ instruction support.

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 48 ++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 34d283b51ed..77386b30250 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -487,6 +487,50 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit reciprocal.  See emit_ADD for comments.
+ */
+static boolean
+emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "RCP:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = 1/s1 */
+         spe_frest(gen->f, d_reg, s1_reg);
+         spe_fi(gen->f, d_reg, s1_reg, d_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
+/**
+ * Emit reciprocal sqrt.  See emit_ADD for comments.
+ */
+static boolean
+emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "RSQ:");
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         /* d = 1/sqrt(s1) */
+         spe_frsqest(gen->f, d_reg, s1_reg);
+         spe_fi(gen->f, d_reg, s1_reg, d_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+   return true;
+}
+
 /**
  * Emit absolute value.  See emit_ADD for comments.
  */
@@ -904,6 +948,10 @@ emit_instruction(struct codegen *gen,
       return emit_DP3(gen, inst);
    case TGSI_OPCODE_DP4:
       return emit_DP4(gen, inst);
+   case TGSI_OPCODE_RCP:
+      return emit_RCP(gen, inst);
+   case TGSI_OPCODE_RSQ:
+      return emit_RSQ(gen, inst);
    case TGSI_OPCODE_ABS:
       return emit_ABS(gen, inst);
    case TGSI_OPCODE_SGT:
-- 
cgit v1.2.3


From 858ced051551aa5d0ddd41936253d3a4ee5c142f Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Wed, 17 Sep 2008 02:30:20 -0600
Subject: CELL: fleshing out the blending fragment ops

- Added two new debug flags (to be used with the CELL_DEBUG environment
  variable).  The first, "CELL_DEBUG=fragops", activates SPE fragment
  ops debug messages.  The second, "CELL_DEBUG=fragopfallback", will
  eventually be used to disable the use of generated SPE code for
  fragment ops in favor of the default fallback reference routine.
  (During development, though, the parity of this flag is reversed:
  all users will get the reference code *unless* CELL_DEBUG=fragopfallback
  is set.  This will prevent hiccups in code generation from affecting
  the other developers.)

- Formalized debug message usage and macros in spu/spu_main.c.

- Added lots of new code to ppu/cell_gen_fragment.c to extend the
  number of supported source RGB factors from 4 to 15, and to
  complete the list of supported blend equations.

More coming, to complete the source and destination RGB and alpha
factors, and to complete the rest of the fragment operations...
---
 src/gallium/drivers/cell/common.h                |  11 +-
 src/gallium/drivers/cell/ppu/cell_context.c      |   2 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 272 ++++++++++++++++++++++-
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |   5 +
 src/gallium/drivers/cell/spu/spu_main.c          | 115 +++++-----
 5 files changed, 337 insertions(+), 68 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 8f088541173..f0ff96eb478 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -104,12 +104,11 @@
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
 
-
-#define CELL_DEBUG_CHECKER  (1 << 0)
-#define CELL_DEBUG_ASM      (1 << 1)
-#define CELL_DEBUG_SYNC     (1 << 2)
-
-
+#define CELL_DEBUG_CHECKER              (1 << 0)
+#define CELL_DEBUG_ASM                  (1 << 1)
+#define CELL_DEBUG_SYNC                 (1 << 2)
+#define CELL_DEBUG_FRAGMENT_OPS         (1 << 3)
+#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4)
 
 /** Max instructions for doing per-fragment operations */
 #define SPU_MAX_FRAGMENT_OPS_INSTS 64
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index b418271dca2..62e213ea354 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -89,6 +89,8 @@ static const struct debug_named_value cell_debug_flags[] = {
    {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
    {"asm", CELL_DEBUG_ASM},        /**< dump SPU asm code */
    {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
+   {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/
+   {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/
    {NULL, 0}
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 06219d4e980..2c8c9e0d2c6 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -229,7 +229,36 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
    spe_release_register(f, amask_reg);
 }
 
+/* This is a convenient and oft-used sequence.  It chooses
+ * the smaller of each element of reg1 and reg2, and combines them
+ * into the result register, as follows:
+ * 
+ * The Float Compare Greater Than (fcgt) instruction will put
+ * 1s into compare_reg where reg1 > reg2, and 0s where reg1 <= reg2.
+ *
+ * Then the Select Bits (selb) instruction will take bits from
+ * reg1 where compare_reg is 0, and from reg2 where compare_reg is
+ * 1.  Ergo, result_reg will have the bits from reg1 where reg1 <= reg2,
+ * and the bits from reg2 where reg1 > reg2, which is exactly the
+ * MIN operation.
+ */
+#define FLOAT_VECTOR_MIN(f, result_reg, reg1, reg2) {\
+   int compare_reg = spe_allocate_available_register(f); \
+   spe_fcgt(f, compare_reg, reg1, reg2); \
+   spe_selb(f, result_reg, reg1, reg2, compare_reg); \
+   spe_release_register(f, compare_reg); \
+}
 
+/* The FLOAT_VECTOR_MAX sequence is similar to the FLOAT_VECTOR_MIN 
+ * sequence above, except that the registers specified when selecting
+ * bits are reversed.
+ */
+#define FLOAT_VECTOR_MAX(f, result_reg, reg1, reg2) {\
+   int compare_reg = spe_allocate_available_register(f); \
+   spe_fcgt(f, compare_reg, reg1, reg2); \
+   spe_selb(f, result_reg, reg2, reg1, compare_reg); \
+   spe_release_register(f, compare_reg); \
+}
 
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
@@ -242,6 +271,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
  */
 static void
 gen_blend(const struct pipe_blend_state *blend,
+          const struct pipe_blend_color *blend_color,
           struct spe_function *f,
           enum pipe_format color_format,
           int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
@@ -262,10 +292,53 @@ gen_blend(const struct pipe_blend_state *blend,
    int fbB_reg = spe_allocate_available_register(f);
    int fbA_reg = spe_allocate_available_register(f);
 
-   int one_reg = spe_allocate_available_register(f);
    int tmp_reg = spe_allocate_available_register(f);
 
-   boolean one_reg_set = false; /* avoid setting one_reg more than once */
+   /* These values might or might not eventually get put into
+    * registers.  We avoid allocating them and setting them until
+    * they're actually needed; then we avoid setting them more than
+    * once, and release them at the end of code generation.
+    */
+   boolean one_reg_set = false; 
+   int one_reg;
+#define SET_ONE_REG_IF_UNSET(f) if (!one_reg_set) {\
+   one_reg = spe_allocate_available_register(f); \
+   spe_load_float(f, one_reg, 1.0f); \
+   one_reg_set = true; \
+}
+#define RELEASE_ONE_REG_IF_USED(f) if (one_reg_set) {\
+   spe_release_register(f, one_reg); \
+}
+  
+   boolean const_color_set = false;
+   int constR_reg, constG_reg, constB_reg;
+#define SET_CONST_COLOR_IF_UNSET(f, blend_color) if (!const_color_set) {\
+   constR_reg = spe_allocate_available_register(f); \
+   constG_reg = spe_allocate_available_register(f); \
+   constG_reg = spe_allocate_available_register(f); \
+   spe_load_float(f, constR_reg, blend_color->color[0]); \
+   spe_load_float(f, constG_reg, blend_color->color[1]); \
+   spe_load_float(f, constB_reg, blend_color->color[2]); \
+   const_color_set = true;\
+}
+#define RELEASE_CONST_COLOR_IF_USED(f) if (const_color_set) {\
+   spe_release_register(f, constR_reg); \
+   spe_release_register(f, constG_reg); \
+   spe_release_register(f, constB_reg); \
+}
+
+   boolean const_alpha_set = false;
+   int constA_reg;
+#define SET_CONST_ALPHA_IF_UNSET(f, blend_color) if (!const_alpha_set) {\
+   constA_reg = spe_allocate_available_register(f); \
+   spe_load_float(f, constA_reg, blend_color->color[3]); \
+   const_alpha_set = true; \
+}
+#define RELEASE_CONST_ALPHA_IF_USED(f) if (const_alpha_set) {\
+   spe_release_register(f, constA_reg); \
+}
+
+   /* Real code starts here */
 
    ASSERT(blend->blend_enable);
 
@@ -348,30 +421,161 @@ gen_blend(const struct pipe_blend_state *blend,
 
 
    /*
-    * Compute Src RGB terms
+    * Compute Src RGB terms.  We're actually looking for the value
+    * of (the appropriate RGB factors) * (the incoming source RGB color).
     */
    switch (blend->rgb_src_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (R,G,B) */
       spe_move(f, term1R_reg, fragR_reg);
       spe_move(f, term1G_reg, fragG_reg);
       spe_move(f, term1B_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term1R_reg);
-      spe_zero(f, term1G_reg);
-      spe_zero(f, term1B_reg);
+      /* factors = (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term1R_reg, 0.0f);
+      spe_load_float(f, term1G_reg, 0.0f);
+      spe_load_float(f, term1B_reg, 0.0f);
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*R, G*G, B*B) */
       spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
       spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
       spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (R*A, G*A, B*A) */
       spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
       spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
       spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) */
+      /* we'll need the optional constant {1,1,1,1} register */
+      SET_ONE_REG_IF_UNSET(f)
+      /* tmp = 1 - R */
+      spe_fs(f, tmp_reg, one_reg, fragR_reg);
+      /* term = R * tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      /* repeat for G and B */
+      spe_fs(f, tmp_reg, one_reg, fragG_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fs(f, tmp_reg, one_reg, fragB_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) */
+      /* we'll need the optional constant {1,1,1,1} register */
+      SET_ONE_REG_IF_UNSET(f)
+      /* tmp = 1 - Rfb */
+      spe_fs(f, tmp_reg, one_reg, fbR_reg);
+      /* term = R * tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      /* repeat for G and B */
+      spe_fs(f, tmp_reg, one_reg, fbG_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fs(f, tmp_reg, one_reg, fbB_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) */
+      /* we'll need the optional constant {1,1,1,1} register */
+      SET_ONE_REG_IF_UNSET(f)
+      /* tmp = 1 - A */
+      spe_fs(f, tmp_reg, one_reg, fragA_reg);
+      /* term = R * tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      /* repeat for G and B with the same (1-A) factor */
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
+      spe_fm(f, term1R_reg, fragR_reg, fbA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fbA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) */
+      /* we'll need the optional constant {1,1,1,1} register */
+      SET_ONE_REG_IF_UNSET(f)
+      /* tmp = 1 - A */
+      spe_fs(f, tmp_reg, one_reg, fbA_reg);
+      /* term = R * tmp, G*tmp, and B*tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We'll need the optional blend color registers */
+      SET_CONST_COLOR_IF_UNSET(f,blend_color)
+      /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
+      spe_fm(f, term1R_reg, fragR_reg, constR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      SET_CONST_ALPHA_IF_UNSET(f, blend_color)
+      /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+      spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need both the optional {1,1,1,1} register, and the optional
+       * constant color registers
+       */
+      SET_ONE_REG_IF_UNSET(f)
+      SET_CONST_COLOR_IF_UNSET(f, blend_color)
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) */
+      spe_fs(f, tmp_reg, one_reg, constR_reg);
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fs(f, tmp_reg, one_reg, constG_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fs(f, tmp_reg, one_reg, constB_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional {1,1,1,1} register and the optional 
+       * constant alpha register
+       */
+      SET_ONE_REG_IF_UNSET(f)
+      SET_CONST_ALPHA_IF_UNSET(f, blend_color)
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac)) */
+      spe_fs(f, tmp_reg, one_reg, constA_reg);
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* We'll need the optional {1,1,1,1} register */
+      SET_ONE_REG_IF_UNSET(f)
+      /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so 
+       * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       */
+      /* tmp = 1 - Afb */
+      spe_fs(f, tmp_reg, one_reg, fbA_reg);
+      /* tmp = min(A,tmp) */
+      FLOAT_VECTOR_MIN(f, tmp_reg, fragA_reg, tmp_reg)
+      /* term = R*tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+
+      /* non-OpenGL cases? */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
    default:
       ASSERT(0);
    }
@@ -421,6 +625,7 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
       /* one = {1.0, 1.0, 1.0, 1.0} */
       if (!one_reg_set) {
+         one_reg = spe_allocate_available_register(f);
          spe_load_float(f, one_reg, 1.0f);
          one_reg_set = true;
       }
@@ -432,6 +637,14 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
       break;
       /* XXX more cases */
+      // GL_ONE_MINUS_SRC_COLOR
+      // GL_DST_COLOR
+      // GL_ONE_MINUS_DST_COLOR
+      // GL_DST_ALPHA
+      // GL_CONSTANT_COLOR
+      // GL_ONE_MINUS_CONSTANT_COLOR
+      // GL_CONSTANT_ALPHA
+      // GL_ONE_MINUS_CONSTANT_ALPHA
    default:
       ASSERT(0);
    }
@@ -452,6 +665,7 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
       /* one = {1.0, 1.0, 1.0, 1.0} */
       if (!one_reg_set) {
+         one_reg = spe_allocate_available_register(f);
          spe_load_float(f, one_reg, 1.0f);
          one_reg_set = true;
       }
@@ -461,6 +675,14 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
       break;
       /* XXX more cases */
+      // GL_ONE_MINUS_SRC_COLOR
+      // GL_DST_COLOR
+      // GL_ONE_MINUS_DST_COLOR
+      // GL_DST_ALPHA
+      // GL_CONSTANT_COLOR
+      // GL_ONE_MINUS_CONSTANT_COLOR
+      // GL_CONSTANT_ALPHA
+      // GL_ONE_MINUS_CONSTANT_ALPHA
    default:
       ASSERT(0);
    }
@@ -479,7 +701,21 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
       spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragR_reg, term2R_reg, term1R_reg);
+      spe_fs(f, fragG_reg, term2G_reg, term1G_reg);
+      spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      FLOAT_VECTOR_MIN(f, fragR_reg, term1R_reg, term2R_reg)
+      FLOAT_VECTOR_MIN(f, fragG_reg, term1G_reg, term2G_reg)
+      FLOAT_VECTOR_MIN(f, fragB_reg, term1B_reg, term2B_reg)
+      break;
+   case PIPE_BLEND_MAX:
+      FLOAT_VECTOR_MAX(f, fragR_reg, term1R_reg, term2R_reg)
+      FLOAT_VECTOR_MAX(f, fragG_reg, term1G_reg, term2G_reg)
+      FLOAT_VECTOR_MAX(f, fragB_reg, term1B_reg, term2B_reg)
+      break;
    default:
       ASSERT(0);
    }
@@ -494,7 +730,15 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLEND_SUBTRACT:
       spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
       break;
-      /* XXX more cases */
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
+      break;
+   case PIPE_BLEND_MIN:
+      FLOAT_VECTOR_MIN(f, fragA_reg, term1A_reg, term2A_reg)
+      break;
+   case PIPE_BLEND_MAX:
+      FLOAT_VECTOR_MAX(f, fragA_reg, term1A_reg, term2A_reg)
+      break;
    default:
       ASSERT(0);
    }
@@ -514,8 +758,12 @@ gen_blend(const struct pipe_blend_state *blend,
    spe_release_register(f, fbB_reg);
    spe_release_register(f, fbA_reg);
 
-   spe_release_register(f, one_reg);
    spe_release_register(f, tmp_reg);
+
+   /* Free any optional registers that actually got used */
+   RELEASE_ONE_REG_IF_USED(f)
+   RELEASE_CONST_COLOR_IF_USED(f)
+   RELEASE_CONST_ALPHA_IF_USED(f)
 }
 
 
@@ -629,6 +877,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const struct pipe_depth_stencil_alpha_state *dsa =
       &cell->depth_stencil->base;
    const struct pipe_blend_state *blend = &cell->blend->base;
+   const struct pipe_blend_color *blend_color = &cell->blend_color;
    const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
 
    /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
@@ -651,7 +900,6 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
    int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
-   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
    spe_allocate_register(f, x_reg);
    spe_allocate_register(f, y_reg);
    spe_allocate_register(f, color_tile_reg);
@@ -816,7 +1064,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
 
    if (blend->blend_enable) {
-      gen_blend(blend, f, color_format,
+      gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 2da3097983c..8a389cd6aae 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -100,14 +100,19 @@ cell_emit_state(struct cell_context *cell)
             = cell_batch_alloc(cell, sizeof(*fops));
       struct spe_function spe_code;
 
+      /* Prepare the buffer that will hold the generated code. */
+      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
       /* generate new code */
       cell_gen_fragment_function(cell, &spe_code);
+
       /* put the new code into the batch buffer */
       fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
       memcpy(&fops->code, spe_code.store,
              SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
       fops->dsa = cell->depth_stencil->base;
       fops->blend = cell->blend->base;
+
       /* free codegen buffer */
       spe_release_func(&spe_code);
    }
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 78260c4259c..da2cb089722 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -50,7 +50,31 @@ helpful headers:
 /opt/cell/sdk/usr/include/libmisc.h
 */
 
+/* Set to 0 to disable all extraneous debugging code */
+#define DEBUG 1
+
+#if DEBUG
 boolean Debug = FALSE;
+boolean force_fragment_ops_fallback = TRUE;
+
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define DEBUG_PRINTF(format,...) \
+   if (Debug) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+#define D_PRINTF(flag, format,...) \
+   if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
+
+#else
+
+#define DEBUG_PRINTF(...)
+#define D_PRINTF(...)
+
+#endif
 
 struct spu_global spu;
 
@@ -133,9 +157,7 @@ really_clear_tiles(uint surfaceIndex)
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
-   if (Debug)
-      printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
-             clear->surface, clear->value);
+   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
@@ -203,17 +225,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
 #endif /* CLEAR_OPT */
 
-   if (Debug)
-      printf("SPU %u: CLEAR SURF done\n", spu.init.id);
+   DEBUG_PRINTF("CLEAR SURF done\n");
 }
 
 
 static void
 cmd_release_verts(const struct cell_command_release_verts *release)
 {
-   if (Debug)
-      printf("SPU %u: RELEASE VERTS %u\n",
-             spu.init.id, release->vertex_buf);
+   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
    ASSERT(release->vertex_buf != ~0U);
    release_buffer(release->vertex_buf);
 }
@@ -228,16 +247,30 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 static void
 cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
-   if (Debug)
-      printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
 
-   /* Point function pointer at new code */
-   spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+   /* Parity twist!  For now, always use the fallback code by default,
+    * only switching to codegen when specifically requested.  This
+    * allows us to develop freely without risking taking down the
+    * branch.
+    *
+    * Later, the parity of this check will be reversed, so that
+    * codegen is *always* used, unless we specifically indicate that
+    * we don't want it.
+    *
+    * Eventually, the option will be removed completely, because in
+    * final code we'll always use codegen and won't even provide the
+    * raw state records that the fallback code requires.
+    */
+   if (spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) {
+      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+   }
+   /* otherwise, the default fallback code remains in place */
 
    spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
    spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
@@ -247,8 +280,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 {
-   if (Debug)
-      printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id);
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_program_code, fp->code,
           SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
@@ -262,9 +294,7 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
-   if (Debug)
-      printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
-             spu.init.id,
+   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
              cmd->width,
              cmd->height,
              cmd->color_start,
@@ -309,9 +339,7 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
-   if (Debug)
-      printf("SPU %u: SAMPLER [%u]\n",
-             spu.init.id, sampler->unit);
+   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
@@ -328,11 +356,9 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint width = texture->width;
    const uint height = texture->height;
 
-   if (Debug) {
-      printf("SPU %u: TEXTURE [%u] at %p  size %u x %u\n", spu.init.id,
+   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
              texture->unit, texture->start,
              texture->width, texture->height);
-   }
 
    spu.texture[unit].start = texture->start;
    spu.texture[unit].width = width;
@@ -351,10 +377,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
-   if (Debug) {
-      printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
-             vinfo->num_attribs);
-   }
+   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
    ASSERT(vinfo->num_attribs >= 1);
    ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
@@ -393,8 +416,7 @@ cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
 static void
 cmd_finish(void)
 {
-   if (Debug)
-      printf("SPU %u: FINISH\n", spu.init.id);
+   DEBUG_PRINTF("FINISH\n");
    really_clear_tiles(0);
    /* wait for all outstanding DMAs to finish */
    mfc_write_tag_mask(~0);
@@ -419,9 +441,8 @@ cmd_batch(uint opcode)
    const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
-   if (Debug)
-      printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
-             spu.init.id, buf, size, spu.init.buffers[buf]);
+   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+             buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
 
@@ -440,8 +461,7 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   if (Debug)
-      printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
+   DEBUG_PRINTF("release batch buf %u\n", buf);
    release_buffer(buf);
 
    /*
@@ -571,8 +591,7 @@ cmd_batch(uint opcode)
       }
    }
 
-   if (Debug)
-      printf("SPU %u: BATCH complete\n", spu.init.id);
+   DEBUG_PRINTF("BATCH complete\n");
 }
 
 
@@ -585,8 +604,7 @@ main_loop(void)
    struct cell_command cmd;
    int exitFlag = 0;
 
-   if (Debug)
-      printf("SPU %u: Enter main loop\n", spu.init.id);
+   DEBUG_PRINTF("Enter main loop\n");
 
    ASSERT((sizeof(struct cell_command) & 0xf) == 0);
    ASSERT_ALIGN16(&cmd);
@@ -595,14 +613,12 @@ main_loop(void)
       unsigned opcode;
       int tag = 0;
 
-      if (Debug)
-         printf("SPU %u: Wait for cmd...\n", spu.init.id);
+      DEBUG_PRINTF("Wait for cmd...\n");
 
       /* read/wait from mailbox */
       opcode = (unsigned int) spu_read_in_mbox();
 
-      if (Debug)
-         printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode);
+      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
 
       /* command payload */
       mfc_get(&cmd,  /* dest */
@@ -619,8 +635,7 @@ main_loop(void)
 
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
-         if (Debug)
-            printf("SPU %u: EXIT\n", spu.init.id);
+         DEBUG_PRINTF("EXIT\n");
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
@@ -632,13 +647,12 @@ main_loop(void)
          cmd_batch(opcode);
          break;
       default:
-         printf("Bad opcode!\n");
+         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
       }
 
    }
 
-   if (Debug)
-      printf("SPU %u: Exit main loop\n", spu.init.id);
+   DEBUG_PRINTF("Exit main loop\n");
 
    spu_dcache_report();
 }
@@ -653,7 +667,8 @@ one_time_init(void)
    invalidate_tex_cache();
 
    /* Install default/fallback fragment processing function.
-    * This will normally be overriden by a code-gen'd function.
+    * This will normally be overridden by a code-gen'd function
+    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
     */
    spu.fragment_ops = spu_fallback_fragment_ops;
 }
@@ -685,8 +700,8 @@ main(main_param_t speid, main_param_t argp)
 
    one_time_init();
 
-   if (Debug)
-      printf("SPU: main() speid=%lu\n", (unsigned long) speid);
+   DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
+   D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
 
    mfc_get(&spu.init,  /* dest */
            (unsigned int) argp, /* src */
-- 
cgit v1.2.3


From 05aeb92a092c26e7773beb95692fc72e70a40e56 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 17 Sep 2008 08:11:42 -0600
Subject: cell: dump generated code if CELL_DEBUG=asm

---
 progs/demos/fslight.c                            |  2 +-
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 13 ++++++++++-
 src/gallium/drivers/cell/ppu/cell_screen.c       | 14 ++++++------
 src/gallium/drivers/softpipe/sp_fs_exec.c        | 29 ++++++++++++++++++++----
 4 files changed, 44 insertions(+), 14 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/progs/demos/fslight.c b/progs/demos/fslight.c
index e79b5cc1970..c7931f4697e 100644
--- a/progs/demos/fslight.c
+++ b/progs/demos/fslight.c
@@ -45,7 +45,7 @@ static GLint uTexture;
 
 static GLuint SphereList, RectList, CurList;
 static GLint win = 0;
-static GLboolean anim = GL_TRUE;
+static GLboolean anim = 0*GL_TRUE;
 static GLboolean wire = GL_FALSE;
 static GLboolean pixelLight = GL_TRUE;
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 2c8c9e0d2c6..99407b8acee 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -900,6 +900,14 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
    int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
+   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_print_code(f, true);
+      spe_indent(f, 8);
+      spe_comment(f, -4, "Begin per-fragment ops");
+   }
+
    spe_allocate_register(f, x_reg);
    spe_allocate_register(f, y_reg);
    spe_allocate_register(f, color_tile_reg);
@@ -1114,5 +1122,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_release_register(f, fbRGBA_reg);
    spe_release_register(f, fbZS_reg);
    spe_release_register(f, quad_offset_reg);
-}
 
+   if (cell->debug_flags & CELL_DEBUG_ASM) {
+      spe_comment(f, -4, "End per-fragment ops");
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 139b3719b62..47ba6fa2909 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -58,9 +58,9 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
       return CELL_MAX_SAMPLERS;
    case PIPE_CAP_NPOT_TEXTURES:
-      return 0;
+      return 1;
    case PIPE_CAP_TWO_SIDED_STENCIL:
-      return 0;
+      return 1;
    case PIPE_CAP_GLSL:
       return 1;
    case PIPE_CAP_S3TC:
@@ -68,13 +68,13 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
-      return 0;
+      return 1;
    case PIPE_CAP_MAX_RENDER_TARGETS:
       return 1;
    case PIPE_CAP_OCCLUSION_QUERY:
-      return 0;
+      return 1;
    case PIPE_CAP_TEXTURE_SHADOW_MAP:
-      return 0;
+      return 10;
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
       return 12; /* max 2Kx2K */
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
@@ -82,7 +82,7 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 12; /* max 2Kx2K */
    default:
-      return 0;
+      return 10;
    }
 }
 
@@ -108,7 +108,7 @@ cell_get_paramf(struct pipe_screen *screen, int param)
       return 16.0; /* arbitrary */
 
    default:
-      return 0;
+      return 10;
    }
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 701ee4c72f2..ffc0c5e578e 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -39,11 +39,20 @@
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_parse.h"
 
-struct sp_exec_fragment_shader {
+struct sp_exec_fragment_shader
+{
    struct sp_fragment_shader base;
+   struct tgsi_token *machine_tokens;
 };
 
 
+/** cast wrapper */
+static INLINE struct sp_exec_fragment_shader *
+sp_exec_fragment_shader(struct sp_fragment_shader *base)
+{
+   return (struct sp_exec_fragment_shader *) base;
+}
+
 
 /**
  * Compute quad X,Y,Z,W for the four fragments in a quad.
@@ -86,10 +95,20 @@ exec_prepare( const struct sp_fragment_shader *base,
 	      struct tgsi_exec_machine *machine,
 	      struct tgsi_sampler *samplers )
 {
-   tgsi_exec_machine_bind_shader( machine,
-				  base->shader.tokens,
-				  PIPE_MAX_SAMPLERS,
-				  samplers );
+   struct sp_exec_fragment_shader *spefs =
+      sp_exec_fragment_shader(base);
+
+   /*
+    * Bind tokens/shader to the interpreter's machine state.
+    * Avoid redundant binding.
+    */
+   if (spefs->machine_tokens != base->shader.tokens) {
+      tgsi_exec_machine_bind_shader( machine,
+                                     base->shader.tokens,
+                                     PIPE_MAX_SAMPLERS,
+                                     samplers );
+      spefs->machine_tokens = base->shader.tokens;
+   }
 }
 
 
-- 
cgit v1.2.3


From f631093ce76ad14dee63293761d7da7b7b42fc6d Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 17 Sep 2008 08:17:02 -0600
Subject: cell: example of doing fs/fm sequence with fnms in blending

---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 99407b8acee..2c80dd712e6 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -623,6 +623,7 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+#if 0
       /* one = {1.0, 1.0, 1.0, 1.0} */
       if (!one_reg_set) {
          one_reg = spe_allocate_available_register(f);
@@ -635,6 +636,15 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
       spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
       spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
+#else
+      /* Compute:  term2x = fbx * (1.0 - fragA)
+       * Which is:  term2x = fbx - fbx * fragA
+       * Use fnms t,a,b,c which computes t=c-a*b
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
+#endif
       break;
       /* XXX more cases */
       // GL_ONE_MINUS_SRC_COLOR
@@ -663,6 +673,7 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+#if 0
       /* one = {1.0, 1.0, 1.0, 1.0} */
       if (!one_reg_set) {
          one_reg = spe_allocate_available_register(f);
@@ -673,6 +684,13 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, tmp_reg, one_reg, fragA_reg);
       /* termA = fbA * tmp */
       spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
+#else
+      /* Compute:  term2A = fbA * (1.0 - fragA)
+       * Which is:  term2A = fbA - fbA * fragA
+       * Use fnms t,a,b,c which computes t=c-a*b
+       */
+      spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
+#endif
       break;
       /* XXX more cases */
       // GL_ONE_MINUS_SRC_COLOR
-- 
cgit v1.2.3


From f8bba34d4e12ef4c620cac881a4b697a1e668377 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 18 Sep 2008 01:29:41 -0600
Subject: CELL: finish fragment ops blending (except for unusual D3D modes)

- Added new "macro" functions spe_float_min() and spe_float_max()
  to rtasm_ppc_spe.{ch}.  These emit instructions that cause
  the minimum or maximum of each element in a vector of floats
  to be saved in the destination register.

- Major changes to cell_gen_fragment.c to implement all the blending
  modes (except for the mysterious D3D-based PIPE_BLENDFACTOR_SRC1_COLOR,
  PIPE_BLENDFACTOR_SRC1_ALPHA, PIPE_BLENDFACTOR_INV_SRC1_COLOR, and
  PIPE_BLENDFACTOR_INV_SRC1_ALPHA).

- Some revamping of code in cell_gen_fragment.c: use the new spe_float_min()
  and spe_float_max() functions (instead of expanding these calculations
  inline via macros); create and use an inline utility function for handling
  "optional" register allocation (for the {1,1,1,1} vector, and the
  blend color vectors) instead of expanding with macros; use the Floating
  Negative Multiply and Subtract (fnms) instruction to simplify and optimize many
  blending calculations.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  41 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |   8 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 546 ++++++++++++++---------
 3 files changed, 377 insertions(+), 218 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 870ae802c52..12e0826fb9b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -384,7 +384,7 @@ void spe_release_func(struct spe_function *p)
 
 
 /**
- * Alloate a SPE register.
+ * Allocate a SPE register.
  * \return register index or -1 if none left.
  */
 int spe_allocate_available_register(struct spe_function *p)
@@ -646,5 +646,44 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
    }
 }
 
+/* For each 32-bit float element of rA and rB, choose the smaller of the
+ * two, compositing them into the rT register.
+ * 
+ * The Float Compare Greater Than (fcgt) instruction will put 1s into
+ * compare_reg where rA > rB, and 0s where rA <= rB.
+ *
+ * Then the Select Bits (selb) instruction will take bits from rA where
+ * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
+ * where rA <= rB and from rB where rA > rB, which is exactly the
+ * "min" operation.
+ *
+ * The compare_reg could in many cases be the same as rT, unless
+ * rT == rA || rT == rB.  But since this is common in constructions
+ * like "x = min(x, a)", we always allocate a new register to be safe.
+ */
+void 
+spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rA, rB, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
+/* For each 32-bit float element of rA and rB, choose the greater of the
+ * two, compositing them into the rT register.
+ * 
+ * The logic is similar to that of spe_float_min() above; the only
+ * difference is that the registers on spe_selb() have been reversed,
+ * so that the larger of the two is selected instead of the smaller.
+ */
+void 
+spe_float_max(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rB, rA, compare_reg);
+   spe_release_register(p, compare_reg);
+}
 
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 25790452325..4ef05ea27d1 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -322,6 +322,14 @@ spe_zero(struct spe_function *p, unsigned rT);
 extern void
 spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
 
+/** rT = float min(rA, rB) */
+extern void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
+/** rT = float max(rA, rB) */
+extern void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
 
 /* Floating-point instructions
  */
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 2c80dd712e6..9d25e820ad9 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -229,35 +229,26 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
    spe_release_register(f, amask_reg);
 }
 
-/* This is a convenient and oft-used sequence.  It chooses
- * the smaller of each element of reg1 and reg2, and combines them
- * into the result register, as follows:
- * 
- * The Float Compare Greater Than (fcgt) instruction will put
- * 1s into compare_reg where reg1 > reg2, and 0s where reg1 <= reg2.
- *
- * Then the Select Bits (selb) instruction will take bits from
- * reg1 where compare_reg is 0, and from reg2 where compare_reg is
- * 1.  Ergo, result_reg will have the bits from reg1 where reg1 <= reg2,
- * and the bits from reg2 where reg1 > reg2, which is exactly the
- * MIN operation.
+/* This pair of functions is used inline to allocate and deallocate
+ * optional constant registers.  Once a constant is discovered to be 
+ * needed, we will likely need it again, so we don't want to deallocate
+ * it and have to allocate and load it again unnecessarily.
  */
-#define FLOAT_VECTOR_MIN(f, result_reg, reg1, reg2) {\
-   int compare_reg = spe_allocate_available_register(f); \
-   spe_fcgt(f, compare_reg, reg1, reg2); \
-   spe_selb(f, result_reg, reg1, reg2, compare_reg); \
-   spe_release_register(f, compare_reg); \
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   *r = spe_allocate_available_register(f);
+   spe_load_float(f, *r, value);
+   *is_already_set = true;
 }
 
-/* The FLOAT_VECTOR_MAX sequence is similar to the FLOAT_VECTOR_MIN 
- * sequence above, except that the registers specified when selecting
- * bits are reversed.
- */
-#define FLOAT_VECTOR_MAX(f, result_reg, reg1, reg2) {\
-   int compare_reg = spe_allocate_available_register(f); \
-   spe_fcgt(f, compare_reg, reg1, reg2); \
-   spe_selb(f, result_reg, reg2, reg1, compare_reg); \
-   spe_release_register(f, compare_reg); \
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    if (!*is_already_set) return;
+    spe_release_register(f, r);
+    *is_already_set = false;
 }
 
 /**
@@ -294,51 +285,15 @@ gen_blend(const struct pipe_blend_state *blend,
 
    int tmp_reg = spe_allocate_available_register(f);
 
-   /* These values might or might not eventually get put into
-    * registers.  We avoid allocating them and setting them until
-    * they're actually needed; then we avoid setting them more than
-    * once, and release them at the end of code generation.
+   /* Optional constant registers we might or might not end up using;
+    * if we do use them, make sure we only allocate them once by
+    * keeping a flag on each one.
     */
-   boolean one_reg_set = false; 
-   int one_reg;
-#define SET_ONE_REG_IF_UNSET(f) if (!one_reg_set) {\
-   one_reg = spe_allocate_available_register(f); \
-   spe_load_float(f, one_reg, 1.0f); \
-   one_reg_set = true; \
-}
-#define RELEASE_ONE_REG_IF_USED(f) if (one_reg_set) {\
-   spe_release_register(f, one_reg); \
-}
-  
-   boolean const_color_set = false;
-   int constR_reg, constG_reg, constB_reg;
-#define SET_CONST_COLOR_IF_UNSET(f, blend_color) if (!const_color_set) {\
-   constR_reg = spe_allocate_available_register(f); \
-   constG_reg = spe_allocate_available_register(f); \
-   constG_reg = spe_allocate_available_register(f); \
-   spe_load_float(f, constR_reg, blend_color->color[0]); \
-   spe_load_float(f, constG_reg, blend_color->color[1]); \
-   spe_load_float(f, constB_reg, blend_color->color[2]); \
-   const_color_set = true;\
-}
-#define RELEASE_CONST_COLOR_IF_USED(f) if (const_color_set) {\
-   spe_release_register(f, constR_reg); \
-   spe_release_register(f, constG_reg); \
-   spe_release_register(f, constB_reg); \
-}
-
-   boolean const_alpha_set = false;
-   int constA_reg;
-#define SET_CONST_ALPHA_IF_UNSET(f, blend_color) if (!const_alpha_set) {\
-   constA_reg = spe_allocate_available_register(f); \
-   spe_load_float(f, constA_reg, blend_color->color[3]); \
-   const_alpha_set = true; \
-}
-#define RELEASE_CONST_ALPHA_IF_USED(f) if (const_alpha_set) {\
-   spe_release_register(f, constA_reg); \
-}
-
-   /* Real code starts here */
+   boolean one_reg_set = false;
+   unsigned int one_reg;
+   boolean constR_reg_set = false, constG_reg_set = false, 
+      constB_reg_set = false, constA_reg_set = false;
+   unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
 
    ASSERT(blend->blend_enable);
 
@@ -419,10 +374,11 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_release_register(f, mask_reg);
    }
 
-
    /*
     * Compute Src RGB terms.  We're actually looking for the value
-    * of (the appropriate RGB factors) * (the incoming source RGB color).
+    * of (the appropriate RGB factors) * (the incoming source RGB color),
+    * because in some cases (like PIPE_BLENDFACTOR_ONE and 
+    * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
     */
    switch (blend->rgb_src_factor) {
    case PIPE_BLENDFACTOR_ONE:
@@ -450,18 +406,13 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - R */
-      spe_fs(f, tmp_reg, one_reg, fragR_reg);
-      /* term = R * tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      /* repeat for G and B */
-      spe_fs(f, tmp_reg, one_reg, fragG_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, fragB_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) 
+       * or in other words term = (R-R*R, G-G*G, B-B*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_DST_COLOR:
       /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
@@ -470,30 +421,22 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
       break;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
-      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - Rfb */
-      spe_fs(f, tmp_reg, one_reg, fbR_reg);
-      /* term = R * tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      /* repeat for G and B */
-      spe_fs(f, tmp_reg, one_reg, fbG_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, fbB_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
+       * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - A */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* term = R * tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      /* repeat for G and B with the same (1-A) factor */
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
+       * or term = (R-R*A,G-G*A,B-B*A)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_DST_ALPHA:
       /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
@@ -502,19 +445,19 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - A */
-      spe_fs(f, tmp_reg, one_reg, fbA_reg);
-      /* term = R * tmp, G*tmp, and B*tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) 
+       * or term = (R-R*Afb,G-G*Afb,B-B*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_CONST_COLOR:
-      /* We'll need the optional blend color registers */
-      SET_CONST_COLOR_IF_UNSET(f,blend_color)
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
       /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
       spe_fm(f, term1R_reg, fragR_reg, constR_reg);
       spe_fm(f, term1G_reg, fragG_reg, constG_reg);
@@ -522,55 +465,61 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_CONST_ALPHA:
       /* we'll need the optional constant alpha register */
-      SET_CONST_ALPHA_IF_UNSET(f, blend_color)
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
       /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
       spe_fm(f, term1R_reg, fragR_reg, constA_reg);
       spe_fm(f, term1G_reg, fragG_reg, constA_reg);
       spe_fm(f, term1B_reg, fragB_reg, constA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      /* We need both the optional {1,1,1,1} register, and the optional
-       * constant color registers
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) 
+       * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
        */
-      SET_ONE_REG_IF_UNSET(f)
-      SET_CONST_COLOR_IF_UNSET(f, blend_color)
-      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) */
-      spe_fs(f, tmp_reg, one_reg, constR_reg);
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, constG_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, constB_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      /* We need the optional {1,1,1,1} register and the optional 
-       * constant alpha register
+      /* XXX: constA_reg is used below, but only constR/G/B get set up here */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+       * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
        */
-      SET_ONE_REG_IF_UNSET(f)
-      SET_CONST_ALPHA_IF_UNSET(f, blend_color)
-      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac)) */
-      spe_fs(f, tmp_reg, one_reg, constA_reg);
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
       /* We'll need the optional {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
+      setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
       /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so 
        * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+       * as long as a is positive), but then we'd have to do three
+       * spe_float_min() functions instead of one, so this is simpler.
        */
       /* tmp = 1 - Afb */
       spe_fs(f, tmp_reg, one_reg, fbA_reg);
       /* tmp = min(A,tmp) */
-      FLOAT_VECTOR_MIN(f, tmp_reg, fragA_reg, tmp_reg)
+      spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
       /* term = R*tmp */
       spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
       spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
       spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
       break;
 
-      /* non-OpenGL cases? */
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
    case PIPE_BLENDFACTOR_SRC1_COLOR:
    case PIPE_BLENDFACTOR_SRC1_ALPHA:
    case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
@@ -581,132 +530,293 @@ gen_blend(const struct pipe_blend_state *blend,
    }
 
    /*
-    * Compute Src Alpha term
+    * Compute Src Alpha term.  Like the above, we're looking for
+    * the full term A*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term1A_reg, 0.0f);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
    case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = A */
       spe_move(f, term1A_reg, fragA_reg);
       break;
+
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = A*A */
       spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
       spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
       break;
-      /* XXX more cases */
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = A*(1-A) = A-A*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = A*Afb */
+      spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = A*Ac */
+      spe_fm(f, term1A_reg, fragA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Dest RGB terms
+    * Compute Dest RGB term.  Like the above, we're looking for
+    * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->rgb_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
       spe_move(f, term2R_reg, fbR_reg);
       spe_move(f, term2G_reg, fbG_reg);
       spe_move(f, term2B_reg, fbB_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term2R_reg);
-      spe_zero(f, term2G_reg);
-      spe_zero(f, term2B_reg);
+      /* factors = (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term2R_reg, 0.0f);
+      spe_load_float(f, term2G_reg, 0.0f);
+      spe_load_float(f, term2B_reg, 0.0f);
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
       spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
       spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
       spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
       break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) 
+       * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
+      break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
       spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
       spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
       spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-#if 0
-      /* one = {1.0, 1.0, 1.0, 1.0} */
-      if (!one_reg_set) {
-         one_reg = spe_allocate_available_register(f);
-         spe_load_float(f, one_reg, 1.0f);
-         one_reg_set = true;
-      }
-      /* tmp = one - fragA */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* term = fb * tmp */
-      spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
-      spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
-      spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
-#else
-      /* Compute:  term2x = fbx * (1.0 - fragA)
-       * Which is:  term2x = fbx - fbx * fragA
-       * Use fnms t,a,b,c which computes t=c-a*b
-       */
+      /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
+      /* fnms(a,b,c,d) computes a = d - b*c */
       spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
       spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
       spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
-#endif
       break;
-      /* XXX more cases */
-      // GL_ONE_MINUS_SRC_COLOR
-      // GL_DST_COLOR
-      // GL_ONE_MINUS_DST_COLOR
-      // GL_DST_ALPHA
-      // GL_CONSTANT_COLOR
-      // GL_ONE_MINUS_CONSTANT_COLOR
-      // GL_CONSTANT_ALPHA
-      // GL_ONE_MINUS_CONSTANT_ALPHA
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
+       * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) 
+       * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+      spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+      spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) 
+       * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* XXX: constA_reg is used below, but only constR/G/B get set up here */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+       * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Dest Alpha term
+    * Compute Dest Alpha term.  Like the above, we're looking for
+    * the full term Afb*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->alpha_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = Afb */
       spe_move(f, term2A_reg, fbA_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term2A_reg);
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term2A_reg, 0.0f);
       break;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = Afb*A */
       spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
       break;
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-#if 0
-      /* one = {1.0, 1.0, 1.0, 1.0} */
-      if (!one_reg_set) {
-         one_reg = spe_allocate_available_register(f);
-         spe_load_float(f, one_reg, 1.0f);
-         one_reg_set = true;
-      }
-      /* tmp = one - fragA */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* termA = fbA * tmp */
-      spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
-#else
-      /* Compute:  term2A = fbA * (1.0 - fragA)
-       * Which is:  term2A = fbA - fbA * fragA
-       * Use fnms t,a,b,c which computes t=c-a*b
-       */
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
       spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
-#endif
       break;
-      /* XXX more cases */
-      // GL_ONE_MINUS_SRC_COLOR
-      // GL_DST_COLOR
-      // GL_ONE_MINUS_DST_COLOR
-      // GL_DST_ALPHA
-      // GL_CONSTANT_COLOR
-      // GL_ONE_MINUS_CONSTANT_COLOR
-      // GL_CONSTANT_ALPHA
-      // GL_ONE_MINUS_CONSTANT_ALPHA
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = Afb*Afb */
+      spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = Afb*Ac */
+      spe_fm(f, term2A_reg, fbA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
    default:
       ASSERT(0);
    }
 
    /*
-    * Combine Src/Dest RGB terms
+    * Combine Src/Dest RGB terms as per the blend equation.
     */
    switch (blend->rgb_func) {
    case PIPE_BLEND_ADD:
@@ -725,14 +835,14 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
       break;
    case PIPE_BLEND_MIN:
-      FLOAT_VECTOR_MIN(f, fragR_reg, term1R_reg, term2R_reg)
-      FLOAT_VECTOR_MIN(f, fragG_reg, term1G_reg, term2G_reg)
-      FLOAT_VECTOR_MIN(f, fragB_reg, term1B_reg, term2B_reg)
+      spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
       break;
    case PIPE_BLEND_MAX:
-      FLOAT_VECTOR_MAX(f, fragR_reg, term1R_reg, term2R_reg)
-      FLOAT_VECTOR_MAX(f, fragG_reg, term1G_reg, term2G_reg)
-      FLOAT_VECTOR_MAX(f, fragB_reg, term1B_reg, term2B_reg)
+      spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
       break;
    default:
       ASSERT(0);
@@ -752,10 +862,10 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
       break;
    case PIPE_BLEND_MIN:
-      FLOAT_VECTOR_MIN(f, fragA_reg, term1A_reg, term2A_reg)
+      spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
       break;
    case PIPE_BLEND_MAX:
-      FLOAT_VECTOR_MAX(f, fragA_reg, term1A_reg, term2A_reg)
+      spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
       break;
    default:
       ASSERT(0);
@@ -779,9 +889,11 @@ gen_blend(const struct pipe_blend_state *blend,
    spe_release_register(f, tmp_reg);
 
    /* Free any optional registers that actually got used */
-   RELEASE_ONE_REG_IF_USED(f)
-   RELEASE_CONST_COLOR_IF_USED(f)
-   RELEASE_CONST_ALPHA_IF_USED(f)
+   release_const_register(f, &one_reg_set, one_reg);
+   release_const_register(f, &constR_reg_set, constR_reg);
+   release_const_register(f, &constG_reg_set, constG_reg);
+   release_const_register(f, &constB_reg_set, constB_reg);
+   release_const_register(f, &constA_reg_set, constA_reg);
 }
 
 
-- 
cgit v1.2.3


From c868a1c32d70295f425333f9e8a35235b129704b Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Thu, 18 Sep 2008 10:36:09 -0600
Subject: cell: Added SGE and SLE instructions

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 62 ++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 77386b30250..92681408e9f 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -696,6 +696,68 @@ emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit set-if_greater-than-or-equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SGE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 >= s2) */
+         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & ~one_reg */
+         spe_andc(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit set-if_less-than-or-equal.  See emit_SGT for comments.
+ */
+static boolean
+emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "SLE:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 <= s2) */
+         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & ~one_reg */
+         spe_andc(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
 /**
  * Emit set-if_equal.  See emit_SGT for comments.
  */
-- 
cgit v1.2.3


From 3d2449247afce18e6a0604b794778d1373c879be Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Thu, 18 Sep 2008 10:37:45 -0600
Subject: cell:  Added SGE and SLE instructions to dispatch function

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 92681408e9f..2607b410aa3 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1020,6 +1020,10 @@ emit_instruction(struct codegen *gen,
       return emit_SGT(gen, inst);
    case TGSI_OPCODE_SLT:
       return emit_SLT(gen, inst);
+   case TGSI_OPCODE_SGE:
+      return emit_SGE(gen, inst);
+   case TGSI_OPCODE_SLE:
+      return emit_SLE(gen, inst);
    case TGSI_OPCODE_SEQ:
       return emit_SEQ(gen, inst);
    case TGSI_OPCODE_SNE:
-- 
cgit v1.2.3


From 15fceac0404f450f026f10bd2f4bdd0c939b5d00 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Thu, 18 Sep 2008 11:11:49 -0600
Subject: cell: Fix bug with complement logic for SGE and SLE

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 42 +++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 4 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 2607b410aa3..4f01897199c 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -716,8 +716,8 @@ emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst)
          spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
 
          /* convert d from 0x0/0xffffffff to 0.0/1.0 */
-         /* d = d & ~one_reg */
-         spe_andc(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+         /* d = ~d & one_reg */
+         spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -747,8 +747,8 @@ emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst)
          spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
 
          /* convert d from 0x0/0xffffffff to 0.0/1.0 */
-         /* d = d & ~one_reg */
-         spe_andc(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+         /* d = ~d & one_reg */
+         spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -821,6 +821,38 @@ emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit compare.  See emit_SGT for comments.
+ */
+static boolean
+emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "CMP:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* d = (s1 != s2) */
+         spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
+         spe_nor(gen->f, d_reg, d_reg, d_reg);
+
+         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
+         /* d = d & one_reg */
+         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
@@ -1028,6 +1060,8 @@ emit_instruction(struct codegen *gen,
       return emit_SEQ(gen, inst);
    case TGSI_OPCODE_SNE:
       return emit_SNE(gen, inst);
+   case TGSI_OPCODE_CMP:
+      return emit_CMP(gen, inst);
    case TGSI_OPCODE_MAX:
       return emit_MAX(gen, inst);
    case TGSI_OPCODE_MIN:
-- 
cgit v1.2.3


From 698bffb8844f6f45e09ed0c9fea39298ac6423d2 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Thu, 18 Sep 2008 14:49:00 -0600
Subject: cell: Added CMP instruction

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 4f01897199c..6f2b89c695c 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -835,15 +835,15 @@ emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
          int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int zero_reg = get_itemp(gen);
+   
+         spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
 
-         /* d = (s1 != s2) */
-         spe_fceq(gen->f, d_reg, s1_reg, s2_reg);
-         spe_nor(gen->f, d_reg, d_reg, d_reg);
-
-         /* convert d from 0x0/0xffffffff to 0.0/1.0 */
-         /* d = d & one_reg */
-         spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen));
+         /* d = (s1 < 0) ? s2 : s3 */
+         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
-- 
cgit v1.2.3


From 4485ac87c2cf69bef443ac36cccaa70054c6a7bb Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 18 Sep 2008 16:36:37 -0600
Subject: CELL: mark several transient files as .gitignore

progs/demos: added new demo "fbo_firecube"

progs/glsl: added new demo "pointcoord"

src/gallium/drivers/cell/spu: added the g3d_spu executable, a Cell SPU
executable file, which seems to be occasionally built as part of the
cell driver

src/glu/sgi: added "exptmp", a byproduct of the "mklib" process that
sometimes gets deleted and sometimes not.
---
 progs/demos/.gitignore                  | 1 +
 progs/glsl/.gitignore                   | 1 +
 src/gallium/drivers/cell/spu/.gitignore | 1 +
 src/glu/sgi/.gitignore                  | 1 +
 4 files changed, 4 insertions(+)
 create mode 100644 src/gallium/drivers/cell/spu/.gitignore
 create mode 100644 src/glu/sgi/.gitignore

(limited to 'src/gallium/drivers/cell')

diff --git a/progs/demos/.gitignore b/progs/demos/.gitignore
index 3693fafd4ee..f033a0505d8 100644
--- a/progs/demos/.gitignore
+++ b/progs/demos/.gitignore
@@ -8,6 +8,7 @@ cubemap
 drawpix
 engine
 extfuncs.h
+fbo_firecube
 fire
 fogcoord
 fplight
diff --git a/progs/glsl/.gitignore b/progs/glsl/.gitignore
index 09340ff2adb..978e31c6cc9 100644
--- a/progs/glsl/.gitignore
+++ b/progs/glsl/.gitignore
@@ -7,6 +7,7 @@ extfuncs.h
 mandelbrot
 multitex
 noise
+pointcoord
 points
 readtex.c
 readtex.h
diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore
new file mode 100644
index 00000000000..2be9a2d3242
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/.gitignore
@@ -0,0 +1 @@
+g3d_spu
diff --git a/src/glu/sgi/.gitignore b/src/glu/sgi/.gitignore
new file mode 100644
index 00000000000..279ea7d4345
--- /dev/null
+++ b/src/glu/sgi/.gitignore
@@ -0,0 +1 @@
+exptmp
-- 
cgit v1.2.3


From a57fbe53dcb54694da9c9b4be1533c9d800079d2 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 19 Sep 2008 01:55:00 -0600
Subject: CELL: add codegen for logic op, color mask

- rtasm_ppc_spe.c, rtasm_ppc_spe.h: added a new macro function
  "spe_load_uint" for loading and splatting unsigned integers
  in a register; it will use "ila" for values 18 bits or less,
  "ilh" for word values that are symmetric across halfwords,
  "ilhu" for values that have zeroes in their bottom halfwords,
  or "ilhu" followed by "iohl" for general 32-bit values.

  Of the 15 color masks of interest, 4 are 18 bits or less,
  2 are symmetric across halfwords, 3 are zero in the bottom
  halfword, and 6 require two instructions to load.

- cell_gen_fragment.c: added full codegen for logic op and
  color mask.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  23 +++-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |   4 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 143 ++++++++++++++++++++++-
 3 files changed, 163 insertions(+), 7 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 12e0826fb9b..f60bfba3f51 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -592,11 +592,32 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
    }
 }
 
+void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+{
+   /* Splat ui into every word of rT.  A value that fits in the low
+    * 18 bits takes a single ila (no sign extension); identical
+    * halfwords take a single ilh; anything else takes ilhu for the
+    * high halfword, plus iohl only if the low halfword is non-zero.
+    */
+   if ((ui & 0x0003ffff) == ui) {
+      spe_ila(p, rT, ui);
+   }
+   else if ((ui >> 16) == (ui & 0xffff)) {
+      spe_ilh(p, rT, ui & 0xffff);
+   }
+   else {
+      spe_ilhu(p, rT, ui >> 16);
+      if (ui & 0xffff)
+         spe_iohl(p, rT, ui & 0xffff);
+   }
+}
+
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ila(p, rT, 66051);
+   /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
+   spe_ila(p, rT, 0x00010203);
    spe_shufb(p, rT, rA, rA, rT);
 }
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 4ef05ea27d1..09400b3fb2a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -302,6 +302,10 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
 extern void
 spe_load_int(struct spe_function *p, unsigned rT, int i);
 
+/** Load/splat immediate unsigned int into rT. */
+extern void
+spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 9d25e820ad9..899d8423b24 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -902,8 +902,69 @@ gen_logicop(const struct pipe_blend_state *blend,
             struct spe_function *f,
             int fragRGBA_reg, int fbRGBA_reg)
 {
-   /* XXX to-do */
-   /* operate on 32-bit packed pixels, not float colors */
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.
+    * */
+   ASSERT(blend->logicop_enable);
+
+   switch(blend->logicop_func) {
+      case PIPE_LOGICOP_CLEAR: /* 0 */
+         spe_zero(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOR: /* ~(s | d) */
+         spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
+         spe_complement(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_INVERT: /* ~d */
+         /* Note that (A nor A) == ~(A|A) == ~A */
+         spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_XOR: /* s ^ d */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NAND: /* ~(s & d) */
+         spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND: /* s & d */
+         spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         spe_complement(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOOP: /* d */
+         spe_move(f, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY: /* s */
+         break;
+      case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR: /* s | d */
+         spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_SET: /* 1 */
+         spe_load_int(f, fragRGBA_reg, 0xffffffff);
+         break;
+      default:
+         ASSERT(0);
+   }
 }
 
 
@@ -912,11 +973,81 @@ gen_colormask(uint colormask,
               struct spe_function *f,
               int fragRGBA_reg, int fbRGBA_reg)
 {
-   /* XXX to-do */
-   /* operate on 32-bit packed pixels, not float colors */
-}
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.
+    * */
+
+   /* The color mask operation can prevent any set of color
+    * components in the incoming fragment from being written to the frame 
+    * buffer; we do this by replacing the masked components of the 
+    * fragment with the frame buffer values.
+    *
+    * There are only 16 possibilities, with a unique mask for
+    * each of the possibilities.  (Technically, there are only 15
+    * possibilities, since we shouldn't be called for the one mask
+    * that does nothing, but the complete implementation is here
+    * anyway to avoid confusion.)
+    *
+    * We implement this via a constant static array which we'll index 
+    * into to get the correct mask.
+    * 
+    * We're dependent on the mask values being low-order bits,
+    * with particular values for each bit; so we start with a
+    * few assertions, which will fail if any of the values were
+    * to change.
+    */
+   ASSERT(PIPE_MASK_R == 0x1);
+   ASSERT(PIPE_MASK_G == 0x2);
+   ASSERT(PIPE_MASK_B == 0x4);
+   ASSERT(PIPE_MASK_A == 0x8);
 
+   /* Here's the list of all possible colormasks, indexed by the
+    * value of the combined mask specifier.
+    */
+   static const unsigned int colormasks[16] = {
+      0x00000000, /* 0: all colors masked */
+      0xff000000, /* 1: PIPE_MASK_R */
+      0x00ff0000, /* 2: PIPE_MASK_G */
+      0xffff0000, /* 3: PIPE_MASK_R | PIPE_MASK_G */
+      0x0000ff00, /* 4: PIPE_MASK_B */
+      0xff00ff00, /* 5: PIPE_MASK_R | PIPE_MASK_B */
+      0x00ffff00, /* 6: PIPE_MASK_G | PIPE_MASK_B */
+      0xffffff00, /* 7: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B */
+      0x000000ff, /* 8: PIPE_MASK_A */
+      0xff0000ff, /* 9: PIPE_MASK_R | PIPE_MASK_A */
+      0x00ff00ff, /* 10: PIPE_MASK_G | PIPE_MASK_A */
+      0xffff00ff, /* 11: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_A */
+      0x0000ffff, /* 12: PIPE_MASK_B | PIPE_MASK_A */
+      0xff00ffff, /* 13: PIPE_MASK_R | PIPE_MASK_B | PIPE_MASK_A */
+      0x00ffffff, /* 14: PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
+      0xffffffff  /* 15: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
+   };
+
+   /* Get a temporary register to hold the mask */
+   int colormask_reg = spe_allocate_available_register(f);
+
+   /* Look up the desired mask directly and load it into the mask register.
+    * This will load the same mask into each of the four words in the
+    * mask register.
+    */
+   spe_load_uint(f, colormask_reg, colormasks[colormask]);
+
+   /* Use the mask register to select between the fragment color
+    * values and the frame buffer color values.  Wherever the
+    * mask has a 0 bit, the current frame buffer color should override
+    * the fragment color.  Wherever the mask has a 1 bit, the 
+    * fragment color should persevere.  The Select Bits (selb rt, rA, rB, rM)
+    * instruction will select bits from its first operand rA wherever the
+    * mask bits rM are 0, and from its second operand rB wherever the
+    * mask bits rM are 1.  That means that the frame buffer color is the
+    * first operand, and the fragment color the second.
+    */
+    spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
 
+    /* Release the temporary register and we're done */
+    spe_release_register(f, colormask_reg);
+}
 
 /**
  * Generate code to pack a quad of float colors into a four 32-bit integers.
@@ -1223,7 +1354,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
-      if (blend->colormask != 0xf) {
+      if (blend->colormask != PIPE_MASK_RGBA) {
          gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
       }
 
-- 
cgit v1.2.3


From 0838b702750d85b0284a97be211fa379e9f8d8d8 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 09:36:29 -0600
Subject: cell: change spe_complement() to take a src and dst reg, like other
 instructions

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      | 14 ++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  4 ++--
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       |  4 ++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c |  4 ++--
 4 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index f60bfba3f51..85280f680a4 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -623,9 +623,9 @@ spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 
 
 void
-spe_complement(struct spe_function *p, unsigned rT)
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_nor(p, rT, rT, rT);
+   spe_nor(p, rT, rA, rA);
 }
 
 
@@ -667,7 +667,8 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
    }
 }
 
-/* For each 32-bit float element of rA and rB, choose the smaller of the
+/**
+ * For each 32-bit float element of rA and rB, choose the smaller of the
  * two, compositing them into the rT register.
  * 
  * The Float Compare Greater Than (fcgt) instruction will put 1s into
@@ -683,7 +684,7 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
  * like "x = min(x, a)", we always allocate a new register to be safe.
  */
 void 
-spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
 {
    unsigned int compare_reg = spe_allocate_available_register(p);
    spe_fcgt(p, compare_reg, rA, rB);
@@ -691,7 +692,8 @@ spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned
    spe_release_register(p, compare_reg);
 }
 
-/* For each 32-bit float element of rA and rB, choose the greater of the
+/**
+ * For each 32-bit float element of rA and rB, choose the greater of the
  * two, compositing them into the rT register.
  * 
  * The logic is similar to that of spe_float_min() above; the only
@@ -699,7 +701,7 @@ spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned
  * so that the larger of the two is selected instead of the smaller.
  */
 void 
-spe_float_max(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
 {
    unsigned int compare_reg = spe_allocate_available_register(p);
    spe_fcgt(p, compare_reg, rA, rB);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 09400b3fb2a..8a0d70fdac5 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -310,9 +310,9 @@ spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
 
-/** Complement/invert all bits in rT. */
+/** rT = complement_all_bits(rA). */
 extern void
-spe_complement(struct spe_function *p, unsigned rT);
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
 
 /** rT = rA. */
 extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 6f2b89c695c..d835aae2552 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -924,7 +924,7 @@ emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
    /* tmp = (s1_reg == 0) */
    spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
    /* tmp = !tmp */
-   spe_complement(gen->f, tmp_reg);
+   spe_complement(gen->f, tmp_reg, tmp_reg);
    /* exec_mask = exec_mask & tmp */
    spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
 
@@ -944,7 +944,7 @@ emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
    spe_comment(gen->f, -4, "ELSE:");
 
    /* exec_mask = !exec_mask */
-   spe_complement(gen->f, exec_reg);
+   spe_complement(gen->f, exec_reg, exec_reg);
 
    return true;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 899d8423b24..06a9fa102f9 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -920,7 +920,7 @@ gen_logicop(const struct pipe_blend_state *blend,
          spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
          break;
       case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
-         spe_complement(f, fragRGBA_reg);
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
          break;
       case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
          /* andc R, A, B computes R = A & ~B */
@@ -941,7 +941,7 @@ gen_logicop(const struct pipe_blend_state *blend,
          break;
       case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
          spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
-         spe_complement(f, fragRGBA_reg);
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
          break;
       case PIPE_LOGICOP_NOOP: /* d */
          spe_move(f, fragRGBA_reg, fbRGBA_reg);
-- 
cgit v1.2.3


From de0a6dc04a5b508472cc0cce4481ac3bb95fda3b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 10:42:21 -0600
Subject: cell: the test for CELL_DEBUG_FRAGMENT_OP_FALLBACK in
 cmd_state_fragment_ops() was inverted

---
 src/gallium/drivers/cell/spu/spu_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index da2cb089722..d99dd12d2a0 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -267,7 +267,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
     * final code we'll always use codegen and won't even provide the
     * raw state records that the fallback code requires.
     */
-   if (spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) {
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
       spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
    }
    /* otherwise, the default fallback code remains in place */
-- 
cgit v1.2.3


From 3c6bb15b7ae1c08b1ddde9e0bfb4796fd68a8a0b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 10:43:04 -0600
Subject: cell: fix a comment

---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 06a9fa102f9..c09d727621c 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1050,7 +1050,7 @@ gen_colormask(uint colormask,
 }
 
 /**
- * Generate code to pack a quad of float colors into a four 32-bit integers.
+ * Generate code to pack a quad of float colors into four 32-bit integers.
  *
  * \param f             SPE function to append instruction onto.
  * \param color_format  the dest color packing format
-- 
cgit v1.2.3


From 0500ae574f4192dd1972baa23e9c62f992042ab9 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 10:50:46 -0600
Subject: cell: issue warning to stderr when using fallback fragment ops

---
 src/gallium/drivers/cell/spu/spu_main.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index d99dd12d2a0..6b624175584 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -247,6 +247,8 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 static void
 cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
+   static int warned = 0;
+
    DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
    memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
@@ -270,7 +272,13 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
    if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
       spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
    }
-   /* otherwise, the default fallback code remains in place */
+   else {
+      /* otherwise, the default fallback code remains in place */
+      if (!warned) {
+         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+         warned = 1;
+      }
+   }
 
    spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
    spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
-- 
cgit v1.2.3


From 7abf2358d739b126336c4837156816ce03f2b9d6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 12:52:41 -0600
Subject: cell: flesh out support for other Z/stencil format

Also: improve float/int Z conversion.
Use clgt instead of cgt in depth test since we're comparing unsigned values.
---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 100 +++++++++++++++--------
 1 file changed, 64 insertions(+), 36 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index c09d727621c..1837b4c79bd 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -60,6 +60,9 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
                struct spe_function *f,
                int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
 {
+   /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
+    * quantities.  This only makes a difference for 32-bit Z values though.
+    */
    ASSERT(dsa->depth.enabled);
 
    switch (dsa->depth.func) {
@@ -79,28 +82,28 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
 
    case PIPE_FUNC_GREATER:
       /* zmask = (ifragZ > ref) */
-      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
       /* mask = (mask & zmask) */
       spe_and(f, mask_reg, mask_reg, zmask_reg);
       break;
 
    case PIPE_FUNC_LESS:
       /* zmask = (ref > ifragZ) */
-      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
       /* mask = (mask & zmask) */
       spe_and(f, mask_reg, mask_reg, zmask_reg);
       break;
 
    case PIPE_FUNC_LEQUAL:
       /* zmask = (ifragZ > ref) */
-      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
       /* mask = (mask & ~zmask) */
       spe_andc(f, mask_reg, mask_reg, zmask_reg);
       break;
 
    case PIPE_FUNC_GEQUAL:
       /* zmask = (ref > ifragZ) */
-      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
       /* mask = (mask & ~zmask) */
       spe_andc(f, mask_reg, mask_reg, zmask_reg);
       break;
@@ -1066,13 +1069,16 @@ gen_pack_colors(struct spe_function *f,
                 int r_reg, int g_reg, int b_reg, int a_reg,
                 int rgba_reg)
 {
+   int rg_reg = spe_allocate_available_register(f);
+   int ba_reg = spe_allocate_available_register(f);
+
    /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
    spe_cfltu(f, r_reg, r_reg, 32);
    spe_cfltu(f, g_reg, g_reg, 32);
    spe_cfltu(f, b_reg, b_reg, 32);
    spe_cfltu(f, a_reg, a_reg, 32);
 
-   /* Shift the most significant bytes to least the significant positions.
+   /* Shift the most significant bytes to the least significant positions.
     * I.e.: reg = reg >> 24
     */
    spe_rotmi(f, r_reg, r_reg, -24);
@@ -1104,9 +1110,12 @@ gen_pack_colors(struct spe_function *f,
     * OR-ing all those together gives us four packed colors:
     *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
     */
-   spe_or(f, rgba_reg, r_reg, g_reg);
-   spe_or(f, rgba_reg, rgba_reg, b_reg);
-   spe_or(f, rgba_reg, rgba_reg, a_reg);
+   spe_or(f, rg_reg, r_reg, g_reg);
+   spe_or(f, ba_reg, a_reg, b_reg);
+   spe_or(f, rgba_reg, rg_reg, ba_reg);
+
+   spe_release_register(f, rg_reg);
+   spe_release_register(f, ba_reg);
 }
 
 
@@ -1227,33 +1236,49 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
             spe_release_register(f, mask_reg);
             /* OK, fbZ_reg has four 24-bit Z values now */
          }
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            spe_rotmi(f, fbZ_reg, fbZS_reg, -8);  /* fbZ = fbZS >> 8 */
+            /* OK, fbZ_reg has four 24-bit Z values now */
+         }
+         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+            spe_move(f, fbZ_reg, fbZS_reg);
+            /* OK, fbZ_reg has four 32-bit Z values now */
+         }
+         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+            spe_move(f, fbZ_reg, fbZS_reg);
+            /* OK, fbZ_reg has four 16-bit Z values now */
+         }
          else {
-            /* XXX handle other z/stencil formats */
-            ASSERT(0);
+            ASSERT(0);  /* invalid format */
          }
 
-         /* Convert fragZ values from float[4] to uint[4] */
+         /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM ||
              zs_format == PIPE_FORMAT_Z24S8_UNORM ||
              zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* 24-bit Z values */
-            int scale_reg = spe_allocate_available_register(f);
-
-            /* scale_reg[0,1,2,3] = float(2^24-1) */
-            spe_load_float(f, scale_reg, (float) 0xffffff);
-
-            /* XXX these two instructions might be combined */
-            spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 0);  /* fragZ = (int) fragZ */
-
-            spe_release_register(f, scale_reg);
+            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            /* fragZ = fragZ >> 8 */
+            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
          }
-         else {
-            /* XXX handle 16-bit Z format */
-            ASSERT(0);
+         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+         }
+         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            /* fragZ = fragZ >> 16 */
+            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
          }
       }
+      else {
+         /* no Z test, but set Z to zero so we don't OR-in garbage below */
+         spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
+      }
+
 
       if (dsa->stencil[0].enabled) {
          /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
@@ -1268,7 +1293,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
             ASSERT(0);
          }
       }
-
+      else {
+         /* no stencil test, but set to zero so we don't OR-in garbage below */
+         spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
+      }
 
       if (dsa->stencil[0].enabled) {
          /* XXX this may involve depth testing too */
@@ -1296,22 +1324,22 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
             spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
             spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
          }
-         else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-                  zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX to do */
-            ASSERT(0);
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+         }
+         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
          }
          else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            /* XXX to do */
-            ASSERT(0);
+            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
          }
          else if (zs_format == PIPE_FORMAT_S8_UNORM) {
-            /* XXX to do */
-            ASSERT(0);
+            ASSERT(0);   /* XXX to do */
          }
          else {
-            /* bad zs_format */
-            ASSERT(0);
+            ASSERT(0); /* bad zs_format */
          }
 
          /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
-- 
cgit v1.2.3


From e9c05c5b82fdae75a3dccad23203987c277572b0 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Fri, 19 Sep 2008 12:59:36 -0600
Subject: cell:  Fixed bugs with DP3 and DP4, they match softpipe results now.

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 40 +++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index d835aae2552..a84b565e5c3 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -76,7 +76,7 @@ struct codegen
 
    /** Per-instruction temps / intermediate temps */
    int num_itemps;
-   int itemps[4];
+   int itemps[10];
 
    /** Current IF/ELSE/ENDIF nesting level */
    int if_nesting;
@@ -586,9 +586,10 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
       }
    }
+
+   free_itemps(gen);
    return true;
 }
 
@@ -625,9 +626,10 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
       }
    }
+
+   free_itemps(gen);
    return true;
 }
 
@@ -853,6 +855,38 @@ emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit floor.  See emit_SGT for comments.
+ */
+static boolean
+emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "FLR:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int zero_reg = get_itemp(gen);
+   
+         spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
+         /* d = (s1 < 0) ? s2 : s3 */
+         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
-- 
cgit v1.2.3


From 1031638c2df825acc06a6180411caa4d9ebd5b31 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Fri, 19 Sep 2008 14:18:39 -0600
Subject: cell: Added FLR instruction.  Verified the following instructions
 match softpipe:  MOV, ADD, MUL, SGE, SUB, MAD, ABS, SLT, MIN, MAX, LRP, DP3,
 DP4, CMP, FLR

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index a84b565e5c3..d2376aa0c22 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -856,7 +856,10 @@ emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 /**
- * Emit floor.  See emit_SGT for comments.
+ * Emit floor.  
+ * If negative int subtract one
+ * Convert float to signed int
+ * Convert signed int to float
  */
 static boolean
 emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
@@ -868,16 +871,22 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         int zero_reg = get_itemp(gen);
-   
-         spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+         int tmp_reg = get_itemp(gen);
+
+         /* If negative, subtract 1.0 */
+         spe_xor(gen->f, tmp_reg, tmp_reg, tmp_reg);
+         spe_fcgt(gen->f, d_reg, tmp_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, tmp_reg, get_const_one_reg(gen), d_reg);
+         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, d_reg, d_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, d_reg, d_reg, 0);
+
 
-         /* d = (s1 < 0) ? s2 : s3 */
-         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
-         spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -1100,6 +1109,8 @@ emit_instruction(struct codegen *gen,
       return emit_MAX(gen, inst);
    case TGSI_OPCODE_MIN:
       return emit_MIN(gen, inst);
+   case TGSI_OPCODE_FLR:
+      return emit_FLR(gen, inst);
    case TGSI_OPCODE_END:
       return emit_END(gen);
 
-- 
cgit v1.2.3


From 33bef5866c81a7f358c0aa2e37e20443dafb9eb2 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Fri, 19 Sep 2008 15:10:25 -0600
Subject: cell: Added FRC instruction

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 41 ++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index d2376aa0c22..1bc803d5908 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -886,7 +886,45 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
          /* Convert int to float */
          spe_csflt(gen->f, d_reg, d_reg, 0);
 
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Emit frac.  
+ * Input - FLR(Input)
+ */
+static boolean
+emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "FLR:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
 
+         /* If negative, subtract 1.0 */
+         spe_xor(gen->f, tmp_reg, tmp_reg, tmp_reg);
+         spe_fcgt(gen->f, d_reg, tmp_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, tmp_reg, get_const_one_reg(gen), d_reg);
+         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, d_reg, d_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, d_reg, d_reg, 0);
+
+         /* d = s1 - FLR(s1) */
+         spe_fs(gen->f, d_reg, s1_reg, d_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -896,6 +934,7 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
@@ -1111,6 +1150,8 @@ emit_instruction(struct codegen *gen,
       return emit_MIN(gen, inst);
    case TGSI_OPCODE_FLR:
       return emit_FLR(gen, inst);
+   case TGSI_OPCODE_FRC:
+      return emit_FRC(gen, inst);
    case TGSI_OPCODE_END:
       return emit_END(gen);
 
-- 
cgit v1.2.3


From aca74a4d92ba6f99d756ab703a78efc3918b3840 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 17:55:10 -0600
Subject: cell: make sure the fragment ops and fragment shader code buffer is
 at a 32-byte boundary

To make sure even/odd instructions hit the right pipes.
---
 src/gallium/drivers/cell/spu/spu_main.c | 4 +++-
 src/gallium/drivers/cell/spu/spu_main.h | 8 ++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 6b624175584..b4d30228f7a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -705,6 +705,8 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code) % 32 == 0);
+   ASSERT(((unsigned long) &spu.fragment_program_code) % 32 == 0);
 
    one_time_init();
 
@@ -721,7 +723,7 @@ main(main_param_t speid, main_param_t argp)
 
 #if 0
    if (spu.init.id==0)
-      spu_test_misc();
+      spu_test_misc(spu.init.id);
 #endif
 
    main_loop();
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2c7b6258402..72e540fcff2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -143,13 +143,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
-   /** Current fragment ops machine code */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   /** Current fragment ops machine code, at 32-byte boundary */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN32_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_ops_func fragment_ops;
 
-   /** Current fragment program machine code */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
+   /** Current fragment program machine code, at 32-byte boundary */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN32_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
-- 
cgit v1.2.3


From 56c476395ffdff2cfbc0adb9b87e5b308ee3066a Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Mon, 22 Sep 2008 10:54:50 -0600
Subject: cell: Added DPH instruction and verified against softpipe.

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 41 ++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 1bc803d5908..f4e651c8ebf 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -633,6 +633,45 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit homogeneous dot product.  See emit_ADD for comments.
+ */
+static boolean
+emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+   spe_comment(gen->f, -4, "DPH:");
+
+   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]);
+   /* d = x * x */
+   spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* d = y * y + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* d = z * z + d */
+   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+
+   s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+   /* d = w + d */
+   spe_fa(gen->f, d_reg, s2_reg, d_reg);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
 /**
  * Emit set-if-greater-than.
  * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
@@ -1124,6 +1163,8 @@ emit_instruction(struct codegen *gen,
       return emit_DP3(gen, inst);
    case TGSI_OPCODE_DP4:
       return emit_DP4(gen, inst);
+   case TGSI_OPCODE_DPH:
+      return emit_DPH(gen, inst);
    case TGSI_OPCODE_RCP:
       return emit_RCP(gen, inst);
    case TGSI_OPCODE_RSQ:
-- 
cgit v1.2.3


From 6b3ec9ec2b96e33f975852ee9f4751c6fefe9869 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Mon, 22 Sep 2008 13:13:50 -0600
Subject: cell: Added TRUNC, SWZ (extended) and XPD instructions, verified
 against softpipe.  Optimized FLR and FRC.  Fixed writeback logic for DP3, DP4
 and DPH.

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 201 ++++++++++++++++++++++-------
 1 file changed, 156 insertions(+), 45 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index f4e651c8ebf..4b8189207d3 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -181,8 +181,8 @@ get_src_reg(struct codegen *gen,
    boolean reg_is_itemp = FALSE;
    uint sign_op;
 
-   assert(swizzle >= 0);
-   assert(swizzle <= 3);
+   assert(swizzle >= TGSI_SWIZZLE_X);
+   assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
 
    channel = swizzle;
 
@@ -192,12 +192,28 @@ get_src_reg(struct codegen *gen,
       break;
    case TGSI_FILE_INPUT:
       {
-         /* offset is measured in quadwords, not bytes */
-         int offset = src->SrcRegister.Index * 4 + channel;
-         reg = get_itemp(gen);
-         reg_is_itemp = TRUE;
-         /* Load:  reg = memory[(machine_reg) + offset] */
-         spe_lqd(gen->f, reg, gen->inputs_reg, offset);
+         if(channel == TGSI_EXTSWIZZLE_ONE)
+         {
+            /* Load const one float and early out */
+            reg = get_const_one_reg(gen);
+            return reg;
+         }
+         else if(channel == TGSI_EXTSWIZZLE_ZERO)
+         {
+            /* Load const zero float and early out */
+            reg = get_itemp(gen);
+            spe_xor(gen->f, reg, reg, reg);
+            return reg;
+         }
+         else
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = src->SrcRegister.Index * 4 + channel;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->inputs_reg, offset);
+         }
       }
       break;
    case TGSI_FILE_IMMEDIATE:
@@ -355,8 +371,6 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
-
-
 /**
  * Emit addition instructions.  Recall that a single TGSI_OPCODE_ADD
  * becomes (up to) four SPU "fa" instructions because we're doing SOA
@@ -569,23 +583,23 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
    int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
-   int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]);
-   /* d = x * x */
-   spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+   int tmp_reg = get_itemp(gen);
+   /* t = x0 * x1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
 
    s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
    s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
-   /* d = y * y + d */
-   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+   /* t = y0 * y1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
 
    s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
    s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
-   /* d = z * z + d */
-   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+   /* t = z0 * z1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
 
@@ -600,32 +614,32 @@ static boolean
 emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
-   spe_comment(gen->f, -4, "DP3:");
+   spe_comment(gen->f, -4, "DP4:");
 
    int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
    int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
-   int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]);
-   /* d = x * x */
-   spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+   int tmp_reg = get_itemp(gen);
+   /* t = x0 * x1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
 
    s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
    s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
-   /* d = y * y + d */
-   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+   /* t = y0 * y1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
 
    s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
    s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
-   /* d = z * z + d */
-   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+   /* t = z0 * z1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
 
    s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
    s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
-   /* d = w * w + d */
-   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+   /* t = w0 * w1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
 
@@ -644,27 +658,28 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
    int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
-   int d_reg = get_dst_reg(gen, CHAN_X, &inst->FullDstRegisters[0]);
-   /* d = x * x */
-   spe_fm(gen->f, d_reg, s1_reg, s2_reg);
+   int tmp_reg = get_itemp(gen);
+
+   /* t = x0 * x1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
 
    s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
    s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
-   /* d = y * y + d */
-   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+   /* t = y0 * y1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
 
    s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
    s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
-   /* d = z * z + d */
-   spe_fma(gen->f, d_reg, s1_reg, s2_reg, d_reg);
+   /* t = z0 * z1 + t */
+   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
 
    s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
-   /* d = w + d */
-   spe_fa(gen->f, d_reg, s2_reg, d_reg);
+   /* t = w1 + t */
+   spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg);
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
 
@@ -672,6 +687,62 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit cross product.  See emit_ADD for comments.
+ */
+static boolean
+emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   spe_comment(gen->f, -4, "XPD:");
+
+   int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   int tmp_reg = get_itemp(gen);
+
+   /* t = z0 * y1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* t = y0 * z1 - t */
+   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) {
+      store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]);
+   }
+
+   s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   /* t = x0 * z1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   /* t = z0 * x1 - t */
+   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) {
+      store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]);
+   }
+
+   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   /* t = y0 * x1 */
+   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+
+   s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   /* t = x0 * y1 - t */
+   spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+
+   if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) {
+      store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]);
+   }
+
+   free_itemps(gen);
+   return true;
+}
+
 /**
  * Emit set-if-greater-than.
  * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as
@@ -894,6 +965,37 @@ emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
    return true;
 }
 
+/**
+ * Emit trunc.  
+ * Convert float to signed int
+ * Convert signed int to float
+ */
+static boolean
+emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   int ch;
+
+   spe_comment(gen->f, -4, "TRUNC:");
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
+         /* Convert float to int */
+         spe_cflts(gen->f, d_reg, s1_reg, 0);
+
+         /* Convert int to float */
+         spe_csflt(gen->f, d_reg, d_reg, 0);
+
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
 /**
  * Emit floor.  
  * If negative int subtract one
@@ -907,6 +1009,9 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    spe_comment(gen->f, -4, "FLR:");
 
+   int zero_reg = get_itemp(gen);
+   spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+   
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -914,9 +1019,8 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int tmp_reg = get_itemp(gen);
 
          /* If negative, subtract 1.0 */
-         spe_xor(gen->f, tmp_reg, tmp_reg, tmp_reg);
-         spe_fcgt(gen->f, d_reg, tmp_reg, s1_reg);
-         spe_selb(gen->f, tmp_reg, tmp_reg, get_const_one_reg(gen), d_reg);
+         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
          spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
 
          /* Convert float to int */
@@ -944,6 +1048,9 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    spe_comment(gen->f, -4, "FLR:");
 
+   int zero_reg = get_itemp(gen);
+   spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
+
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
@@ -951,9 +1058,8 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int tmp_reg = get_itemp(gen);
 
          /* If negative, subtract 1.0 */
-         spe_xor(gen->f, tmp_reg, tmp_reg, tmp_reg);
-         spe_fcgt(gen->f, d_reg, tmp_reg, s1_reg);
-         spe_selb(gen->f, tmp_reg, tmp_reg, get_const_one_reg(gen), d_reg);
+         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
          spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
 
          /* Convert float to int */
@@ -1148,6 +1254,7 @@ emit_instruction(struct codegen *gen,
 {
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
       return emit_MOV(gen, inst);
    case TGSI_OPCODE_MUL:
       return emit_MUL(gen, inst);
@@ -1165,6 +1272,8 @@ emit_instruction(struct codegen *gen,
       return emit_DP4(gen, inst);
    case TGSI_OPCODE_DPH:
       return emit_DPH(gen, inst);
+   case TGSI_OPCODE_XPD:
+      return emit_XPD(gen, inst);
    case TGSI_OPCODE_RCP:
       return emit_RCP(gen, inst);
    case TGSI_OPCODE_RSQ:
@@ -1189,6 +1298,8 @@ emit_instruction(struct codegen *gen,
       return emit_MAX(gen, inst);
    case TGSI_OPCODE_MIN:
       return emit_MIN(gen, inst);
+   case TGSI_OPCODE_TRUNC:
+      return emit_TRUNC(gen, inst);
    case TGSI_OPCODE_FLR:
       return emit_FLR(gen, inst);
    case TGSI_OPCODE_FRC:
-- 
cgit v1.2.3


From 6642380841b8cc0d166bf1c6a76be786e1c50825 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Mon, 22 Sep 2008 14:33:53 -0600
Subject: cell: Fixed bug with absolute, negate, set-negative logic in source
 fetch for TGSI instructions.   The logic should operate on the original
 channel, not the swizzled channel.

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 4b8189207d3..8972b5b1ea9 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -184,31 +184,27 @@ get_src_reg(struct codegen *gen,
    assert(swizzle >= TGSI_SWIZZLE_X);
    assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
 
-   channel = swizzle;
-
    switch (src->SrcRegister.File) {
    case TGSI_FILE_TEMPORARY:
-      reg = gen->temp_regs[src->SrcRegister.Index][channel];
+      reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
       break;
    case TGSI_FILE_INPUT:
       {
-         if(channel == TGSI_EXTSWIZZLE_ONE)
+         if(swizzle == TGSI_EXTSWIZZLE_ONE)
          {
             /* Load const one float and early out */
             reg = get_const_one_reg(gen);
-            return reg;
          }
-         else if(channel == TGSI_EXTSWIZZLE_ZERO)
+         else if(swizzle == TGSI_EXTSWIZZLE_ZERO)
          {
             /* Load const zero float and early out */
             reg = get_itemp(gen);
             spe_xor(gen->f, reg, reg, reg);
-            return reg;
          }
          else
          {
             /* offset is measured in quadwords, not bytes */
-            int offset = src->SrcRegister.Index * 4 + channel;
+            int offset = src->SrcRegister.Index * 4 + swizzle;
             reg = get_itemp(gen);
             reg_is_itemp = TRUE;
             /* Load:  reg = memory[(machine_reg) + offset] */
@@ -217,7 +213,7 @@ get_src_reg(struct codegen *gen,
       }
       break;
    case TGSI_FILE_IMMEDIATE:
-      reg = gen->imm_regs[src->SrcRegister.Index][channel];
+      reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
       break;
    case TGSI_FILE_CONSTANT:
       /* xxx fall-through for now / fix */
-- 
cgit v1.2.3


From 1c79cf15c48e51cb5cf790f44214ae6aaf78c69b Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Tue, 23 Sep 2008 10:11:59 -0600
Subject: CELL: fix colormask code generation

The colormask code generation had assumed that its input packed pixels were
in RGBA format.  In fact, the format they're in is dependent on the
pipe color format.

Now the color format is passed in to gen_colormask(), and proper
color format-dependent SPU code is generated.
---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 161 +++++++++++------------
 1 file changed, 78 insertions(+), 83 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 1837b4c79bd..3b166e446d6 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -971,87 +971,6 @@ gen_logicop(const struct pipe_blend_state *blend,
 }
 
 
-static void
-gen_colormask(uint colormask,
-              struct spe_function *f,
-              int fragRGBA_reg, int fbRGBA_reg)
-{
-   /* We've got four 32-bit RGBA packed pixels in each of
-    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
-    * reds, greens, blues, and alphas.
-    * */
-
-   /* The color mask operation can prevent any set of color
-    * components in the incoming fragment from being written to the frame 
-    * buffer; we do this by replacing the masked components of the 
-    * fragment with the frame buffer values.
-    *
-    * There are only 16 possibilities, with a unique mask for
-    * each of the possibilities.  (Technically, there are only 15
-    * possibilities, since we shouldn't be called for the one mask
-    * that does nothing, but the complete implementation is here
-    * anyway to avoid confusion.)
-    *
-    * We implement this via a constant static array which we'll index 
-    * into to get the correct mask.
-    * 
-    * We're dependent on the mask values being low-order bits,
-    * with particular values for each bit; so we start with a
-    * few assertions, which will fail if any of the values were
-    * to change.
-    */
-   ASSERT(PIPE_MASK_R == 0x1);
-   ASSERT(PIPE_MASK_G == 0x2);
-   ASSERT(PIPE_MASK_B == 0x4);
-   ASSERT(PIPE_MASK_A == 0x8);
-
-   /* Here's the list of all possible colormasks, indexed by the
-    * value of the combined mask specifier.
-    */
-   static const unsigned int colormasks[16] = {
-      0x00000000, /* 0: all colors masked */
-      0xff000000, /* 1: PIPE_MASK_R */
-      0x00ff0000, /* 2: PIPE_MASK_G */
-      0xffff0000, /* 3: PIPE_MASK_R | PIPE_MASK_G */
-      0x0000ff00, /* 4: PIPE_MASK_B */
-      0xff00ff00, /* 5: PIPE_MASK_R | PIPE_MASK_B */
-      0x00ffff00, /* 6: PIPE_MASK_G | PIPE_MASK_B */
-      0xffffff00, /* 7: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B */
-      0x000000ff, /* 8: PIPE_MASK_A */
-      0xff0000ff, /* 9: PIPE_MASK_R | PIPE_MASK_A */
-      0x00ff00ff, /* 10: PIPE_MASK_G | PIPE_MASK_A */
-      0xffff00ff, /* 11: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_A */
-      0x0000ffff, /* 12: PIPE_MASK_B | PIPE_MASK_A */
-      0xff00ffff, /* 13: PIPE_MASK_R | PIPE_MASK_B | PIPE_MASK_A */
-      0x00ffffff, /* 14: PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
-      0xffffffff  /* 15: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
-   };
-
-   /* Get a temporary register to hold the mask */
-   int colormask_reg = spe_allocate_available_register(f);
-
-   /* Look up the desired mask directly and load it into the mask register.
-    * This will load the same mask into each of the four words in the
-    * mask register.
-    */
-   spe_load_uint(f, colormask_reg, colormasks[colormask]);
-
-   /* Use the mask register to select between the fragment color
-    * values and the frame buffer color values.  Wherever the
-    * mask has a 0 bit, the current frame buffer color should override
-    * the fragment color.  Wherever the mask has a 1 bit, the 
-    * fragment color should persevere.  The Select Bits (selb rt, rA, rB, rM)
-    * instruction will select bits from its first operand rA wherever the
-    * the mask bits rM are 0, and from its second operand rB wherever the
-    * mask bits rM are 1.  That means that the frame buffer color is the
-    * first operand, and the fragment color the second.
-    */
-    spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
-
-    /* Release the temporary register and we're done */
-    spe_release_register(f, colormask_reg);
-}
-
 /**
  * Generate code to pack a quad of float colors into four 32-bit integers.
  *
@@ -1118,8 +1037,85 @@ gen_pack_colors(struct spe_function *f,
    spe_release_register(f, ba_reg);
 }
 
+static void
+gen_colormask(struct spe_function *f,
+              uint colormask,
+              enum pipe_format color_format,
+              int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* Emit SPE code applying the PIPE_MASK_x write mask 'colormask'.
+    * fragRGBA_reg and fbRGBA_reg each hold four packed 32-bit pixels,
+    * not sets of floating-point reds, greens, blues, and alphas, and
+    * the pixels are packed according to 'color_format', not necessarily
+    * RGBA.  The merged result is left in fragRGBA_reg.
+    */
+   unsigned int r_mask = 0;
+   unsigned int g_mask = 0;
+   unsigned int b_mask = 0;
+   unsigned int a_mask = 0;
+   /* Calculate exactly where the bits for any particular color end up, so
+    * we can mask them correctly.  The masks start at zero so that, should
+    * ASSERT be compiled out, an unknown format keeps the framebuffer color.
+    */
+   switch(color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         /* ARGB */
+         a_mask = 0xff000000;
+         r_mask = 0x00ff0000;
+         g_mask = 0x0000ff00;
+         b_mask = 0x000000ff;
+         break;
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         /* BGRA */
+         b_mask = 0xff000000;
+         g_mask = 0x00ff0000;
+         r_mask = 0x0000ff00;
+         a_mask = 0x000000ff;
+         break;
+      default:
+         ASSERT(0);
+   }
 
+   /* For each R, G, B, and A component we're supposed to mask out,
+    * clear its bits.   Then our mask operation later will work
+    * as expected.
+    */
+   if (!(colormask & PIPE_MASK_R)) {
+      r_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_G)) {
+      g_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_B)) {
+      b_mask = 0;
+   }
+   if (!(colormask & PIPE_MASK_A)) {
+      a_mask = 0;
+   }
+
+   /* Get a temporary register to hold the mask that will be applied to the fragment */
+   int colormask_reg = spe_allocate_available_register(f);
 
+   /* The actual mask we're going to use is an OR of the remaining R, G, B, and A
+    * masks.  Load the result value into our temporary register.
+    */
+   spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
+
+   /* Use the mask register to select between the fragment color
+    * values and the frame buffer color values.  Wherever the
+    * mask has a 0 bit, the current frame buffer color should override
+    * the fragment color.  Wherever the mask has a 1 bit, the
+    * fragment color should persevere.  The Select Bits (selb rt, rA, rB, rM)
+    * instruction will select bits from its first operand rA wherever
+    * the mask bits rM are 0, and from its second operand rB wherever the
+    * mask bits rM are 1.  That means that the frame buffer color is the
+    * first operand, and the fragment color the second.
+    */
+    spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
+
+    /* Release the temporary register and we're done */
+    spe_release_register(f, colormask_reg);
+}
 
 /**
  * Generate SPE code to implement the fragment operations (alpha test,
@@ -1383,7 +1379,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       }
 
       if (blend->colormask != PIPE_MASK_RGBA) {
-         gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
+         gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
       }
 
 
@@ -1407,7 +1403,6 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
    spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
 
-
    spe_release_register(f, fbRGBA_reg);
    spe_release_register(f, fbZS_reg);
    spe_release_register(f, quad_offset_reg);
-- 
cgit v1.2.3


From f5127909fb0386c2c11a2c26886eb02808ed514e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 09:32:09 -0600
Subject: cell: inst reorder to save a cycle

---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 3b166e446d6..a353756c711 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1196,8 +1196,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
       ASSERT(TILE_SIZE == 32);
 
-      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
+      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
       spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */
       spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */
-- 
cgit v1.2.3


From 164fb1299e1614ce05ae539d832567469eedb402 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 09:38:40 -0600
Subject: cell: checkpoint: support for function calls in SPU shaders

Will be used for instructions like SIN/COS/POW/TEX/etc.  The PPU needs to
know the address of some functions in the SPU address space.  Send that
info to the PPU/main memory rather than patch up shaders on the SPU side.
Not finished/tested yet...
---
 src/gallium/drivers/cell/common.h           |  18 ++++-
 src/gallium/drivers/cell/ppu/cell_context.h |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fp.c  |  81 ++++++++++++++++++++-
 src/gallium/drivers/cell/ppu/cell_spu.c     |   8 +++
 src/gallium/drivers/cell/spu/Makefile       |   3 +-
 src/gallium/drivers/cell/spu/spu_funcs.c    | 106 ++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_funcs.h    |  35 +++++++++
 src/gallium/drivers/cell/spu/spu_main.c     |   5 ++
 8 files changed, 254 insertions(+), 3 deletions(-)
 create mode 100644 src/gallium/drivers/cell/spu/spu_funcs.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_funcs.h

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index f0ff96eb478..99329fd8e22 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -130,7 +130,7 @@ struct cell_command_fragment_ops
 #define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128
 
 /**
- * Command to send a fragment progra to SPUs.
+ * Command to send a fragment program to SPUs.
  */
 struct cell_command_fragment_program
 {
@@ -267,6 +267,20 @@ struct cell_command
 } ALIGN16_ATTRIB;
 
 
+#define MAX_SPU_FUNCTIONS 12
+/**
+ * Used to tell the PPU about the address of particular functions in the
+ * SPU's address space.
+ */
+struct cell_spu_function_info
+{
+   uint num;                            /**< Number of valid entries */
+   char names[MAX_SPU_FUNCTIONS][16];   /**< NUL-terminated function names */
+   uint addrs[MAX_SPU_FUNCTIONS];       /**< Address of each named function */
+   char pad[12];   /**< Pad struct to multiple of 16 bytes (256 currently) */
+};
+
+
 /** This is the object passed to spe_create_thread() */
 struct cell_init_info
 {
@@ -278,6 +292,8 @@ struct cell_init_info
    /** Buffers for command batches, vertex/index data */
    ubyte *buffers[CELL_NUM_BUFFERS];
    uint *buffer_status;  /**< points at cell_context->buffer_status */
+
+   struct cell_spu_function_info *spu_functions;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 14914b9c6f8..a9ad84bb184 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -149,6 +149,7 @@ struct cell_context
    /** Mapped constant buffers */
    void *mapped_constants[PIPE_SHADER_TYPES];
 
+   struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
 
    uint num_spus;
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 8972b5b1ea9..fd12af19cef 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -37,7 +37,7 @@
  * \author Brian Paul
  */
 
-
+#include <math.h>
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -64,6 +64,7 @@
  */
 struct codegen
 {
+   struct cell_context *cell;
    int inputs_reg;      /**< 1st function parameter */
    int outputs_reg;     /**< 2nd function parameter */
    int constants_reg;   /**< 3rd function parameter */
@@ -1076,6 +1077,76 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 
+#if 0 /* debug helper: print each registered SPU function name and address */
+static void
+print_functions(struct cell_context *cell)
+{
+   struct cell_spu_function_info *funcs = &cell->spu_functions;
+   uint i;
+   for (i = 0; i < funcs->num; i++) {
+      printf("SPU func %u: %s at %u\n",
+             i, funcs->names[i], funcs->addrs[i]);
+   }
+}
+#endif
+
+
+/**
+ * Emit code to call the SPU function 'funcname' (for SIN/COS/POW/TEX/etc).
+ * Returns false, emitting nothing, if the name is not in cell->spu_functions.
+ */
+static boolean
+emit_function_call(struct codegen *gen,
+                   const struct tgsi_full_instruction *inst,
+                   const char *funcname, uint num_args)
+{
+   const struct cell_spu_function_info *funcs = &gen->cell->spu_functions;
+   char comment[100];
+   uint addr = 0;
+   uint i;
+   int ch;
+
+   assert(num_args <= 2);
+
+   /* lookup function address; stop at the first matching name */
+   for (i = 0; i < funcs->num; i++) {
+      if (strcmp(funcs->names[i], funcname) == 0) {
+         addr = funcs->addrs[i];
+         break;
+      }
+   }
+   if (!addr) {
+      /* never emit a call to address 0, even when asserts are disabled */
+      assert(addr && "spu function not found");
+      return false;
+   }
+
+   snprintf(comment, sizeof(comment), "CALL %s:", funcname);
+   spe_comment(gen->f, -4, comment);
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int s_regs[3];
+         uint a;
+         for (a = 0; a < num_args; a++) {
+            s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+         }
+
+         /* XXX not done: the operands are not yet passed to the callee */
+         (void) s_regs;
+         (void) d_reg;
+
+         spe_bisl(gen->f, SPE_REG_RA, addr, 0, 0); /* XXX untested! */
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return true;
+}
+
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
@@ -1303,6 +1374,13 @@ emit_instruction(struct codegen *gen,
    case TGSI_OPCODE_END:
       return emit_END(gen);
 
+   case TGSI_OPCODE_COS:
+      return emit_function_call(gen, inst, "spu_cos", 1);
+   case TGSI_OPCODE_SIN:
+      return emit_function_call(gen, inst, "spu_sin", 1);
+   case TGSI_OPCODE_POW:
+      return emit_function_call(gen, inst, "spu_pow", 2);
+
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
    case TGSI_OPCODE_ELSE:
@@ -1431,6 +1509,7 @@ cell_gen_fragment_program(struct cell_context *cell,
    struct codegen gen;
 
    memset(&gen, 0, sizeof(gen));
+   gen.cell = cell;
    gen.f = f;
 
    /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 9508227e298..df020c4146d 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -36,6 +36,7 @@
 #include "cell_spu.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
+#include "util/u_memory.h"
 #include "cell/common.h"
 
 
@@ -131,6 +132,11 @@ cell_start_spus(struct cell_context *cell)
    ASSERT_ALIGN16(&cell_global.inits[0]);
    ASSERT_ALIGN16(&cell_global.inits[1]);
 
+   /*
+    * Initialize the global 'inits' structure for each SPU.
+    * A pointer to the init struct will be passed to each SPU.
+    * The SPUs will then each grab their init info with mfc_get().
+    */
    for (i = 0; i < cell->num_spus; i++) {
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
@@ -141,6 +147,8 @@ cell_start_spus(struct cell_context *cell)
       }
       cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
 
+      cell_global.inits[i].spu_functions = &cell->spu_functions;
+
       cell_global.spe_contexts[i] = spe_context_create(0, NULL);
       if (!cell_global.spe_contexts[i]) {
          fprintf(stderr, "spe_context_create() failed\n");
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 1ae0dfb8c10..c2db85247e0 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -16,8 +16,9 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 
 SOURCES = \
-	spu_main.c \
+	spu_funcs.c \
 	spu_dcache.c \
+	spu_main.c \
 	spu_per_fragment_op.c \
 	spu_render.c \
 	spu_texture.c \
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
new file mode 100644
index 00000000000..d1749565187
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -0,0 +1,106 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU functions accessed by shaders.
+ *
+ * Authors: Brian Paul
+ */
+
+
+#include <string.h>
+#include <libmisc.h>
+#include <cos8_v.h>
+#include <sin8_v.h>
+
+#include "cell/common.h"
+#include "spu_main.h"
+#include "spu_funcs.h"
+
+
+#define M_PI 3.1415926
+
+
+static vector float
+spu_cos(vector float x)
+{
+   static const float inv_two_pi = 1.0 / (2.0 * M_PI); /* 1/(2*pi) */
+   const vector float scaled = spu_mul(x, spu_splats(inv_two_pi)); /* normalize */
+   return _cos8_v(scaled);
+}
+
+static vector float
+spu_sin(vector float x)
+{
+   static const float inv_two_pi = 1.0 / (2.0 * M_PI); /* 1/(2*pi) */
+   const vector float scaled = spu_mul(x, spu_splats(inv_two_pi)); /* normalize */
+   return _sin8_v(scaled);   /* 8-bit accuracy enough?? */
+}
+
+/** Append a (name, SPU address) entry to the table; the table must not be full. */
+static void
+add_func(struct cell_spu_function_info *spu_functions,
+         const char *name, void *addr)
+{
+   const uint n = spu_functions->num;
+   ASSERT(n < MAX_SPU_FUNCTIONS && strlen(name) < sizeof(spu_functions->names[n]));
+   strcpy(spu_functions->names[n], name);
+   spu_functions->addrs[n] = (uint) addr;
+   spu_functions->num = n + 1;
+}
+
+
+/**
+ * Return info about the SPU's functions to the PPU / main memory.
+ * The PPU needs to know the address of some SPU-side functions so
+ * that we can generate shader code with function calls.
+ */
+void
+return_function_info(void)
+{
+   struct cell_spu_function_info funcs ALIGN16_ATTRIB;
+   int tag = TAG_MISC;
+
+   ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
+   /* Zero the whole table so unused slots and padding are well defined */
+   memset(&funcs, 0, sizeof(funcs));
+   add_func(&funcs, "spu_cos", &spu_cos);
+   add_func(&funcs, "spu_sin", &spu_sin);
+
+   /* Send the function info back to the PPU / main memory */
+   mfc_put((void *) &funcs,  /* src in local store */
+           (unsigned int) spu.init.spu_functions, /* dst in main memory */
+           sizeof(funcs),  /* bytes */
+           tag,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << tag);
+}
+
+
+
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h
new file mode 100644
index 00000000000..3adb6ae99f9
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_funcs.h
@@ -0,0 +1,35 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_FUNCS_H
+#define SPU_FUNCS_H
+
+extern void
+return_function_info(void);
+
+#endif
+
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index b4d30228f7a..6ef65d5645d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -32,6 +32,7 @@
 #include <stdio.h>
 #include <libmisc.h>
 
+#include "spu_funcs.h"
 #include "spu_main.h"
 #include "spu_render.h"
 #include "spu_per_fragment_op.h"
@@ -721,6 +722,10 @@ main(main_param_t speid, main_param_t argp)
            0  /* rid */);
    wait_on_mask( 1 << tag );
 
+   if (spu.init.id == 0) {
+      return_function_info();
+   }
+
 #if 0
    if (spu.init.id==0)
       spu_test_misc(spu.init.id);
-- 
cgit v1.2.3


From 6741739d1e7a2c66576b671a81eaf0c4b9737ec2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 09:48:17 -0600
Subject: cell: remove unneeded blend/depth_stencil subclasses

---
 src/gallium/drivers/cell/ppu/cell_context.h      | 33 ++----------------
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c |  5 ++-
 src/gallium/drivers/cell/ppu/cell_pipe_state.c   | 44 +++++-------------------
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |  5 ++-
 4 files changed, 15 insertions(+), 72 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index a9ad84bb184..3dc15c9233c 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -66,35 +66,6 @@ struct cell_fragment_shader_state
 };
 
 
-/**
- * Cell blend state atom, subclass of pipe_blend_state.
- */
-struct cell_blend_state
-{
-   struct pipe_blend_state base;
-
-   /**
-    * Generated code to perform alpha blending
-    */
-   struct spe_function code;
-};
-
-
-/**
- * Cell depth/stencil/alpha state atom, subclass of
- * pipe_depth_stencil_alpha_state.
- */
-struct cell_depth_stencil_alpha_state
-{
-   struct pipe_depth_stencil_alpha_state base;
-
-   /**
-    * Generated code to perform alpha, stencil, and depth testing on the SPE
-    */
-   struct spe_function code;
-};
-
-
 /**
  * Per-context state, subclass of pipe_context.
  */
@@ -104,10 +75,10 @@ struct cell_context
 
    struct cell_winsys *winsys;
 
-   const struct cell_blend_state *blend;
+   const struct pipe_blend_state *blend;
    const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
    uint num_samplers;
-   const struct cell_depth_stencil_alpha_state   *depth_stencil;
+   const struct pipe_depth_stencil_alpha_state *depth_stencil;
    const struct pipe_rasterizer_state *rasterizer;
    const struct cell_vertex_shader_state *vs;
    const struct cell_fragment_shader_state *fs;
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index a353756c711..653afc235df 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1140,9 +1140,8 @@ gen_colormask(struct spe_function *f,
 void
 cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 {
-   const struct pipe_depth_stencil_alpha_state *dsa =
-      &cell->depth_stencil->base;
-   const struct pipe_blend_state *blend = &cell->blend->base;
+   const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
+   const struct pipe_blend_state *blend = cell->blend;
    const struct pipe_blend_color *blend_color = &cell->blend_color;
    const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
 
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index ea820aca744..b545d2d6975 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -37,7 +37,6 @@
 #include "cell_flush.h"
 #include "cell_state.h"
 #include "cell_texture.h"
-#include "cell_state_per_fragment.h"
 
 
@@ -45,13 +44,7 @@ static void *
 cell_create_blend_state(struct pipe_context *pipe,
                         const struct pipe_blend_state *blend)
 {
-   struct cell_blend_state *cb = MALLOC(sizeof(struct cell_blend_state));
-
-   (void) memcpy(cb, blend, sizeof(*blend));
-#if 0
-   cell_generate_alpha_blend(cb);
-#endif
-   return cb;
+   return mem_dup(blend, sizeof(*blend));
 }
 
 
@@ -62,7 +55,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *state)
 
    draw_flush(cell->draw);
 
-   cell->blend = (struct cell_blend_state *) state;
+   cell->blend = (struct pipe_blend_state *) state;
    cell->dirty |= CELL_NEW_BLEND;
 }
 
@@ -70,12 +63,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *state)
 static void
 cell_delete_blend_state(struct pipe_context *pipe, void *blend)
 {
-   struct cell_blend_state *cb = (struct cell_blend_state *) blend;
-
-#if 0
-   spe_release_func(& cb->code);
-#endif
-   FREE(cb);
+   FREE(blend);
 }
 
 
@@ -97,43 +85,29 @@ cell_set_blend_color(struct pipe_context *pipe,
 
 static void *
 cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
-                 const struct pipe_depth_stencil_alpha_state *depth_stencil)
+                 const struct pipe_depth_stencil_alpha_state *dsa)
 {
-   struct cell_depth_stencil_alpha_state *cdsa =
-       MALLOC(sizeof(struct cell_depth_stencil_alpha_state));
-
-   (void) memcpy(cdsa, depth_stencil, sizeof(*depth_stencil));
-#if 0
-   cell_generate_depth_stencil_test(cdsa);
-#endif
-   return cdsa;
+   return mem_dup(dsa, sizeof(*dsa));
 }
 
 
 static void
 cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
-                                    void *depth_stencil)
+                                    void *dsa)
 {
    struct cell_context *cell = cell_context(pipe);
 
    draw_flush(cell->draw);
 
-   cell->depth_stencil =
-       (struct cell_depth_stencil_alpha_state *) depth_stencil;
+   cell->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa;
    cell->dirty |= CELL_NEW_DEPTH_STENCIL;
 }
 
 
 static void
-cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
+cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa)
 {
-   struct cell_depth_stencil_alpha_state *cdsa =
-       (struct cell_depth_stencil_alpha_state *) depth;
-
-#if 0
-   spe_release_func(& cdsa->code);
-#endif
-   FREE(cdsa);
+   FREE(dsa);
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 8a389cd6aae..f35893537bf 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -30,7 +30,6 @@
 #include "cell_gen_fragment.h"
 #include "cell_state.h"
 #include "cell_state_emit.h"
-#include "cell_state_per_fragment.h"
 #include "cell_batch.h"
 #include "cell_texture.h"
 #include "draw/draw_context.h"
@@ -110,8 +109,8 @@ cell_emit_state(struct cell_context *cell)
       fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
       memcpy(&fops->code, spe_code.store,
              SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-      fops->dsa = cell->depth_stencil->base;
-      fops->blend = cell->blend->base;
+      fops->dsa = *cell->depth_stencil;
+      fops->blend = *cell->blend;
 
       /* free codegen buffer */
       spe_release_func(&spe_code);
-- 
cgit v1.2.3


From b5303446a8683afdb3247f2aaf01b6df2cb7d280 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 09:53:03 -0600
Subject: cell: asst clean-up, var renaming

---
 src/gallium/drivers/cell/ppu/cell_pipe_state.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index b545d2d6975..8c55b8e0933 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -49,13 +49,13 @@ cell_create_blend_state(struct pipe_context *pipe,
 
 
 static void
-cell_bind_blend_state(struct pipe_context *pipe, void *state)
+cell_bind_blend_state(struct pipe_context *pipe, void *blend)
 {
    struct cell_context *cell = cell_context(pipe);
 
    draw_flush(cell->draw);
 
-   cell->blend = (struct pipe_blend_state *) state;
+   cell->blend = (struct pipe_blend_state *) blend;
    cell->dirty |= CELL_NEW_BLEND;
 }
 
@@ -169,24 +169,23 @@ cell_set_polygon_stipple( struct pipe_context *pipe,
 
 static void *
 cell_create_rasterizer_state(struct pipe_context *pipe,
-                             const struct pipe_rasterizer_state *setup)
+                             const struct pipe_rasterizer_state *rasterizer)
 {
-   struct pipe_rasterizer_state *state
-      = MALLOC(sizeof(struct pipe_rasterizer_state));
-   memcpy(state, setup, sizeof(struct pipe_rasterizer_state));
-   return state;
+   return mem_dup(rasterizer, sizeof(*rasterizer));
 }
 
 
 static void
-cell_bind_rasterizer_state(struct pipe_context *pipe, void *setup)
+cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast)
 {
+   struct pipe_rasterizer_state *rasterizer =
+      (struct pipe_rasterizer_state *) rast;
    struct cell_context *cell = cell_context(pipe);
 
    /* pass-through to draw module */
-   draw_set_rasterizer_state(cell->draw, setup);
+   draw_set_rasterizer_state(cell->draw, rasterizer);
 
-   cell->rasterizer = (struct pipe_rasterizer_state *)setup;
+   cell->rasterizer = rasterizer;
 
    cell->dirty |= CELL_NEW_RASTERIZER;
 }
-- 
cgit v1.2.3


From bac5900a14b85a6513fae7eef19a5ed1d26b2011 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 09:58:17 -0600
Subject: cell: align instruction buffers to 8-byte, not 32-byte boundary

---
 src/gallium/drivers/cell/spu/spu_main.c | 4 ++--
 src/gallium/drivers/cell/spu/spu_main.h | 8 ++++----
 src/gallium/include/pipe/p_compiler.h   | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 6ef65d5645d..8f3e3785c17 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -706,8 +706,8 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code) % 32 == 0);
-   ASSERT(((unsigned long) &spu.fragment_program_code) % 32 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 72e540fcff2..29a305232ec 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -143,13 +143,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
-   /** Current fragment ops machine code, at 32-byte boundary */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN32_ATTRIB;
+   /** Current fragment ops machine code, at 8-byte boundary */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_ops_func fragment_ops;
 
-   /** Current fragment program machine code, at 32-byte boundary */
-   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN32_ATTRIB;
+   /** Current fragment program machine code, at 8-byte boundary */
+   uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
    /** Current fragment ops function */
    spu_fragment_program_func fragment_program;
 
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 1e702c7fa74..7bcebd3d6b6 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -144,12 +144,12 @@ typedef unsigned char boolean;
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___aligned[SIZE] __attribute__(( aligned( 16 ) ))
 #define ALIGN16_ASSIGN(NAME) NAME##___aligned
 #define ALIGN16_ATTRIB  __attribute__(( aligned( 16 ) ))
-#define ALIGN32_ATTRIB  __attribute__(( aligned( 32 ) ))
+#define ALIGN8_ATTRIB  __attribute__(( aligned( 8 ) ))
 #else
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___unaligned[SIZE + 1]
 #define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned)
 #define ALIGN16_ATTRIB
-#define ALIGN32_ATTRIB
+#define ALIGN8_ATTRIB
 #endif
 
 
-- 
cgit v1.2.3


From a1189ea882714282b884d37e530cd638dd4ca660 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 10:00:14 -0600
Subject: cell: move really_clear_tiles()

---
 src/gallium/drivers/cell/spu/spu_main.c | 38 ---------------------------------
 src/gallium/drivers/cell/spu/spu_tile.c | 37 ++++++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_tile.h |  6 ++++--
 3 files changed, 41 insertions(+), 40 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 8f3e3785c17..b45e79a30b1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -117,44 +117,6 @@ release_buffer(uint buffer)
 }
 
 
-/**
- * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
- * tiles back to the main framebuffer.
- */
-static void
-really_clear_tiles(uint surfaceIndex)
-{
-   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   uint i;
-
-   if (surfaceIndex == 0) {
-      clear_c_tile(&spu.ctile);
-
-      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-         uint tx = i % spu.fb.width_tiles;
-         uint ty = i / spu.fb.width_tiles;
-         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
-            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
-         }
-      }
-   }
-   else {
-      clear_z_tile(&spu.ztile);
-
-      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-         uint tx = i % spu.fb.width_tiles;
-         uint ty = i / spu.fb.width_tiles;
-         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
-            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
-      }
-   }
-
-#if 0
-   wait_on_mask(1 << TAG_SURFACE_CLEAR);
-#endif
-}
-
-
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
index 216a33126b7..6905015a483 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.c
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -87,3 +87,40 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
            0  /* rid */);
 }
 
+
+/**
+ * Write solid-filled tiles to the main framebuffer for this SPU's tiles
+ * still marked TILE_STATUS_CLEAR (surfaceIndex 0 = color, else depth).
+ */
+void
+really_clear_tiles(uint surfaceIndex)
+{
+   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   uint i;
+
+   if (surfaceIndex == 0) {
+      clear_c_tile(&spu.ctile);
+
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         }
+      }
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+      /* the depth surface must be filled from ztile, the tile just cleared */
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+#if 0
+   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+#endif
+}
diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h
index 1b5491112db..7bfb52be8f3 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.h
+++ b/src/gallium/drivers/cell/spu/spu_tile.h
@@ -36,12 +36,14 @@
 
 
-void
+extern void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
 
-void
+extern void
 put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
 
+extern void
+really_clear_tiles(uint surfaceIndex);
 
 
 static INLINE void
-- 
cgit v1.2.3


From f45d39fa34ca36839c684fdcadd1476360de3a63 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 10:02:58 -0600
Subject: cell: move debug macros into new spu_debug.h

---
 src/gallium/drivers/cell/spu/spu_debug.h | 60 ++++++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_main.c  | 30 ++--------------
 2 files changed, 63 insertions(+), 27 deletions(-)
 create mode 100644 src/gallium/drivers/cell/spu/spu_debug.h

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
new file mode 100644
index 00000000000..bbe5889c4b3
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_debug.h
@@ -0,0 +1,60 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_DEBUG_H
+#define SPU_DEBUG_H
+
+
+/* Set to 0 to disable all extraneous debugging code */
+#define DEBUG 1
+
+#if DEBUG
+boolean Debug = FALSE;
+boolean force_fragment_ops_fallback = TRUE;
+
+/* These debug macros use the unusual construction ", ##__VA_ARGS__"
+ * which expands to the expected comma + args if variadic arguments
+ * are supplied, but swallows the comma if there are no variadic
+ * arguments (which avoids syntax errors that would otherwise occur).
+ */
+#define DEBUG_PRINTF(format,...) \
+   do { if (Debug) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__); } while (0)
+#define D_PRINTF(flag, format,...) \
+   do { if (spu.init.debug_flags & (flag)) \
+      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__); } while (0)
+
+#else
+
+#define DEBUG_PRINTF(...)
+#define D_PRINTF(...)
+
+#endif
+
+
+#endif /* SPU_DEBUG_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index b45e79a30b1..ea01728824f 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -32,6 +32,8 @@
 #include <stdio.h>
 #include <libmisc.h>
 
+#include "pipe/p_defines.h"
+
 #include "spu_funcs.h"
 #include "spu_main.h"
 #include "spu_render.h"
@@ -41,8 +43,8 @@
 //#include "spu_test.h"
 #include "spu_vertex_shader.h"
 #include "spu_dcache.h"
+#include "spu_debug.h"
 #include "cell/common.h"
-#include "pipe/p_defines.h"
 
 
 /*
@@ -51,32 +53,6 @@ helpful headers:
 /opt/cell/sdk/usr/include/libmisc.h
 */
 
-/* Set to 0 to disable all extraneous debugging code */
-#define DEBUG 1
-
-#if DEBUG
-boolean Debug = FALSE;
-boolean force_fragment_ops_fallback = TRUE;
-
-/* These debug macros use the unusual construction ", ##__VA_ARGS__"
- * which expands to the expected comma + args if variadic arguments
- * are supplied, but swallows the comma if there are no variadic
- * arguments (which avoids syntax errors that would otherwise occur).
- */
-#define DEBUG_PRINTF(format,...) \
-   if (Debug) \
-      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
-#define D_PRINTF(flag, format,...) \
-   if (spu.init.debug_flags & (flag)) \
-      printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__)
-
-#else
-
-#define DEBUG_PRINTF(...)
-#define D_PRINTF(...)
-
-#endif
-
 struct spu_global spu;
 
 struct spu_vs_context draw;
-- 
cgit v1.2.3


From bb01c1a78eefeea6bc756d837fdd063660ac0230 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 10:10:08 -0600
Subject: cell: move debug-related declarations

---
 src/gallium/drivers/cell/spu/spu_debug.h | 4 ++--
 src/gallium/drivers/cell/spu/spu_main.c  | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h
index bbe5889c4b3..eeec0526558 100644
--- a/src/gallium/drivers/cell/spu/spu_debug.h
+++ b/src/gallium/drivers/cell/spu/spu_debug.h
@@ -34,8 +34,8 @@
 #define DEBUG 1
 
 #if DEBUG
-boolean Debug = FALSE;
-boolean force_fragment_ops_fallback = TRUE;
+extern boolean Debug;
+extern boolean force_fragment_ops_fallback;
 
 /* These debug macros use the unusual construction ", ##__VA_ARGS__"
  * which expands to the expected comma + args if variadic arguments
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index ea01728824f..bc94674fe82 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -58,6 +58,12 @@ struct spu_global spu;
 struct spu_vs_context draw;
 
 
+#if DEBUG
+boolean Debug = FALSE;
+boolean force_fragment_ops_fallback = TRUE;
+#endif
+
+
 /**
  * Buffers containing dynamically generated SPU code:
  */
-- 
cgit v1.2.3


From 9d00cd3fc726a3fe01b98fd222dd4c71b3e95d44 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 10:15:11 -0600
Subject: cell: move command processing code into new spu_command.c file

---
 src/gallium/drivers/cell/spu/Makefile      |   3 +-
 src/gallium/drivers/cell/spu/spu_command.c | 599 +++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_command.h |   7 +
 src/gallium/drivers/cell/spu/spu_main.c    | 558 +--------------------------
 4 files changed, 611 insertions(+), 556 deletions(-)
 create mode 100644 src/gallium/drivers/cell/spu/spu_command.c
 create mode 100644 src/gallium/drivers/cell/spu/spu_command.h

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index c2db85247e0..116453b79c5 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -16,8 +16,9 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 
 SOURCES = \
-	spu_funcs.c \
+	spu_command.c \
 	spu_dcache.c \
+	spu_funcs.c \
 	spu_main.c \
 	spu_per_fragment_op.c \
 	spu_render.c \
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
new file mode 100644
index 00000000000..ec9da5d8870
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -0,0 +1,599 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * SPU command processing code
+ */
+
+
+#include <stdio.h>
+#include <libmisc.h>
+
+#include "pipe/p_defines.h"
+
+#include "spu_command.h"
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_per_fragment_op.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "spu_debug.h"
+#include "cell/common.h"
+
+
+struct spu_vs_context draw;
+
+
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
+    ALIGN16_ATTRIB;
+
+
+
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writing a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const uint status[4] ALIGN16_ATTRIB
+      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
+
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *dst = spu.init.buffer_status + index;
+
+   ASSERT(buffer < CELL_NUM_BUFFERS);
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_MISC,            /* tag is unimportant */
+           0, /* tid */
+           0  /* rid */);
+}
+
+
+static void
+cmd_clear_surface(const struct cell_command_clear_surface *clear)
+{
+   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
+
+   if (clear->surface == 0) {
+      spu.fb.color_clear_value = clear->value;
+      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+            (spu.init.id << 20) | (spu.init.id << 28);
+         spu.fb.color_clear_value ^= x;
+      }
+   }
+   else {
+      spu.fb.depth_clear_value = clear->value;
+   }
+
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+   /* Simply set all tiles' status to CLEAR.
+    * When we actually begin rendering into a tile, we'll initialize it to
+    * the clear value.  If any tiles go untouched during the frame,
+    * really_clear_tiles() will set them to the clear value.
+    */
+   if (clear->surface == 0) {
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
+   }
+   else {
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
+   }
+
+#else
+
+   /*
+    * This path clears the whole framebuffer to the clear color right now.
+    */
+
+   /*
+   printf("SPU: %s num=%d w=%d h=%d\n",
+          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
+   */
+
+   /* init a single tile to the clear value */
+   if (clear->surface == 0) {
+      clear_c_tile(&spu.ctile);
+   }
+   else {
+      clear_z_tile(&spu.ztile);
+   }
+
+   /* walk over my tiles, writing the 'clear' tile's data */
+   {
+      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+      uint i;
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (clear->surface == 0)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         else
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+      wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   }
+
+#endif /* CLEAR_OPT */
+
+   DEBUG_PRINTF("CLEAR SURF done\n");
+}
+
+
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
+   ASSERT(release->vertex_buf != ~0U);
+   release_buffer(release->vertex_buf);
+}
+
+
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   static int warned = 0;
+
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Copy state info (for fallback case only) */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+
+   /* Parity twist!  For now, always use the fallback code by default,
+    * only switching to codegen when specifically requested.  This
+    * allows us to develop freely without risking taking down the
+    * branch.
+    *
+    * Later, the parity of this check will be reversed, so that
+    * codegen is *always* used, unless we specifically indicate that
+    * we don't want it.
+    *
+    * Eventually, the option will be removed completely, because in
+    * final code we'll always use codegen and won't even provide the
+    * raw state records that the fallback code requires.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
+      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+   }
+   else {
+      /* otherwise, the default fallback code remains in place */
+      if (!warned) {
+         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
+         warned = 1;
+      }
+   }
+
+   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
+   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+}
+
+
+static void
+cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
+{
+   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_program_code, fp->code,
+          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
+#if 01
+   /* Point function pointer at new code */
+   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
+#endif
+}
+
+
+static void
+cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
+{
+   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
+             cmd->width,
+             cmd->height,
+             cmd->color_start,
+             cmd->color_format,
+             cmd->depth_format);
+
+   ASSERT_ALIGN16(cmd->color_start);
+   ASSERT_ALIGN16(cmd->depth_start);
+
+   spu.fb.color_start = cmd->color_start;
+   spu.fb.depth_start = cmd->depth_start;
+   spu.fb.color_format = cmd->color_format;
+   spu.fb.depth_format = cmd->depth_format;
+   spu.fb.width = cmd->width;
+   spu.fb.height = cmd->height;
+   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
+   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
+
+   switch (spu.fb.depth_format) {
+   case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
+      break;
+   default:
+      spu.fb.zsize = 0;
+      break;
+   }
+}
+
+
+static void
+cmd_state_sampler(const struct cell_command_sampler *sampler)
+{
+   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+
+   spu.sampler[sampler->unit] = sampler->state;
+   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
+   else
+      spu.sample_texture[sampler->unit] = sample_texture_nearest;
+}
+
+
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+   const uint unit = texture->unit;
+   const uint width = texture->width;
+   const uint height = texture->height;
+
+   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
+             texture->unit, texture->start,
+             texture->width, texture->height);
+
+   spu.texture[unit].start = texture->start;
+   spu.texture[unit].width = width;
+   spu.texture[unit].height = height;
+
+   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+
+   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
+   spu.texture[unit].tex_size_mask = (vector unsigned int)
+         { width - 1, height - 1, 0, 0 };
+   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
+   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+}
+
+
+static void
+cmd_state_vertex_info(const struct vertex_info *vinfo)
+{
+   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);
+   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
+}
+
+
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned attr = vs_info->attr;
+
+   ASSERT(attr < PIPE_MAX_ATTRIBS);
+   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+   draw.vertex_fetch.size[attr] = vs_info->size;
+   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
+   draw.vertex_fetch.dirty = 1;
+}
+
+
+static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+   mfc_get(attribute_fetch_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
+static void
+cmd_finish(void)
+{
+   DEBUG_PRINTF("FINISH\n");
+   really_clear_tiles(0);
+   /* wait for all outstanding DMAs to finish */
+   mfc_write_tag_mask(~0);
+   mfc_read_tag_status_all();
+   /* send mbox message to PPU */
+   spu_write_out_mbox(CELL_CMD_FINISH);
+}
+
+
+/**
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
+ * \param opcode  bits 8..15: batch buffer index; bits 16 and up: byte size
+ */
+static void
+cmd_batch(uint opcode)
+{
+   const uint buf = (opcode >> 8) & 0xff;
+   uint size = (opcode >> 16);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
+   uint pos;
+
+   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
+             buf, size, spu.init.buffers[buf]);
+
+   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
+
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
+
+   size = ROUNDUP16(size);
+   /* the DMA below must not overrun the local staging buffer */
+   ASSERT(size <= sizeof(buffer));
+
+   mfc_get(buffer,  /* dest */
+           (unsigned int) spu.init.buffers[buf],  /* src */
+           size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   /* Tell PPU we're done copying the buffer to local store */
+   DEBUG_PRINTF("release batch buf %u\n", buf);
+   release_buffer(buf);
+
+   /*
+    * Loop over commands in the batch buffer
+    */
+   for (pos = 0; pos < usize; /* no incr */) {
+      switch (buffer[pos]) {
+      /*
+       * rendering commands
+       */
+      case CELL_CMD_CLEAR_SURFACE:
+         {
+            struct cell_command_clear_surface *clr
+               = (struct cell_command_clear_surface *) &buffer[pos];
+            cmd_clear_surface(clr);
+            pos += sizeof(*clr) / 8;
+         }
+         break;
+      case CELL_CMD_RENDER:
+         {
+            struct cell_command_render *render
+               = (struct cell_command_render *) &buffer[pos];
+            uint pos_incr;
+            cmd_render(render, &pos_incr);
+            pos += pos_incr;
+         }
+         break;
+      /*
+       * state-update commands
+       */
+      case CELL_CMD_STATE_FRAMEBUFFER:
+         {
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            pos += sizeof(*fops) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
+         {
+            struct cell_command_fragment_program *fp
+               = (struct cell_command_fragment_program *) &buffer[pos];
+            cmd_state_fragment_program(fp);
+            pos += sizeof(*fp) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_SAMPLER:
+         {
+            struct cell_command_sampler *sampler
+               = (struct cell_command_sampler *) &buffer[pos];
+            cmd_state_sampler(sampler);
+            pos += sizeof(*sampler) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_TEXTURE:
+         {
+            struct cell_command_texture *texture
+               = (struct cell_command_texture *) &buffer[pos];
+            cmd_state_texture(texture);
+            pos += sizeof(*texture) / 8;
+         }
+         break;
+      case CELL_CMD_STATE_VERTEX_INFO:
+         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+         break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+         break;
+      case CELL_CMD_STATE_UNIFORMS:
+         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+         pos += 2;
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+         break;
+      case CELL_CMD_STATE_BIND_VS:
+#if 0
+         spu_bind_vertex_shader(&draw,
+                                (struct cell_shader_info *) &buffer[pos+1]);
+#endif
+         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+         break;
+      case CELL_CMD_STATE_ATTRIB_FETCH:
+         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+                                &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+         break;
+      /*
+       * misc commands
+       */
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
+         }
+         break;
+      case CELL_CMD_FLUSH_BUFFER_RANGE: {
+         struct cell_buffer_range *br = (struct cell_buffer_range *)
+             &buffer[pos+1];
+
+         spu_dcache_mark_dirty((unsigned) br->base, br->size);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+         break;
+      }
+      default:
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+         ASSERT(0);
+         pos = usize;  /* unknown length: drop rest of batch, don't spin here */
+      }
+   }
+
+   DEBUG_PRINTF("BATCH complete\n");
+}
+
+
+
+/**
+ * Main loop for SPEs: Get a command, execute it, repeat.
+ */
+void
+command_loop(void)
+{
+   struct cell_command cmd;
+   int exitFlag = 0;
+
+   DEBUG_PRINTF("Enter command loop\n");
+
+   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
+   ASSERT_ALIGN16(&cmd);
+
+   while (!exitFlag) {
+      unsigned opcode;
+      int tag = 0;
+
+      DEBUG_PRINTF("Wait for cmd...\n");
+
+      /* read/wait from mailbox */
+      opcode = (unsigned int) spu_read_in_mbox();
+
+      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
+
+      /* command payload */
+      mfc_get(&cmd,  /* dest */
+              (unsigned int) spu.init.cmd, /* src */
+              sizeof(struct cell_command), /* bytes */
+              tag,
+              0, /* tid */
+              0  /* rid */);
+      wait_on_mask( 1 << tag );
+
+      /*
+       * NOTE: most commands should be contained in a batch buffer
+       */
+
+      switch (opcode & CELL_CMD_OPCODE_MASK) {
+      case CELL_CMD_EXIT:
+         DEBUG_PRINTF("EXIT\n");
+         exitFlag = 1;
+         break;
+      case CELL_CMD_VS_EXECUTE:
+#if 0
+         spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
+         break;
+      case CELL_CMD_BATCH:
+         cmd_batch(opcode);
+         break;
+      default:
+         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
+      }
+
+   }
+
+   DEBUG_PRINTF("Exit command loop\n");
+
+   spu_dcache_report();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
new file mode 100644
index 00000000000..853e9aa5498
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -0,0 +1,7 @@
+#ifndef SPU_COMMAND_H
+#define SPU_COMMAND_H
+
+extern void
+command_loop(void);
+
+#endif /* SPU_COMMAND_H */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index bc94674fe82..4becd0f92a4 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -35,14 +35,11 @@
 #include "pipe/p_defines.h"
 
 #include "spu_funcs.h"
+#include "spu_command.h"
 #include "spu_main.h"
-#include "spu_render.h"
 #include "spu_per_fragment_op.h"
 #include "spu_texture.h"
-#include "spu_tile.h"
 //#include "spu_test.h"
-#include "spu_vertex_shader.h"
-#include "spu_dcache.h"
 #include "spu_debug.h"
 #include "cell/common.h"
 
@@ -55,8 +52,6 @@ helpful headers:
 
 struct spu_global spu;
 
-struct spu_vs_context draw;
-
 
 #if DEBUG
 boolean Debug = FALSE;
@@ -64,554 +59,6 @@ boolean force_fragment_ops_fallback = TRUE;
 #endif
 
 
-/**
- * Buffers containing dynamically generated SPU code:
- */
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
-    ALIGN16_ATTRIB;
-
-
-
-/**
- * Tell the PPU that this SPU has finished copying a buffer to
- * local store and that it may be reused by the PPU.
- * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_context->buffer_status[]).
- */
-static void
-release_buffer(uint buffer)
-{
-   /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
-   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
-   uint *dst = spu.init.buffer_status + index;
-
-   ASSERT(buffer < CELL_NUM_BUFFERS);
-
-   mfc_put((void *) &status,    /* src in local memory */
-           (unsigned int) dst,  /* dst in main memory */
-           sizeof(status),      /* size */
-           TAG_MISC,            /* tag is unimportant */
-           0, /* tid */
-           0  /* rid */);
-}
-
-
-static void
-cmd_clear_surface(const struct cell_command_clear_surface *clear)
-{
-   DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value);
-
-   if (clear->surface == 0) {
-      spu.fb.color_clear_value = clear->value;
-      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
-         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
-            (spu.init.id << 20) | (spu.init.id << 28);
-         spu.fb.color_clear_value ^= x;
-      }
-   }
-   else {
-      spu.fb.depth_clear_value = clear->value;
-   }
-
-#define CLEAR_OPT 1
-#if CLEAR_OPT
-
-   /* Simply set all tiles' status to CLEAR.
-    * When we actually begin rendering into a tile, we'll initialize it to
-    * the clear value.  If any tiles go untouched during the frame,
-    * really_clear_tiles() will set them to the clear value.
-    */
-   if (clear->surface == 0) {
-      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
-   }
-   else {
-      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
-   }
-
-#else
-
-   /*
-    * This path clears the whole framebuffer to the clear color right now.
-    */
-
-   /*
-   printf("SPU: %s num=%d w=%d h=%d\n",
-          __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
-   */
-
-   /* init a single tile to the clear value */
-   if (clear->surface == 0) {
-      clear_c_tile(&spu.ctile);
-   }
-   else {
-      clear_z_tile(&spu.ztile);
-   }
-
-   /* walk over my tiles, writing the 'clear' tile's data */
-   {
-      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-      uint i;
-      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-         uint tx = i % spu.fb.width_tiles;
-         uint ty = i / spu.fb.width_tiles;
-         if (clear->surface == 0)
-            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
-         else
-            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
-      }
-   }
-
-   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
-      wait_on_mask(1 << TAG_SURFACE_CLEAR);
-   }
-
-#endif /* CLEAR_OPT */
-
-   DEBUG_PRINTF("CLEAR SURF done\n");
-}
-
-
-static void
-cmd_release_verts(const struct cell_command_release_verts *release)
-{
-   DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf);
-   ASSERT(release->vertex_buf != ~0U);
-   release_buffer(release->vertex_buf);
-}
-
-
-/**
- * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
- * This involves installing new fragment ops SPU code.
- * If this function is never called, we'll use a regular C fallback function
- * for fragment processing.
- */
-static void
-cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
-{
-   static int warned = 0;
-
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n");
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info (for fallback case only) */
-   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
-   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
-
-   /* Parity twist!  For now, always use the fallback code by default,
-    * only switching to codegen when specifically requested.  This
-    * allows us to develop freely without risking taking down the
-    * branch.
-    *
-    * Later, the parity of this check will be reversed, so that
-    * codegen is *always* used, unless we specifically indicate that
-    * we don't want it.
-    *
-    * Eventually, the option will be removed completely, because in
-    * final code we'll always use codegen and won't even provide the
-    * raw state records that the fallback code requires.
-    */
-   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
-   }
-   else {
-      /* otherwise, the default fallback code remains in place */
-      if (!warned) {
-         fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
-         warned = 1;
-      }
-   }
-
-   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
-   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
-}
-
-
-static void
-cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
-{
-   DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n");
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_program_code, fp->code,
-          SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4);
-#if 01
-   /* Point function pointer at new code */
-   spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code;
-#endif
-}
-
-
-static void
-cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
-{
-   DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x  zformat 0x%x\n",
-             cmd->width,
-             cmd->height,
-             cmd->color_start,
-             cmd->color_format,
-             cmd->depth_format);
-
-   ASSERT_ALIGN16(cmd->color_start);
-   ASSERT_ALIGN16(cmd->depth_start);
-
-   spu.fb.color_start = cmd->color_start;
-   spu.fb.depth_start = cmd->depth_start;
-   spu.fb.color_format = cmd->color_format;
-   spu.fb.depth_format = cmd->depth_format;
-   spu.fb.width = cmd->width;
-   spu.fb.height = cmd->height;
-   spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE;
-   spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE;
-
-   switch (spu.fb.depth_format) {
-   case PIPE_FORMAT_Z32_UNORM:
-      spu.fb.zsize = 4;
-      spu.fb.zscale = (float) 0xffffffffu;
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
-      spu.fb.zsize = 4;
-      spu.fb.zscale = (float) 0x00ffffffu;
-      break;
-   case PIPE_FORMAT_Z16_UNORM:
-      spu.fb.zsize = 2;
-      spu.fb.zscale = (float) 0xffffu;
-      break;
-   default:
-      spu.fb.zsize = 0;
-      break;
-   }
-}
-
-
-static void
-cmd_state_sampler(const struct cell_command_sampler *sampler)
-{
-   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
-
-   spu.sampler[sampler->unit] = sampler->state;
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
-      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
-   else
-      spu.sample_texture[sampler->unit] = sample_texture_nearest;
-}
-
-
-static void
-cmd_state_texture(const struct cell_command_texture *texture)
-{
-   const uint unit = texture->unit;
-   const uint width = texture->width;
-   const uint height = texture->height;
-
-   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
-             texture->unit, texture->start,
-             texture->width, texture->height);
-
-   spu.texture[unit].start = texture->start;
-   spu.texture[unit].width = width;
-   spu.texture[unit].height = height;
-
-   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
-
-   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
-   spu.texture[unit].tex_size_mask = (vector unsigned int)
-         { width - 1, height - 1, 0, 0 };
-   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
-   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
-}
-
-
-static void
-cmd_state_vertex_info(const struct vertex_info *vinfo)
-{
-   DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs);
-   ASSERT(vinfo->num_attribs >= 1);
-   ASSERT(vinfo->num_attribs <= 8);
-   memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
-}
-
-
-static void
-cmd_state_vs_array_info(const struct cell_array_info *vs_info)
-{
-   const unsigned attr = vs_info->attr;
-
-   ASSERT(attr < PIPE_MAX_ATTRIBS);
-   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
-   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
-   draw.vertex_fetch.size[attr] = vs_info->size;
-   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
-   draw.vertex_fetch.dirty = 1;
-}
-
-
-static void
-cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
-{
-   mfc_get(attribute_fetch_code_buffer,
-           (unsigned int) code->base,  /* src */
-           code->size,
-           TAG_BATCH_BUFFER,
-           0, /* tid */
-           0  /* rid */);
-   wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-   draw.vertex_fetch.code = attribute_fetch_code_buffer;
-}
-
-
-static void
-cmd_finish(void)
-{
-   DEBUG_PRINTF("FINISH\n");
-   really_clear_tiles(0);
-   /* wait for all outstanding DMAs to finish */
-   mfc_write_tag_mask(~0);
-   mfc_read_tag_status_all();
-   /* send mbox message to PPU */
-   spu_write_out_mbox(CELL_CMD_FINISH);
-}
-
-
-/**
- * Execute a batch of commands which was sent to us by the PPU.
- * See the cell_emit_state.c code to see where the commands come from.
- *
- * The opcode param encodes the location of the buffer and its size.
- */
-static void
-cmd_batch(uint opcode)
-{
-   const uint buf = (opcode >> 8) & 0xff;
-   uint size = (opcode >> 16);
-   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
-   const unsigned usize = size / sizeof(buffer[0]);
-   uint pos;
-
-   DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n",
-             buf, size, spu.init.buffers[buf]);
-
-   ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
-
-   ASSERT_ALIGN16(spu.init.buffers[buf]);
-
-   size = ROUNDUP16(size);
-
-   ASSERT_ALIGN16(spu.init.buffers[buf]);
-
-   mfc_get(buffer,  /* dest */
-           (unsigned int) spu.init.buffers[buf],  /* src */
-           size,
-           TAG_BATCH_BUFFER,
-           0, /* tid */
-           0  /* rid */);
-   wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-   /* Tell PPU we're done copying the buffer to local store */
-   DEBUG_PRINTF("release batch buf %u\n", buf);
-   release_buffer(buf);
-
-   /*
-    * Loop over commands in the batch buffer
-    */
-   for (pos = 0; pos < usize; /* no incr */) {
-      switch (buffer[pos]) {
-      /*
-       * rendering commands
-       */
-      case CELL_CMD_CLEAR_SURFACE:
-         {
-            struct cell_command_clear_surface *clr
-               = (struct cell_command_clear_surface *) &buffer[pos];
-            cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 8;
-         }
-         break;
-      case CELL_CMD_RENDER:
-         {
-            struct cell_command_render *render
-               = (struct cell_command_render *) &buffer[pos];
-            uint pos_incr;
-            cmd_render(render, &pos_incr);
-            pos += pos_incr;
-         }
-         break;
-      /*
-       * state-update commands
-       */
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         {
-            struct cell_command_framebuffer *fb
-               = (struct cell_command_framebuffer *) &buffer[pos];
-            cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_FRAGMENT_OPS:
-         {
-            struct cell_command_fragment_ops *fops
-               = (struct cell_command_fragment_ops *) &buffer[pos];
-            cmd_state_fragment_ops(fops);
-            pos += sizeof(*fops) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_FRAGMENT_PROGRAM:
-         {
-            struct cell_command_fragment_program *fp
-               = (struct cell_command_fragment_program *) &buffer[pos];
-            cmd_state_fragment_program(fp);
-            pos += sizeof(*fp) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_SAMPLER:
-         {
-            struct cell_command_sampler *sampler
-               = (struct cell_command_sampler *) &buffer[pos];
-            cmd_state_sampler(sampler);
-            pos += sizeof(*sampler) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_TEXTURE:
-         {
-            struct cell_command_texture *texture
-               = (struct cell_command_texture *) &buffer[pos];
-            cmd_state_texture(texture);
-            pos += sizeof(*texture) / 8;
-         }
-         break;
-      case CELL_CMD_STATE_VERTEX_INFO:
-         cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
-         break;
-      case CELL_CMD_STATE_VIEWPORT:
-         (void) memcpy(& draw.viewport, &buffer[pos+1],
-                       sizeof(struct pipe_viewport_state));
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
-         break;
-      case CELL_CMD_STATE_UNIFORMS:
-         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
-         pos += 2;
-         break;
-      case CELL_CMD_STATE_VS_ARRAY_INFO:
-         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
-         break;
-      case CELL_CMD_STATE_BIND_VS:
-#if 0
-         spu_bind_vertex_shader(&draw,
-                                (struct cell_shader_info *) &buffer[pos+1]);
-#endif
-         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
-         break;
-      case CELL_CMD_STATE_ATTRIB_FETCH:
-         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
-                                &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
-         break;
-      /*
-       * misc commands
-       */
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         pos += 1;
-         break;
-      case CELL_CMD_RELEASE_VERTS:
-         {
-            struct cell_command_release_verts *release
-               = (struct cell_command_release_verts *) &buffer[pos];
-            cmd_release_verts(release);
-            pos += sizeof(*release) / 8;
-         }
-         break;
-      case CELL_CMD_FLUSH_BUFFER_RANGE: {
-	 struct cell_buffer_range *br = (struct cell_buffer_range *)
-	     &buffer[pos+1];
-
-	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
-	 break;
-      }
-      default:
-         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
-         ASSERT(0);
-         break;
-      }
-   }
-
-   DEBUG_PRINTF("BATCH complete\n");
-}
-
-
-/**
- * Temporary/simple main loop for SPEs: Get a command, execute it, repeat.
- */
-static void
-main_loop(void)
-{
-   struct cell_command cmd;
-   int exitFlag = 0;
-
-   DEBUG_PRINTF("Enter main loop\n");
-
-   ASSERT((sizeof(struct cell_command) & 0xf) == 0);
-   ASSERT_ALIGN16(&cmd);
-
-   while (!exitFlag) {
-      unsigned opcode;
-      int tag = 0;
-
-      DEBUG_PRINTF("Wait for cmd...\n");
-
-      /* read/wait from mailbox */
-      opcode = (unsigned int) spu_read_in_mbox();
-
-      DEBUG_PRINTF("got cmd 0x%x\n", opcode);
-
-      /* command payload */
-      mfc_get(&cmd,  /* dest */
-              (unsigned int) spu.init.cmd, /* src */
-              sizeof(struct cell_command), /* bytes */
-              tag,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask( 1 << tag );
-
-      /*
-       * NOTE: most commands should be contained in a batch buffer
-       */
-
-      switch (opcode & CELL_CMD_OPCODE_MASK) {
-      case CELL_CMD_EXIT:
-         DEBUG_PRINTF("EXIT\n");
-         exitFlag = 1;
-         break;
-      case CELL_CMD_VS_EXECUTE:
-#if 0
-         spu_execute_vertex_shader(&draw, &cmd.vs);
-#endif
-         break;
-      case CELL_CMD_BATCH:
-         cmd_batch(opcode);
-         break;
-      default:
-         printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK);
-      }
-
-   }
-
-   DEBUG_PRINTF("Exit main loop\n");
-
-   spu_dcache_report();
-}
-
-
-
 static void
 one_time_init(void)
 {
@@ -658,6 +105,7 @@ main(main_param_t speid, main_param_t argp)
    DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
 
+   /* get initialization data */
    mfc_get(&spu.init,  /* dest */
            (unsigned int) argp, /* src */
            sizeof(struct cell_init_info), /* bytes */
@@ -675,7 +123,7 @@ main(main_param_t speid, main_param_t argp)
       spu_test_misc(spu.init.id);
 #endif
 
-   main_loop();
+   command_loop();
 
    return 0;
 }
-- 
cgit v1.2.3


From 55b65d3b42b8ba1ea1c5b5549b4629f3b20e7a97 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 17:57:01 -0600
Subject: cell: stub-out sin/cos function bodies to avoid trashing caller's
 stack for now

---
 src/gallium/drivers/cell/spu/spu_funcs.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index d1749565187..b57ad3f3b81 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -49,17 +49,27 @@
 static vector float
 spu_cos(vector float x)
 {
+#if 0
    static const float scale = 1.0 / (2.0 * M_PI);
    x = x * spu_splats(scale); /* normalize */
    return _cos8_v(x);
+#else
+   /* XXX temporary stub: return x unchanged; the real body above is disabled to avoid trashing the caller's stack */
+   return x;
+#endif
 }
 
 static vector float
 spu_sin(vector float x)
 {
+#if 0
    static const float scale = 1.0 / (2.0 * M_PI);
    x = x * spu_splats(scale); /* normalize */
    return _sin8_v(x);   /* 8-bit accuracy enough?? */
+#else
+   /* XXX temporary stub: return x unchanged; the real body above is disabled to avoid trashing the caller's stack */
+   return x;
+#endif
 }
 
 
-- 
cgit v1.2.3


From fe1c9872ae258b78f195c1885ddfc29d07d17cf6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 17:59:19 -0600
Subject: cell: checkpoint: more work in emit_function_call()

Simple function call works now, but we don't save/restore the caller's registers yet.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 45 ++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/cell')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index fd12af19cef..8d2d4f2a0f2 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1105,7 +1105,10 @@ emit_function_call(struct codegen *gen,
    uint addr;
    int ch;
 
-   assert(num_args <= 2);
+   /* XXX temporary value */
+   const int frameSize = 64; /* stack frame (activation record) size */
+
+   assert(num_args <= 3);
 
    /* lookup function address */
    {
@@ -1119,7 +1122,9 @@ emit_function_call(struct codegen *gen,
       assert(addr && "spu function not found");
    }
 
-   sprintf(comment, "CALL %s:", funcname);
+   addr /= 4; /* discard 2 least significant bits */
+
+   snprintf(comment, sizeof(comment), "CALL %s:", funcname);
    spe_comment(gen->f, -4, comment);
 
    for (ch = 0; ch < 4; ch++) {
@@ -1131,12 +1136,40 @@ emit_function_call(struct codegen *gen,
             s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
          }
 
-         /* XXX not done */
-         (void) s_regs;
-         (void) d_reg;
+         /* Basically:
+          * save registers on stack
+          * move parameters to registers 3, 4, 5...
+          * call function
+          * save return value (reg 3)
+          * restore registers from stack
+          */
+
+         /* XXX hack: load first function param */
+         spe_move(gen->f, 3, s_regs[0]);
+
+         /* save $lr on stack     # stqd $lr,16($sp) */
+         spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+         /* save stack pointer    # stqd $sp,-frameSize($sp) */
+         spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
+
+         /* XXX save registers to stack here */
+
+         /* adjust stack pointer  # ai $sp,$sp,-frameSize */
+         spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
+
+         /* branch to function, save return addr */
+         spe_brasl(gen->f, SPE_REG_RA, addr);
+
+         /* restore stack pointer # ai $sp,$sp,frameSize */
+         spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, frameSize);
+
+         /* XXX restore registers from stack here */
 
-         spe_bisl(gen->f, SPE_REG_RA, addr, 0, 0); /* XXX untested! */
+         /* restore $lr           # lqd $lr,16($sp) */
+         spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
 
+         /* XXX hack: save function's return value */
+         spe_move(gen->f, d_reg, 3);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
-- 
cgit v1.2.3