aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorBrian <[email protected]>2008-02-01 13:45:58 -0700
committerBrian <[email protected]>2008-02-01 14:04:55 -0700
commit59be082909de6021ec7d08476253bd4c9920e137 (patch)
tree0ae745d06ba803b94fba465d7ce11b3d7542274f /src
parentb108bea6b44c1abc6d61e3e47096e5122de89cd1 (diff)
Cell: implement Z16 and Z32 testing with SIMD instructions.
Diffstat (limited to 'src')
-rw-r--r--src/mesa/pipe/cell/spu/spu_tile.h3
-rw-r--r--src/mesa/pipe/cell/spu/spu_tri.c222
-rw-r--r--src/mesa/pipe/cell/spu/spu_ztest.h135
3 files changed, 163 insertions, 197 deletions
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 1f123a2b7b4..4b1ef2a4c8d 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -42,7 +42,8 @@
typedef union {
ushort t16[TILE_SIZE][TILE_SIZE];
uint t32[TILE_SIZE][TILE_SIZE];
- float4 f4[TILE_SIZE/2][TILE_SIZE/2];
+ vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+ vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
} tile_t;
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index a32878d9178..a26a4f098da 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -39,18 +39,11 @@
#include "spu_tile.h"
#include "spu_tri.h"
+#include "spu_ztest.h"
-/*
- * If SIMD_Z=1 the Z buffer is floating point and we use vector instructions
- * to do Z testing/updating.
- */
-#define SIMD_Z 0
-#if SIMD_Z
+/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
typedef vector unsigned int mask_t;
-#else
-typedef uint mask_t;
-#endif
/**
@@ -282,20 +275,11 @@ pack_colors(uint uicolors[4], const float4 fcolors[4])
}
-
-static unsigned int
-do_depth_test(int x, int y, unsigned int mask)
+static INLINE mask_t
+do_depth_test(int x, int y, mask_t quadmask)
{
- static const float4 zscale16
- = {.f={65535.0, 65535.0, 65535.0, 65535.0}};
- static const float4 zscale32
- = {.f={(float)0xffffffff,
- (float)0xffffffff,
- (float)0xffffffff,
- (float)0xffffffff}};
- int ix = x - setup.cliprect_minx;
- int iy = y - setup.cliprect_miny;
float4 zvals;
+ mask_t mask;
zvals.v = eval_z((float) x, (float) y);
@@ -305,129 +289,20 @@ do_depth_test(int x, int y, unsigned int mask)
cur_tile_status_z = TILE_STATUS_DIRTY;
}
-#if 0
- if (cur_tile_status_z == TILE_STATUS_CLEAR) {
- /* now, _really_ clear the tile */
- clear_z_tile(&ztile);
- }
- else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
- /* make sure we've got the tile from main mem */
- wait_on_mask(1 << TAG_READ_TILE_Z);
- }
- cur_tile_status_z = TILE_STATUS_DIRTY;
-#endif
-
if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
- zvals.v = spu_mul(zvals.v, zscale16.v);
- if (mask & MASK_TOP_LEFT) {
- uint z = (uint) zvals.f[0];
- if (z < ztile.t16[iy][ix])
- ztile.t16[iy][ix] = z;
- else
- mask &= ~MASK_TOP_LEFT;
- }
-
- if (mask & MASK_TOP_RIGHT) {
- uint z = (uint) zvals.f[1];
- if (z < ztile.t16[iy][ix+1])
- ztile.t16[iy][ix+1] = z;
- else
- mask &= ~MASK_TOP_RIGHT;
- }
-
- if (mask & MASK_BOTTOM_LEFT) {
- uint z = (uint) zvals.f[2];
- if (z < ztile.t16[iy+1][ix])
- ztile.t16[iy+1][ix] = z;
- else
- mask &= ~MASK_BOTTOM_LEFT;
- }
-
- if (mask & MASK_BOTTOM_RIGHT) {
- uint z = (uint) zvals.f[3];
- if (z < ztile.t16[iy+1][ix+1])
- ztile.t16[iy+1][ix+1] = z;
- else
- mask &= ~MASK_BOTTOM_RIGHT;
- }
+ int ix = (x - setup.cliprect_minx) / 4;
+ int iy = (y - setup.cliprect_miny) / 2;
+ mask = spu_z16_test_less(zvals.v, &ztile.us8[iy][ix], x>>1, quadmask);
}
else {
- zvals.v = spu_mul(zvals.v, zscale32.v);
- ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
- if (mask & MASK_TOP_LEFT) {
- uint z = (uint) zvals.f[0];
- if (z < ztile.t32[iy][ix])
- ztile.t32[iy][ix] = z;
- else
- mask &= ~MASK_TOP_LEFT;
- }
-
- if (mask & MASK_TOP_RIGHT) {
- uint z = (uint) zvals.f[1];
- if (z < ztile.t32[iy][ix+1])
- ztile.t32[iy][ix+1] = z;
- else
- mask &= ~MASK_TOP_RIGHT;
- }
-
- if (mask & MASK_BOTTOM_LEFT) {
- uint z = (uint) zvals.f[2];
- if (z < ztile.t32[iy+1][ix])
- ztile.t32[iy+1][ix] = z;
- else
- mask &= ~MASK_BOTTOM_LEFT;
- }
-
- if (mask & MASK_BOTTOM_RIGHT) {
- uint z = (uint) zvals.f[3];
- if (z < ztile.t32[iy+1][ix+1])
- ztile.t32[iy+1][ix+1] = z;
- else
- mask &= ~MASK_BOTTOM_RIGHT;
- }
+ int ix = (x - setup.cliprect_minx) / 2;
+ int iy = (y - setup.cliprect_miny) / 2;
+ mask = spu_z32_test_less(zvals.v, &ztile.ui4[iy][ix], quadmask);
}
-
- if (mask)
- cur_tile_status_z = TILE_STATUS_DIRTY;
-
return mask;
}
-
-
-static vector unsigned int
-do_depth_test_simd(int x, int y, vector unsigned int quadmask)
-{
- int ix = (x - setup.cliprect_minx) / 2;
- int iy = (y - setup.cliprect_miny) / 2;
- float4 zvals;
-
- vector unsigned int zmask;
-
- zvals.v = eval_z((float) x, (float) y);
-
- if (cur_tile_status_z == TILE_STATUS_CLEAR) {
- /* now, _really_ clear the tile */
- clear_z_tile(&ztile);
- }
- else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
- /* make sure we've got the tile from main mem */
- wait_on_mask(1 << TAG_READ_TILE_Z);
- }
- cur_tile_status_z = TILE_STATUS_DIRTY;
-
- /* XXX fetch Z value sooner to hide latency here */
- zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
- zmask = spu_and(zmask, quadmask);
-
- ztile.f4[ix][iy].v = spu_sel(ztile.f4[ix][iy].v, zvals.v, zmask);
- //ztile.f4[ix][iy].v = spu_sel(zvals.v, ztile.f4[ix][iy].v, mask4);
-
- return zmask;
-}
-
-
/**
* Emit a quad (pass to next stage). No clipping is done.
*/
@@ -461,36 +336,18 @@ emit_quad( int x, int y, mask_t mask )
}
if (spu.depth_stencil.depth.enabled) {
-#if SIMD_Z
- mask = do_depth_test_simd(x, y, mask);
-#else
mask = do_depth_test(x, y, mask);
-#endif
}
-#if !SIMD_Z
- if (mask)
-#endif
- {
- if (cur_tile_status_c == TILE_STATUS_CLEAR) {
- /* now, _really_ clear the tile */
- clear_c_tile(&ctile);
- }
+ /* If any bits in mask are set... */
+ if (spu_extract(spu_orx(mask), 0)) {
-#if 0
if (cur_tile_status_c == TILE_STATUS_CLEAR) {
/* now, _really_ clear the tile */
clear_c_tile(&ctile);
- cur_tile_status_c = TILE_STATUS_DIRTY;
}
- else if (cur_tile_status_c != TILE_STATUS_DIRTY) {
- /* make sure we've got the tile from main mem */
- wait_on_mask(1 << TAG_READ_TILE_COLOR);
- }
-#endif
cur_tile_status_c = TILE_STATUS_DIRTY;
-#if SIMD_Z
if (spu_extract(mask, 0))
ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
if (spu_extract(mask, 1))
@@ -499,20 +356,11 @@ emit_quad( int x, int y, mask_t mask )
ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
if (spu_extract(mask, 3))
ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-#elif 0
+
+#if 0
/* SIMD_Z with swizzled color buffer (someday) */
vector float icolors = *((vector float *) &colors);
ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
-
-#else
- if (mask & MASK_TOP_LEFT)
- ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
- if (mask & MASK_TOP_RIGHT)
- ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
- if (mask & MASK_BOTTOM_LEFT)
- ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
- if (mask & MASK_BOTTOM_RIGHT)
- ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
#endif
}
@@ -533,38 +381,20 @@ static INLINE int block( int x )
/**
* Compute mask which indicates which pixels in the 2x2 quad are actually inside
* the triangle's bounds.
- *
- * this is pretty nasty... may need to rework flush_spans again to
- * fix it, if possible.
+ * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
*/
-static mask_t calculate_mask( int x )
+static INLINE mask_t calculate_mask( int x )
{
-#if SIMD_Z
- uint m0, m1, m2, m3;
-
- m0 = (x >= setup.span.left[0] && x < setup.span.right[0]) * ~0;
- m1 = (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) * ~0;
- m2 = (x >= setup.span.left[1] && x < setup.span.right[1]) * ~0;
- m3 = (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) * ~0;
-
- return (vector unsigned int) {m0, m1, m2, m3};
-#else
- unsigned mask = 0x0;
-
- if (x >= setup.span.left[0] && x < setup.span.right[0])
- mask |= MASK_TOP_LEFT;
-
- if (x >= setup.span.left[1] && x < setup.span.right[1])
- mask |= MASK_BOTTOM_LEFT;
-
- if (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0])
- mask |= MASK_TOP_RIGHT;
-
- if (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1])
- mask |= MASK_BOTTOM_RIGHT;
-
+ /* This is a little tricky.
+ * Use & instead of && to avoid branches.
+ * Use negation to convert true/false to ~0/0 values.
+ */
+ mask_t mask;
+ mask = spu_insert(-((x >= setup.span.left[0]) & (x < setup.span.right[0])), mask, 0);
+ mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
+ mask = spu_insert(-((x >= setup.span.left[1]) & (x < setup.span.right[1])), mask, 2);
+ mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
return mask;
-#endif
}
diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h
new file mode 100644
index 00000000000..5fefb151765
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_ztest.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * Zbuffer/depth test code.
+ */
+
+
+#ifndef SPU_ZTEST_H
+#define SPU_ZTEST_H
+
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+
+
+/**
+ * Perform a "less than" Z test for a 16-bit/value Z buffer, updating it where fragments pass.
+ *
+ * \param zvals vector of four fragment zvalues as floats
+ * \param zbuf ptr to vector of ushort[8] zbuffer values. Note that this
+ * contains the Z values for 2 quads, 8 pixels; only one quad's values are changed.
+ * \param x quad selector (only lsbit is significant): 0 = ushorts 0..3, 1 = ushorts 4..7
+ * \param inMask indicates which fragments in the quad are alive
+ * \return new mask (inMask AND fragment Z < stored Z) of fragments alive after ztest
+ */
+static INLINE vector unsigned int
+spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
+ uint x, vector unsigned int inMask)
+{
+#define ZERO 0x80
+ vector unsigned int zvals_ui4, zbuf_ui4, mask;
+
+ /* convert floats to uints in [0, 65535] */
+ zvals_ui4 = spu_convtu(zvals, 32); /* scale by 2^32, convert to uint */
+ zvals_ui4 = spu_rlmask(zvals_ui4, -16); /* right shift 16, keeping the top 16 bits */
+
+ /* XXX this conditional could be removed with a bit of work */
+ if (x & 1) {
+ /* convert zbuffer values from ushorts to uints */
+ /* gather ushorts 4..7 (bytes 8-15); ZERO control bytes are meant to zero-fill the high halves */
+ zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+ (vector unsigned int) *zbuf,
+ VEC_LITERAL(vector unsigned char,
+ ZERO, ZERO, 8, 9, ZERO, ZERO, 10, 11,
+ ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15));
+ /* mask = (zvals_ui4 < zbuf_ui4) ? ~0 : 0 (cmpgt tests zbuf > zval) */
+ mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+ /* mask &= inMask */
+ mask = spu_and(mask, inMask);
+ /* zbuf = mask ? zval : zbuf */
+ zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+ /* pack uints back to ushorts 4..7; ushorts 0..3 (bytes 0-7 of *zbuf) are preserved */
+ *zbuf = (vector unsigned short)
+ spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+ VEC_LITERAL(vector unsigned char,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 2, 3, 6, 7, 10, 11, 14, 15));
+ }
+ else {
+ /* convert zbuffer values from ushorts to uints */
+ /* gather ushorts 0..3 (bytes 0-7); ZERO control bytes are meant to zero-fill the high halves */
+ zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+ (vector unsigned int) *zbuf,
+ VEC_LITERAL(vector unsigned char,
+ ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+ ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7));
+ /* mask = (zvals_ui4 < zbuf_ui4) ? ~0 : 0 (cmpgt tests zbuf > zval) */
+ mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+ /* mask &= inMask */
+ mask = spu_and(mask, inMask);
+ /* zbuf = mask ? zval : zbuf */
+ zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+ /* pack uints back to ushorts 0..3; ushorts 4..7 (bytes 8-15 of *zbuf) are preserved */
+ *zbuf = (vector unsigned short)
+ spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+ VEC_LITERAL(vector unsigned char,
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 24, 25, 26, 27, 28, 29, 30, 31));
+ }
+ return mask;
+#undef ZERO
+}
+
+
+/**
+ * Same "less than" Z test as spu_z16_test_less(), but *zbuf_ptr holds four 32-bit uint Z values
+ */
+static INLINE vector unsigned int
+spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
+ vector unsigned int inMask)
+{
+ vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
+
+ /* convert floats to uints in [0, 0xffffffff] */
+ zvals_ui4 = spu_convtu(zvals, 32);
+ /* mask = (zvals_ui4 < zbuf) ? ~0 : 0 (cmpgt tests zbuf > zval) */
+ mask = spu_cmpgt(zbuf, zvals_ui4);
+ /* mask &= inMask */
+ mask = spu_and(mask, inMask);
+ /* zbuf = mask ? zval : zbuf; written back so passing fragments update the Z buffer */
+ *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
+
+ return mask;
+}
+
+
+#endif /* SPU_ZTEST_H */