aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers
diff options
context:
space:
mode:
authorBen Skeggs <[email protected]>2008-11-10 15:53:51 +1100
committerBen Skeggs <[email protected]>2008-11-10 15:53:51 +1100
commit32e6be6362e44609d36c2fb20a4c858f57c908fb (patch)
tree4ed99e93ef5f4a8bb51653917c911e04e42f5235 /src/gallium/drivers
parent92674bc8889e10e580c630cf85c106fa6eb34d7b (diff)
parent399da3a337932c6074a69ac73e711138271308eb (diff)
Merge remote branch 'origin/gallium-0.2' into gallium-0.2
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--src/gallium/drivers/cell/common.h2
-rw-r--r--src/gallium/drivers/cell/ppu/cell_batch.c19
-rw-r--r--src/gallium/drivers/cell/ppu/cell_clear.c13
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.h2
-rw-r--r--src/gallium/drivers/cell/ppu/cell_fence.c14
-rw-r--r--src/gallium/drivers/cell/ppu/cell_flush.c2
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fp.c103
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fragment.c332
-rw-r--r--src/gallium/drivers/cell/ppu/cell_spu.h1
-rw-r--r--src/gallium/drivers/cell/ppu/cell_texture.c172
-rw-r--r--src/gallium/drivers/cell/ppu/cell_texture.h1
-rw-r--r--src/gallium/drivers/cell/spu/spu_command.c5
-rw-r--r--src/gallium/drivers/cell/spu/spu_funcs.c28
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.h3
-rw-r--r--src/gallium/drivers/cell/spu/spu_render.c4
-rw-r--r--src/gallium/drivers/cell/spu/spu_tri.c2
-rw-r--r--src/gallium/drivers/softpipe/sp_fs_sse.c5
-rw-r--r--src/gallium/drivers/softpipe/sp_quad_output.c8
18 files changed, 497 insertions, 219 deletions
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 23fb0b0831d..87488ea2d70 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -122,7 +122,7 @@
#define CELL_DEBUG_CACHE (1 << 6)
/** Max instructions for doing per-fragment operations */
-#define SPU_MAX_FRAGMENT_OPS_INSTS 64
+#define SPU_MAX_FRAGMENT_OPS_INSTS 128
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 448b723d85a..962775cd335 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -100,12 +100,23 @@ emit_fence(struct cell_context *cell)
const uint batch = cell->cur_batch;
const uint size = cell->buffer_size[batch];
struct cell_command_fence *fence_cmd;
+ struct cell_fence *fence = &cell->fenced_buffers[batch].fence;
+ uint i;
+
+ /* set fence status to emitted, not yet signalled */
+ for (i = 0; i < cell->num_spus; i++) {
+ fence->status[i][0] = CELL_FENCE_EMITTED;
+ }
ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
fence_cmd->opcode = CELL_CMD_FENCE;
- fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+ fence_cmd->fence = fence;
+
+ /* update batch buffer size */
+ cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
+ assert(sizeof(struct cell_command_fence) % 8 == 0);
}
@@ -119,7 +130,7 @@ cell_batch_flush(struct cell_context *cell)
{
static boolean flushing = FALSE;
uint batch = cell->cur_batch;
- const uint size = cell->buffer_size[batch];
+ uint size = cell->buffer_size[batch];
uint spu, cmd_word;
assert(!flushing);
@@ -130,8 +141,10 @@ cell_batch_flush(struct cell_context *cell)
/* Before we use this batch buffer, make sure any fenced texture buffers
* are released.
*/
- if (cell->fenced_buffers[batch].head)
+ if (cell->fenced_buffers[batch].head) {
emit_fence(cell);
+ size = cell->buffer_size[batch];
+ }
flushing = TRUE;
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index c9c0c721bbe..037635e4660 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -106,4 +106,17 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
clr->surface = surfIndex;
clr->value = clearValue;
}
+
+ /* Technically, the surface's contents are now known and cleared,
+ * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But
+ * it turns out it's quite painful to recognize when any particular
+ * surface goes from PIPE_SURFACE_STATUS_CLEAR to
+ * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because
+ * the drawing commands could be operating on numerous draw buffers,
+ * which we'd have to iterate through to set all their stati...
+ * For now, we cheat a bit and set the surface's status to DEFINED
+ * right here. Later we should revisit this and set the status to
+ * CLEAR here, and find a better place to set the status to DEFINED.
+ */
+ ps->status = PIPE_SURFACE_STATUS_DEFINED;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 4491ae8cdf1..eb1397bb3fa 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -89,7 +89,7 @@ struct cell_buffer_node;
*/
struct cell_buffer_list
{
- struct cell_fence fence;
+ struct cell_fence fence ALIGN16_ATTRIB;
struct cell_buffer_node *head;
};
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
index ffb3bea12b9..867b5dcaa09 100644
--- a/src/gallium/drivers/cell/ppu/cell_fence.c
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -38,6 +38,7 @@ void
cell_fence_init(struct cell_fence *fence)
{
uint i;
+ ASSERT_ALIGN16(fence->status);
for (i = 0; i < CELL_MAX_SPUS; i++) {
fence->status[i][0] = CELL_FENCE_IDLE;
}
@@ -50,9 +51,9 @@ cell_fence_signalled(const struct cell_context *cell,
{
uint i;
for (i = 0; i < cell->num_spus; i++) {
- //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
- if (fence->status[i][0] == CELL_FENCE_EMITTED)
+ if (fence->status[i][0] != CELL_FENCE_SIGNALLED)
return FALSE;
+ /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/
}
return TRUE;
}
@@ -65,6 +66,15 @@ cell_fence_finish(const struct cell_context *cell,
while (!cell_fence_signalled(cell, fence)) {
usleep(10);
}
+
+#ifdef DEBUG
+ {
+ uint i;
+ for (i = 0; i < cell->num_spus; i++) {
+ assert(fence->status[i][0] == CELL_FENCE_SIGNALLED);
+ }
+ }
+#endif
}
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index 6596b720101..a64967b4b9e 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -49,7 +49,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags,
flags |= CELL_FLUSH_WAIT;
}
- if (flags & PIPE_FLUSH_SWAPBUFFERS)
+ if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE))
flags |= CELL_FLUSH_WAIT;
draw_flush( cell->draw );
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index d4d644d6e81..5c41b264ac8 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1303,60 +1303,91 @@ lookup_function(struct cell_context *cell, const char *funcname)
/**
* Emit code to call a SPU function.
* Used to implement instructions like SIN/COS/POW/TEX/etc.
+ * If scalar, only the X components of the src regs are used, and the
+ * result is replicated across the dest register's XYZW components.
*/
static boolean
emit_function_call(struct codegen *gen,
const struct tgsi_full_instruction *inst,
- char *funcname, uint num_args)
+ char *funcname, uint num_args, boolean scalar)
{
const uint addr = lookup_function(gen->cell, funcname);
char comment[100];
- int ch;
+ int s_regs[3];
+ int func_called = FALSE;
+ uint a, ch;
+ int retval_reg = -1;
assert(num_args <= 3);
snprintf(comment, sizeof(comment), "CALL %s:", funcname);
spe_comment(gen->f, -4, comment);
+ if (scalar) {
+ for (a = 0; a < num_args; a++) {
+ s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]);
+ }
+ /* we'll call the function, put the return value in this register,
+ * then replicate it across all write-enabled components in d_reg.
+ */
+ retval_reg = spe_allocate_available_register(gen->f);
+ }
+
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s_regs[3], d_reg;
+ int d_reg;
ubyte usedRegs[SPE_NUM_REGS];
- uint a, i, numUsed;
+ uint i, numUsed;
- for (a = 0; a < num_args; a++) {
- s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+ if (!scalar) {
+ for (a = 0; a < num_args; a++) {
+ s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+ }
}
- d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- numUsed = spe_get_registers_used(gen->f, usedRegs);
- assert(numUsed < gen->frame_size / 16 - 2);
+ d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* save registers to stack */
- for (i = 0; i < numUsed; i++) {
- uint reg = usedRegs[i];
- int offset = 2 + i;
- spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
- }
+ if (!scalar || !func_called) {
+ /* for a scalar function, we'll really only call the function once */
- /* setup function arguments */
- for (a = 0; a < num_args; a++) {
- spe_move(gen->f, 3 + a, s_regs[a]);
- }
+ numUsed = spe_get_registers_used(gen->f, usedRegs);
+ assert(numUsed < gen->frame_size / 16 - 2);
- /* branch to function, save return addr */
- spe_brasl(gen->f, SPE_REG_RA, addr);
+ /* save registers to stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ int offset = 2 + i;
+ spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
- /* save function's return value */
- spe_move(gen->f, d_reg, 3);
+ /* setup function arguments */
+ for (a = 0; a < num_args; a++) {
+ spe_move(gen->f, 3 + a, s_regs[a]);
+ }
- /* restore registers from stack */
- for (i = 0; i < numUsed; i++) {
- uint reg = usedRegs[i];
- if (reg != d_reg) {
- int offset = 2 + i;
- spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ /* branch to function, save return addr */
+ spe_brasl(gen->f, SPE_REG_RA, addr);
+
+ /* save function's return value */
+ if (scalar)
+ spe_move(gen->f, retval_reg, 3);
+ else
+ spe_move(gen->f, d_reg, 3);
+
+ /* restore registers from stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ if (reg != d_reg && reg != retval_reg) {
+ int offset = 2 + i;
+ spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
}
+
+ func_called = TRUE;
+ }
+
+ if (scalar) {
+ spe_move(gen->f, d_reg, retval_reg);
}
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
@@ -1364,6 +1395,10 @@ emit_function_call(struct codegen *gen,
}
}
+ if (scalar) {
+ spe_release_register(gen->f, retval_reg);
+ }
+
return true;
}
@@ -1770,15 +1805,15 @@ emit_instruction(struct codegen *gen,
return emit_END(gen);
case TGSI_OPCODE_COS:
- return emit_function_call(gen, inst, "spu_cos", 1);
+ return emit_function_call(gen, inst, "spu_cos", 1, TRUE);
case TGSI_OPCODE_SIN:
- return emit_function_call(gen, inst, "spu_sin", 1);
+ return emit_function_call(gen, inst, "spu_sin", 1, TRUE);
case TGSI_OPCODE_POW:
- return emit_function_call(gen, inst, "spu_pow", 2);
+ return emit_function_call(gen, inst, "spu_pow", 2, TRUE);
case TGSI_OPCODE_EXPBASE2:
- return emit_function_call(gen, inst, "spu_exp2", 1);
+ return emit_function_call(gen, inst, "spu_exp2", 1, TRUE);
case TGSI_OPCODE_LOGBASE2:
- return emit_function_call(gen, inst, "spu_log2", 1);
+ return emit_function_call(gen, inst, "spu_log2", 1, TRUE);
case TGSI_OPCODE_TEX:
/* fall-through for now */
case TGSI_OPCODE_TXD:
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 4e1e53ecdc7..d9c3ff3f4d0 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1141,13 +1141,17 @@ gen_colormask(struct spe_function *f,
* access to the Compare Immediate instructions where we don't in
* gen_depth_test(), which is what makes us very different.
*
+ * There's some added complexity if there's a non-trivial state->mask
+ * value; then stencil and reference both must be masked
+ *
* The return value in the stencil_pass_reg is a bitmask of valid
* fragments that also passed the stencil test. The bitmask of valid
- * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg).
*/
static void
gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
- unsigned int mask_reg, unsigned int fbS_reg,
+ unsigned int stencil_max_value,
+ unsigned int fragment_mask_reg, unsigned int fbS_reg,
unsigned int stencil_pass_reg)
{
/* Generate code that puts the set of passing fragments into the stencil_pass_reg
@@ -1155,68 +1159,134 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
*/
switch (state->func) {
case PIPE_FUNC_EQUAL:
- /* stencil_pass = mask & (s == reference) */
- spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
case PIPE_FUNC_NOTEQUAL:
- /* stencil_pass = mask & ~(s == reference) */
- spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & ~(s == reference) */
+ spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
case PIPE_FUNC_GREATER:
- /* stencil_pass = mask & (s > reference) */
- spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s > reference) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((s&mask) > (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- case PIPE_FUNC_LESS: {
- /* stencil_pass = mask & (reference > s) */
- /* There's no convenient Compare Less Than Immediate instruction, so
- * we'll have to do this one the harder way, by loading a register and
- * comparing directly. Compare Logical Greater Than Word (clgt)
- * treats its operands as unsigned - no sign extension.
- */
- unsigned int tmp_reg = spe_allocate_available_register(f);
- spe_load_uint(f, tmp_reg, state->ref_value);
- spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
- spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
- spe_release_register(f, tmp_reg);
+ case PIPE_FUNC_LESS:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (reference > s) */
+ /* There's no convenient Compare Less Than Immediate instruction, so
+ * we'll have to do this one the harder way, by loading a register and
+ * comparing directly. Compare Logical Greater Than Word (clgt)
+ * treats its operands as unsigned - no sign extension.
+ */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+ spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- }
case PIPE_FUNC_LEQUAL:
- /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
- spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
- spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s <= reference)
+ * = fragment_mask & ~(s > reference) */
+ spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- case PIPE_FUNC_GEQUAL: {
- /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
- /* As above, we have to do this by loading a register */
- unsigned int tmp_reg = spe_allocate_available_register(f);
- spe_load_uint(f, tmp_reg, state->ref_value);
- spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
- spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
- spe_release_register(f, tmp_reg);
+ case PIPE_FUNC_GEQUAL:
+ if (state->value_mask == stencil_max_value) {
+ /* stencil_pass = fragment_mask & (s >= reference) ]
+ * = fragment_mask & ~(reference > s) */
+ /* As above, we have to do this by loading a register */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ }
+ else {
+ /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
+ unsigned int tmp_reg = spe_allocate_available_register(f);
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+ spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask);
+ spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+ spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+ spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+ spe_release_register(f, tmp_reg);
+ spe_release_register(f, tmp_masked_stencil);
+ }
break;
- }
case PIPE_FUNC_NEVER:
- /* stencil_pass = mask & 0 = 0 */
+ /* stencil_pass = fragment_mask & 0 = 0 */
spe_load_uint(f, stencil_pass_reg, 0);
break;
case PIPE_FUNC_ALWAYS:
- /* stencil_pass = mask & 1 = mask */
- spe_move(f, stencil_pass_reg, mask_reg);
+ /* stencil_pass = fragment_mask & 1 = fragment_mask */
+ spe_move(f, stencil_pass_reg, fragment_mask_reg);
break;
}
/* The fragments that passed the stencil test are now in stencil_pass_reg.
- * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+ * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
*/
}
@@ -1282,7 +1352,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
/* Add Word Immediate computes rT = rA + 10-bit signed immediate */
spe_ai(f, newS_reg, fbS_reg, 1);
/* Select from the current value or the new value based on the equality test */
- spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+ spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
spe_release_register(f, equals_reg);
break;
@@ -1295,7 +1365,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
/* Add Word Immediate with a (-1) value works */
spe_ai(f, newS_reg, fbS_reg, -1);
/* Select from the current value or the new value based on the equality test */
- spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+ spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
spe_release_register(f, equals_reg);
break;
@@ -1534,15 +1604,28 @@ gen_stencil_depth_test(struct spe_function *f,
* meaning that we have to calculate the stencil values but do not
* need to mask them), we can avoid generating code. Don't forget
* that we need to consider backfacing stencil, if enabled.
+ *
+ * Note that if the backface stencil is *not* enabled, the backface
+ * stencil will have the same values as the frontface stencil.
*/
- if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
- /* Trivial: don't need to calculate stencil values, and don't need to
- * write them back to the framebuffer.
+ if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
+ dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+ /* No changes to any stencil values */
+ need_to_calculate_stencil_values = false;
+ need_to_writemask_stencil_values = false;
+ }
+ else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+ /* All changes are writemasked out, so no need to calculate
+ * what those changes might be, and no need to write anything back.
*/
need_to_calculate_stencil_values = false;
need_to_writemask_stencil_values = false;
}
- else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+ else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
/* Still trivial, but a little less so. We need to write the stencil
* values, but we don't need to mask them.
*/
@@ -1583,7 +1666,7 @@ gen_stencil_depth_test(struct spe_function *f,
*/
spe_comment(f, 0, "Running basic stencil test");
stencil_pass_reg = spe_allocate_available_register(f);
- gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+ gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg);
/* If two-sided stenciling is on, generate code to run the stencil
* test on the backfacing stencil as well, and combine the two results
@@ -1592,7 +1675,7 @@ gen_stencil_depth_test(struct spe_function *f,
if (dsa->stencil[1].enabled) {
unsigned int temp_reg = spe_allocate_available_register(f);
spe_comment(f, 0, "Running backface stencil test");
- gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+ gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg);
spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
spe_release_register(f, temp_reg);
}
@@ -1914,81 +1997,79 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
* Z and/or stencil. We'll also convert the incoming fragment Z
* value in fragZ_reg from a floating point value in [0.0..1.0] to
* an unsigned integer value with the appropriate resolution.
+ * Note that even if depth or stencil is *not* enabled, if it's
+ * present in the buffer, we pull it out and put it back later;
+ * otherwise, we can inadvertently destroy the contents of
+ * buffers we're not supposed to touch (e.g., if the user is
+ * clearing the depth buffer but not the stencil buffer, a
+ * quad of constant depth is drawn over the surface; the stencil
+ * buffer must be maintained).
*/
switch(zs_format) {
case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
case PIPE_FORMAT_X8Z24_UNORM:
- if (dsa->depth.enabled) {
- /* We need the Z part at least */
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* four 24-bit Z values in the low-order bits */
- spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 24-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
- }
- if (dsa->stencil[0].enabled) {
- setup_optional_register(f, &fbS_reg_set, &fbS_reg);
- /* four 8-bit Z values in the high-order bits */
- spe_rotmi(f, fbS_reg, fbZS_reg, -24);
- }
- break;
+ /* Pull out both Z and stencil */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+
+ /* four 24-bit Z values in the low-order bits */
+ spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+ /* four 8-bit stencil values in the high-order bits */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+ break;
case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
case PIPE_FORMAT_Z24X8_UNORM:
- if (dsa->depth.enabled) {
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* shift by 8 to get the upper 24-bit values */
- spe_rotmi(f, fbS_reg, fbZS_reg, -8);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 24-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
- }
- if (dsa->stencil[0].enabled) {
- setup_optional_register(f, &fbS_reg_set, &fbS_reg);
- /* 8-bit stencil in the low-order bits - mask them out */
- spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
- }
- break;
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+
+ /* shift by 8 to get the upper 24-bit values */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+ /* 8-bit stencil in the low-order bits - mask them out */
+ spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+ break;
case PIPE_FORMAT_Z32_UNORM:
- if (dsa->depth.enabled) {
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* Copy over 4 32-bit values */
- spe_move(f, fbZ_reg, fbZS_reg);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 32-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- }
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 32-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
/* No stencil, so can't do anything there */
- break;
+ break;
case PIPE_FORMAT_Z16_UNORM:
- if (dsa->depth.enabled) {
- /* XXX Not sure this is correct, but it was here before, so we're
- * going with it for now
- */
- setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
- /* Copy over 4 32-bit values */
- spe_move(f, fbZ_reg, fbZS_reg);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 16-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
- }
+ /* XXX Not sure this is correct, but it was here before, so we're
+ * going with it for now
+ */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 16-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
/* No stencil */
- break;
default:
ASSERT(0); /* invalid format */
@@ -2035,39 +2116,19 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_comment(f, 0, "Store quad's depth/stencil values in tile");
if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- if (fbS_reg_set && fbZ_reg_set) {
- spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
- spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
- }
- else if (fbS_reg_set) {
- spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
- }
- else {
- spe_move(f, fbZS_reg, fbZ_reg);
- }
+ spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
}
else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- if (fbS_reg_set && fbZ_reg_set) {
- spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
- spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
- }
- else if (fbS_reg_set) {
- spe_move(f, fbZS_reg, fbS_reg);
- }
- else {
- spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
- }
+ spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
}
else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
- if (fbZ_reg_set) {
- spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
- }
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
- if (fbZ_reg_set) {
- spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
- }
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
}
else if (zs_format == PIPE_FORMAT_S8_UNORM) {
ASSERT(0); /* XXX to do */
@@ -2080,6 +2141,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
}
+ /* Don't need these any more */
release_optional_register(f, &fbZ_reg_set, fbZ_reg);
release_optional_register(f, &fbS_reg_set, fbS_reg);
}
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index b633880c256..c93958a9ed5 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -30,7 +30,6 @@
#include <libspe2.h>
-#include <libmisc.h>
#include <pthread.h>
#include "cell/common.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 9ac2f3bbb96..ae88d069122 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -28,6 +28,7 @@
* Authors:
* Keith Whitwell <[email protected]>
* Michel Dänzer <[email protected]>
+ * Brian Paul
*/
#include "pipe/p_context.h"
@@ -42,10 +43,9 @@
#include "cell_texture.h"
-/* Simple, maximally packed layout.
- */
-static unsigned minify( unsigned d )
+static unsigned
+minify(unsigned d)
{
return MAX2(1, d>>1);
}
@@ -212,6 +212,89 @@ twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
/**
+ * For Cell. Basically, rearrange the pixels/quads from this layout:
+ * +--+--+--+--+
+ * |p0|p1|p2|p3|....
+ * +--+--+--+--+
+ *
+ * to this layout:
+ * +--+--+
+ * |p0|p1|....
+ * +--+--+
+ * |p2|p3|
+ * +--+--+
+ */
+static void
+twiddle_tile(const uint *tileIn, uint *tileOut)
+{
+ int y, x;
+
+ for (y = 0; y < TILE_SIZE; y+=2) {
+ for (x = 0; x < TILE_SIZE; x+=2) {
+ int k = 4 * (y/2 * TILE_SIZE/2 + x/2);
+ tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k];
+ tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1];
+ tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2];
+ tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3];
+ }
+ }
+}
+
+
+/**
+ * Convert image from tiled layout to linear layout. 4-byte pixels.
+ */
+static void
+untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+ uint dst_stride, const uint *src)
+{
+ const uint tile_size2 = tile_size * tile_size;
+ const uint h_t = (h + tile_size - 1) / tile_size;
+ const uint w_t = (w + tile_size - 1) / tile_size;
+ uint *tile_buf;
+ uint it, jt; /* tile counters */
+ uint i, j; /* intra-tile counters */
+
+ dst_stride /= 4; /* convert from bytes to pixels */
+
+ tile_buf = align_malloc(tile_size * tile_size * 4, 16);
+
+ /* loop over src tiles */
+ for (it = 0; it < h_t; it++) {
+ for (jt = 0; jt < w_t; jt++) {
+ /* start of src tile: */
+ const uint *tsrc = src + (it * w_t + jt) * tile_size2;
+
+ twiddle_tile(tsrc, tile_buf);
+ tsrc = tile_buf;
+
+ /* compute size of this tile (may be smaller than tile_size) */
+ /* XXX note: a compiler bug was found here. That's why the code
+ * looks as it does.
+ */
+ uint tile_width = w - jt * tile_size;
+ tile_width = MIN2(tile_width, tile_size);
+ uint tile_height = h - it * tile_size;
+ tile_height = MIN2(tile_height, tile_size);
+
+ /* loop over texels in the tile */
+ for (i = 0; i < tile_height; i++) {
+ for (j = 0; j < tile_width; j++) {
+ uint dsti = it * tile_size + i;
+ uint dstj = jt * tile_size + j;
+ ASSERT(dsti < h);
+ ASSERT(dstj < w);
+ dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j];
+ }
+ }
+ }
+ }
+
+ align_free(tile_buf);
+}
+
+
+/**
* Convert linear texture image data to tiled format for SPU usage.
*/
static void
@@ -230,6 +313,7 @@ cell_twiddle_texture(struct pipe_screen *screen,
switch (ct->base.format) {
case PIPE_FORMAT_A8R8G8B8_UNORM:
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
{
int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
int offset = bufWidth * bufHeight * 4 * surface->face;
@@ -261,6 +345,51 @@ cell_twiddle_texture(struct pipe_screen *screen,
}
+/**
+ * Convert SPU tiled texture image data to linear format for app usage.
+ */
+static void
+cell_untwiddle_texture(struct pipe_screen *screen,
+ struct pipe_surface *surface)
+{
+ struct cell_texture *ct = cell_texture(surface->texture);
+ const uint level = surface->level;
+ const uint texWidth = ct->base.width[level];
+ const uint texHeight = ct->base.height[level];
+ const void *map = pipe_buffer_map(screen, surface->buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
+
+ switch (ct->base.format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ case PIPE_FORMAT_B8G8R8A8_UNORM:
+ {
+ int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+ int offset = surface->stride * texHeight * 4 * surface->face;
+ uint *dst;
+
+ if (!ct->untiled_data[level]) {
+ ct->untiled_data[level] =
+ align_malloc(surface->stride * texHeight * 4 * numFaces, 16);
+ }
+
+ dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset);
+
+ untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+ surface->stride, src);
+ }
+ break;
+ default:
+ {
+ ct->untiled_data[level] = NULL;
+ printf("Cell: untwiddle unsupported texture format\n");
+ }
+ }
+
+ pipe_buffer_unmap(screen, surface->buffer);
+}
+
+
static struct pipe_surface *
cell_get_tex_surface(struct pipe_screen *screen,
struct pipe_texture *pt,
@@ -294,13 +423,18 @@ cell_get_tex_surface(struct pipe_screen *screen,
ps->zslice = zslice;
if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
- ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
- ps->nblocksy *
- ps->stride;
+ ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
+ ps->nblocksy *
+ ps->stride;
}
else {
- assert(face == 0);
- assert(zslice == 0);
+ assert(face == 0);
+ assert(zslice == 0);
+ }
+
+ if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) {
+ /* convert from tiled to linear layout */
+ cell_untwiddle_texture(screen, ps);
}
}
return ps;
@@ -311,6 +445,15 @@ static void
cell_tex_surface_release(struct pipe_screen *screen,
struct pipe_surface **s)
{
+ struct cell_texture *ct = cell_texture((*s)->texture);
+ const uint level = (*s)->level;
+
+ if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level]))
+ {
+ align_free(ct->untiled_data[level]);
+ ct->untiled_data[level] = NULL;
+ }
+
/* XXX if done rendering to teximage, re-tile */
pipe_texture_reference(&(*s)->texture, NULL);
@@ -325,6 +468,10 @@ cell_surface_map(struct pipe_screen *screen,
unsigned flags)
{
ubyte *map;
+ struct cell_texture *ct = cell_texture(surface->texture);
+ const uint level = surface->level;
+
+ assert(ct);
if (flags & ~surface->usage) {
assert(0);
@@ -335,7 +482,14 @@ cell_surface_map(struct pipe_screen *screen,
if (map == NULL)
return NULL;
else
- return (void *) (map + surface->offset);
+ {
+ if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) {
+ return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset);
+ }
+ else {
+ return (void *) (map + surface->offset);
+ }
+ }
}
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 2f5fe0dd1b7..7018b0c9bf7 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -52,6 +52,7 @@ struct cell_texture
struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
/** Mapped, tiled texture data */
void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
+ void *untiled_data[CELL_MAX_TEXTURE_LEVELS];
};
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index a6ed29ea631..d726622d94f 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -107,7 +107,7 @@ cmd_fence(struct cell_command_fence *fence_cmd)
CELL_FENCE_SIGNALLED};
uint *dst = (uint *) fence_cmd->fence;
dst += 4 * spu.init.id; /* main store/memory address, not local store */
-
+ ASSERT_ALIGN16(dst);
mfc_put((void *) &status, /* src in local memory */
(unsigned int) dst, /* dst in main memory */
sizeof(status), /* size */
@@ -244,8 +244,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
}
}
- spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
- spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+ spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
}
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 3534b35000d..ff3d609d258 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -38,7 +38,9 @@
#include <math.h>
#include <cos14_v.h>
#include <sin14_v.h>
-#include <transpose_matrix4x4.h>
+#include <simdmath/exp2f4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
#include "cell/common.h"
#include "spu_main.h"
@@ -68,37 +70,19 @@ spu_sin(vector float x)
static vector float
spu_pow(vector float x, vector float y)
{
- float z0 = powf(spu_extract(x,0), spu_extract(y,0));
- float z1 = powf(spu_extract(x,1), spu_extract(y,1));
- float z2 = powf(spu_extract(x,2), spu_extract(y,2));
- float z3 = powf(spu_extract(x,3), spu_extract(y,3));
- return (vector float) {z0, z1, z2, z3};
+ return _powf4(x, y);
}
static vector float
spu_exp2(vector float x)
{
- float z0 = powf(2.0f, spu_extract(x,0));
- float z1 = powf(2.0f, spu_extract(x,1));
- float z2 = powf(2.0f, spu_extract(x,2));
- float z3 = powf(2.0f, spu_extract(x,3));
- return (vector float) {z0, z1, z2, z3};
+ return _exp2f4(x);
}
static vector float
spu_log2(vector float x)
{
- /*
- * log_base_2(x) = log(x) / log(2)
- * 1.442695 = 1/log(2).
- */
- static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
- float z0 = logf(spu_extract(x,0));
- float z1 = logf(spu_extract(x,1));
- float z2 = logf(spu_extract(x,2));
- float z3 = logf(spu_extract(x,3));
- vector float v = (vector float) {z0, z1, z2, z3};
- return spu_mul(v, k);
+ return _log2f4(x);
}
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 668af10be25..692790c9f3c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -160,8 +160,7 @@ struct spu_global
tile_t ztile ALIGN16_ATTRIB;
/** Read depth/stencil tiles? */
- boolean read_depth;
- boolean read_stencil;
+ boolean read_depth_stencil;
/** Current tiles' status */
ubyte cur_ctile_status, cur_ztile_status;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5515bb55c95..7c225e2f27c 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
static INLINE void
get_cz_tiles(uint tx, uint ty)
{
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
//printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
wait_put_cz_tiles(void)
{
wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
wait_on_mask(1 << TAG_WRITE_TILE_Z);
}
}
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 4caf7d6b613..5f908159bbf 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -369,7 +369,7 @@ flush_spans(void)
}
ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
- if (spu.read_depth) {
+ if (spu.read_depth_stencil) {
if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
/* wait for mfc_get() to complete */
//printf("SPU: %u: waiting for ztile\n", spu.init.id);
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 0111469405f..31908a517b7 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -40,7 +40,7 @@
#include "tgsi/tgsi_sse2.h"
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
#include "rtasm/rtasm_x86sse.h"
@@ -92,7 +92,8 @@ fs_sse_run( const struct sp_fragment_shader *base,
machine->Temps);
/* init kill mask */
- machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] = 0x0;
+ tgsi_set_kill_mask(machine, 0x0);
+ tgsi_set_exec_mask(machine, 1, 1, 1, 1);
shader->func( machine->Inputs,
machine->Outputs,
diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
index d05e12d1d95..b7aac7f84a7 100644
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ b/src/gallium/drivers/softpipe/sp_quad_output.c
@@ -64,6 +64,14 @@ output_quad(struct quad_stage *qs, struct quad_header *quad)
for (i = 0; i < 4; i++) { /* loop over color chans */
tile->data.color[y][x][i] = quadColor[i][j];
}
+ if (0) {
+ debug_printf("sp write pixel %d,%d: %g, %g, %g\n",
+ quad->input.x0 + x,
+ quad->input.y0 + y,
+ quadColor[0][j],
+ quadColor[1][j],
+ quadColor[2][j]);
+ }
}
}
}