author     Ben Skeggs <[email protected]>   2008-11-10 15:53:51 +1100
committer  Ben Skeggs <[email protected]>   2008-11-10 15:53:51 +1100
commit     32e6be6362e44609d36c2fb20a4c858f57c908fb (patch)
tree       4ed99e93ef5f4a8bb51653917c911e04e42f5235 /src/gallium/drivers/cell/ppu
parent     92674bc8889e10e580c630cf85c106fa6eb34d7b (diff)
parent     399da3a337932c6074a69ac73e711138271308eb (diff)
Merge remote branch 'origin/gallium-0.2' into gallium-0.2
Diffstat (limited to 'src/gallium/drivers/cell/ppu')
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_batch.c         |  19
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_clear.c         |  13
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_context.h       |   2
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_fence.c         |  14
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_flush.c         |   2
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_gen_fp.c        | 103
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_gen_fragment.c  | 332
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_spu.h           |   1
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_texture.c       | 172
-rw-r--r--  src/gallium/drivers/cell/ppu/cell_texture.h       |   1
10 files changed, 473 insertions(+), 186 deletions(-)
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c index 448b723d85a..962775cd335 100644 --- a/src/gallium/drivers/cell/ppu/cell_batch.c +++ b/src/gallium/drivers/cell/ppu/cell_batch.c @@ -100,12 +100,23 @@ emit_fence(struct cell_context *cell) const uint batch = cell->cur_batch; const uint size = cell->buffer_size[batch]; struct cell_command_fence *fence_cmd; + struct cell_fence *fence = &cell->fenced_buffers[batch].fence; + uint i; + + /* set fence status to emitted, not yet signalled */ + for (i = 0; i < cell->num_spus; i++) { + fence->status[i][0] = CELL_FENCE_EMITTED; + } ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE); fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size); fence_cmd->opcode = CELL_CMD_FENCE; - fence_cmd->fence = &cell->fenced_buffers[batch].fence; + fence_cmd->fence = fence; + + /* update batch buffer size */ + cell->buffer_size[batch] = size + sizeof(struct cell_command_fence); + assert(sizeof(struct cell_command_fence) % 8 == 0); } @@ -119,7 +130,7 @@ cell_batch_flush(struct cell_context *cell) { static boolean flushing = FALSE; uint batch = cell->cur_batch; - const uint size = cell->buffer_size[batch]; + uint size = cell->buffer_size[batch]; uint spu, cmd_word; assert(!flushing); @@ -130,8 +141,10 @@ cell_batch_flush(struct cell_context *cell) /* Before we use this batch buffer, make sure any fenced texture buffers * are released. */ - if (cell->fenced_buffers[batch].head) + if (cell->fenced_buffers[batch].head) { emit_fence(cell); + size = cell->buffer_size[batch]; + } flushing = TRUE; diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c index c9c0c721bbe..037635e4660 100644 --- a/src/gallium/drivers/cell/ppu/cell_clear.c +++ b/src/gallium/drivers/cell/ppu/cell_clear.c @@ -106,4 +106,17 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, clr->surface = surfIndex; clr->value = clearValue; } + + /* Technically, the surface's contents are now known and cleared, + * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But + * it turns out it's quite painful to recognize when any particular + * surface goes from PIPE_SURFACE_STATUS_CLEAR to + * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because + * the drawing commands could be operating on numerous draw buffers, + * which we'd have to iterate through to set all their stati... + * For now, we cheat a bit and set the surface's status to DEFINED + * right here. Later we should revisit this and set the status to + * CLEAR here, and find a better place to set the status to DEFINED. 
+ */ + ps->status = PIPE_SURFACE_STATUS_DEFINED; } diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 4491ae8cdf1..eb1397bb3fa 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -89,7 +89,7 @@ struct cell_buffer_node; */ struct cell_buffer_list { - struct cell_fence fence; + struct cell_fence fence ALIGN16_ATTRIB; struct cell_buffer_node *head; }; diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c index ffb3bea12b9..867b5dcaa09 100644 --- a/src/gallium/drivers/cell/ppu/cell_fence.c +++ b/src/gallium/drivers/cell/ppu/cell_fence.c @@ -38,6 +38,7 @@ void cell_fence_init(struct cell_fence *fence) { uint i; + ASSERT_ALIGN16(fence->status); for (i = 0; i < CELL_MAX_SPUS; i++) { fence->status[i][0] = CELL_FENCE_IDLE; } @@ -50,9 +51,9 @@ cell_fence_signalled(const struct cell_context *cell, { uint i; for (i = 0; i < cell->num_spus; i++) { - //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE); - if (fence->status[i][0] == CELL_FENCE_EMITTED) + if (fence->status[i][0] != CELL_FENCE_SIGNALLED) return FALSE; + /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/ } return TRUE; } @@ -65,6 +66,15 @@ cell_fence_finish(const struct cell_context *cell, while (!cell_fence_signalled(cell, fence)) { usleep(10); } + +#ifdef DEBUG + { + uint i; + for (i = 0; i < cell->num_spus; i++) { + assert(fence->status[i][0] == CELL_FENCE_SIGNALLED); + } + } +#endif } diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c index 6596b720101..a64967b4b9e 100644 --- a/src/gallium/drivers/cell/ppu/cell_flush.c +++ b/src/gallium/drivers/cell/ppu/cell_flush.c @@ -49,7 +49,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags, flags |= CELL_FLUSH_WAIT; } - if (flags & PIPE_FLUSH_SWAPBUFFERS) + if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE)) flags |= CELL_FLUSH_WAIT; draw_flush( cell->draw ); diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index d4d644d6e81..5c41b264ac8 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -1303,60 +1303,91 @@ lookup_function(struct cell_context *cell, const char *funcname) /** * Emit code to call a SPU function. * Used to implement instructions like SIN/COS/POW/TEX/etc. + * If scalar, only the X components of the src regs are used, and the + * result is replicated across the dest register's XYZW components. */ static boolean emit_function_call(struct codegen *gen, const struct tgsi_full_instruction *inst, - char *funcname, uint num_args) + char *funcname, uint num_args, boolean scalar) { const uint addr = lookup_function(gen->cell, funcname); char comment[100]; - int ch; + int s_regs[3]; + int func_called = FALSE; + uint a, ch; + int retval_reg = -1; assert(num_args <= 3); snprintf(comment, sizeof(comment), "CALL %s:", funcname); spe_comment(gen->f, -4, comment); + if (scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]); + } + /* we'll call the function, put the return value in this register, + * then replicate it across all write-enabled components in d_reg. 
+ */ + retval_reg = spe_allocate_available_register(gen->f); + } + for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s_regs[3], d_reg; + int d_reg; ubyte usedRegs[SPE_NUM_REGS]; - uint a, i, numUsed; + uint i, numUsed; - for (a = 0; a < num_args; a++) { - s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + if (!scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + } } - d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - numUsed = spe_get_registers_used(gen->f, usedRegs); - assert(numUsed < gen->frame_size / 16 - 2); + d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* save registers to stack */ - for (i = 0; i < numUsed; i++) { - uint reg = usedRegs[i]; - int offset = 2 + i; - spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); - } + if (!scalar || !func_called) { + /* for a scalar function, we'll really only call the function once */ - /* setup function arguments */ - for (a = 0; a < num_args; a++) { - spe_move(gen->f, 3 + a, s_regs[a]); - } + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); - /* branch to function, save return addr */ - spe_brasl(gen->f, SPE_REG_RA, addr); + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } - /* save function's return value */ - spe_move(gen->f, d_reg, 3); + /* setup function arguments */ + for (a = 0; a < num_args; a++) { + spe_move(gen->f, 3 + a, s_regs[a]); + } - /* restore registers from stack */ - for (i = 0; i < numUsed; i++) { - uint reg = usedRegs[i]; - if (reg != d_reg) { - int offset = 2 + i; - spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); + + /* save function's return value */ + if (scalar) + spe_move(gen->f, retval_reg, 3); + else + spe_move(gen->f, d_reg, 3); + + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_reg && reg != retval_reg) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } } + + func_called = TRUE; + } + + if (scalar) { + spe_move(gen->f, d_reg, retval_reg); } store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); @@ -1364,6 +1395,10 @@ emit_function_call(struct codegen *gen, } } + if (scalar) { + spe_release_register(gen->f, retval_reg); + } + return true; } @@ -1770,15 +1805,15 @@ emit_instruction(struct codegen *gen, return emit_END(gen); case TGSI_OPCODE_COS: - return emit_function_call(gen, inst, "spu_cos", 1); + return emit_function_call(gen, inst, "spu_cos", 1, TRUE); case TGSI_OPCODE_SIN: - return emit_function_call(gen, inst, "spu_sin", 1); + return emit_function_call(gen, inst, "spu_sin", 1, TRUE); case TGSI_OPCODE_POW: - return emit_function_call(gen, inst, "spu_pow", 2); + return emit_function_call(gen, inst, "spu_pow", 2, TRUE); case TGSI_OPCODE_EXPBASE2: - return emit_function_call(gen, inst, "spu_exp2", 1); + return emit_function_call(gen, inst, "spu_exp2", 1, TRUE); case TGSI_OPCODE_LOGBASE2: - return emit_function_call(gen, inst, "spu_log2", 1); + return emit_function_call(gen, inst, "spu_log2", 1, TRUE); case TGSI_OPCODE_TEX: /* fall-through for now */ case TGSI_OPCODE_TXD: diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 4e1e53ecdc7..d9c3ff3f4d0 100644 --- 
a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -1141,13 +1141,17 @@ gen_colormask(struct spe_function *f, * access to the Compare Immediate instructions where we don't in * gen_depth_test(), which is what makes us very different. * + * There's some added complexity if there's a non-trivial state->mask + * value; then stencil and reference both must be masked + * * The return value in the stencil_pass_reg is a bitmask of valid * fragments that also passed the stencil test. The bitmask of valid - * fragments that failed would be found in (mask_reg & ~stencil_pass_reg). + * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg). */ static void gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, - unsigned int mask_reg, unsigned int fbS_reg, + unsigned int stencil_max_value, + unsigned int fragment_mask_reg, unsigned int fbS_reg, unsigned int stencil_pass_reg) { /* Generate code that puts the set of passing fragments into the stencil_pass_reg @@ -1155,68 +1159,134 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, */ switch (state->func) { case PIPE_FUNC_EQUAL: - /* stencil_pass = mask & (s == reference) */ - spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; case PIPE_FUNC_NOTEQUAL: - /* stencil_pass = mask & ~(s == reference) */ - spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & ~(s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; case PIPE_FUNC_GREATER: - /* stencil_pass = mask & (s > reference) */ - spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((s&mask) > (reference&mask)) */ 
+ unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - case PIPE_FUNC_LESS: { - /* stencil_pass = mask & (reference > s) */ - /* There's no convenient Compare Less Than Immediate instruction, so - * we'll have to do this one the harder way, by loading a register and - * comparing directly. Compare Logical Greater Than Word (clgt) - * treats its operands as unsigned - no sign extension. - */ - unsigned int tmp_reg = spe_allocate_available_register(f); - spe_load_uint(f, tmp_reg, state->ref_value); - spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); - spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg); - spe_release_register(f, tmp_reg); + case PIPE_FUNC_LESS: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference > s) */ + /* There's no convenient Compare Less Than Immediate instruction, so + * we'll have to do this one the harder way, by loading a register and + * comparing directly. Compare Logical Greater Than Word (clgt) + * treats its operands as unsigned - no sign extension. + */ + unsigned int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */ + unsigned int tmp_reg = spe_allocate_available_register(f); + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - } case PIPE_FUNC_LEQUAL: - /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */ - spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); - spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg); + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s <= reference) + * = fragment_mask & ~(s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - case PIPE_FUNC_GEQUAL: { - /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */ - /* As above, we have to do this by loading a register */ - unsigned int tmp_reg = spe_allocate_available_register(f); - spe_load_uint(f, tmp_reg, state->ref_value); - spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); - spe_andc(f, stencil_pass_reg, mask_reg, 
stencil_pass_reg); - spe_release_register(f, tmp_reg); + case PIPE_FUNC_GEQUAL: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s >= reference) ] + * = fragment_mask & ~(reference > s) */ + /* As above, we have to do this by loading a register */ + unsigned int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */ + unsigned int tmp_reg = spe_allocate_available_register(f); + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } break; - } case PIPE_FUNC_NEVER: - /* stencil_pass = mask & 0 = 0 */ + /* stencil_pass = fragment_mask & 0 = 0 */ spe_load_uint(f, stencil_pass_reg, 0); break; case PIPE_FUNC_ALWAYS: - /* stencil_pass = mask & 1 = mask */ - spe_move(f, stencil_pass_reg, mask_reg); + /* stencil_pass = fragment_mask & 1 = fragment_mask */ + spe_move(f, stencil_pass_reg, fragment_mask_reg); break; } /* The fragments that passed the stencil test are now in stencil_pass_reg. - * The fragments that failed would be (mask_reg & ~stencil_pass_reg). + * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg). */ } @@ -1282,7 +1352,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op, /* Add Word Immediate computes rT = rA + 10-bit signed immediate */ spe_ai(f, newS_reg, fbS_reg, 1); /* Select from the current value or the new value based on the equality test */ - spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg); + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); spe_release_register(f, equals_reg); break; @@ -1295,7 +1365,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op, /* Add Word Immediate with a (-1) value works */ spe_ai(f, newS_reg, fbS_reg, -1); /* Select from the current value or the new value based on the equality test */ - spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg); + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); spe_release_register(f, equals_reg); break; @@ -1534,15 +1604,28 @@ gen_stencil_depth_test(struct spe_function *f, * meaning that we have to calculate the stencil values but do not * need to mask them), we can avoid generating code. Don't forget * that we need to consider backfacing stencil, if enabled. + * + * Note that if the backface stencil is *not* enabled, the backface + * stencil will have the same values as the frontface stencil. */ - if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) { - /* Trivial: don't need to calculate stencil values, and don't need to - * write them back to the framebuffer. 
+ if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP && + dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) { + /* No changes to any stencil values */ + need_to_calculate_stencil_values = false; + need_to_writemask_stencil_values = false; + } + else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) { + /* All changes are writemasked out, so no need to calculate + * what those changes might be, and no need to write anything back. */ need_to_calculate_stencil_values = false; need_to_writemask_stencil_values = false; } - else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) { + else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) { /* Still trivial, but a little less so. We need to write the stencil * values, but we don't need to mask them. */ @@ -1583,7 +1666,7 @@ gen_stencil_depth_test(struct spe_function *f, */ spe_comment(f, 0, "Running basic stencil test"); stencil_pass_reg = spe_allocate_available_register(f); - gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg); + gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg); /* If two-sided stenciling is on, generate code to run the stencil * test on the backfacing stencil as well, and combine the two results @@ -1592,7 +1675,7 @@ gen_stencil_depth_test(struct spe_function *f, if (dsa->stencil[1].enabled) { unsigned int temp_reg = spe_allocate_available_register(f); spe_comment(f, 0, "Running backface stencil test"); - gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg); + gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg); spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg); spe_release_register(f, temp_reg); } @@ -1914,81 +1997,79 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) * Z and/or stencil. We'll also convert the incoming fragment Z * value in fragZ_reg from a floating point value in [0.0..1.0] to * an unsigned integer value with the appropriate resolution. + * Note that even if depth or stencil is *not* enabled, if it's + * present in the buffer, we pull it out and put it back later; + * otherwise, we can inadvertently destroy the contents of + * buffers we're not supposed to touch (e.g., if the user is + * clearing the depth buffer but not the stencil buffer, a + * quad of constant depth is drawn over the surface; the stencil + * buffer must be maintained). 
*/ switch(zs_format) { case PIPE_FORMAT_S8Z24_UNORM: /* fall through */ case PIPE_FORMAT_X8Z24_UNORM: - if (dsa->depth.enabled) { - /* We need the Z part at least */ - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* four 24-bit Z values in the low-order bits */ - spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 24-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - } - if (dsa->stencil[0].enabled) { - setup_optional_register(f, &fbS_reg_set, &fbS_reg); - /* four 8-bit Z values in the high-order bits */ - spe_rotmi(f, fbS_reg, fbZS_reg, -24); - } - break; + /* Pull out both Z and stencil */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + + /* four 24-bit Z values in the low-order bits */ + spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* four 8-bit stencil values in the high-order bits */ + spe_rotmi(f, fbS_reg, fbZS_reg, -24); + break; case PIPE_FORMAT_Z24S8_UNORM: /* fall through */ case PIPE_FORMAT_Z24X8_UNORM: - if (dsa->depth.enabled) { - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* shift by 8 to get the upper 24-bit values */ - spe_rotmi(f, fbS_reg, fbZS_reg, -8); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 24-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -8); - } - if (dsa->stencil[0].enabled) { - setup_optional_register(f, &fbS_reg_set, &fbS_reg); - /* 8-bit stencil in the low-order bits - mask them out */ - spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); - } - break; + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + + /* shift by 8 to get the upper 24-bit values */ + spe_rotmi(f, fbS_reg, fbZS_reg, -8); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* 8-bit stencil in the low-order bits - mask them out */ + spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); + break; case PIPE_FORMAT_Z32_UNORM: - if (dsa->depth.enabled) { - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* Copy over 4 32-bit values */ - spe_move(f, fbZ_reg, fbZS_reg); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 32-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - } + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 32-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); /* No stencil, so can't do anything there */ - break; + break; case PIPE_FORMAT_Z16_UNORM: - if (dsa->depth.enabled) { - /* XXX Not sure this is correct, but it was here before, so we're - * going with it for now - */ - setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); - /* Copy over 4 32-bit values */ - spe_move(f, fbZ_reg, fbZS_reg); - - /* Incoming fragZ_reg value is a float in 0.0...1.0; convert - * to a 16-bit unsigned integer - */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 32); - spe_rotmi(f, fragZ_reg, fragZ_reg, -16); - } + /* XXX Not sure this is 
correct, but it was here before, so we're + * going with it for now + */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 16-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -16); /* No stencil */ - break; default: ASSERT(0); /* invalid format */ @@ -2035,39 +2116,19 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_comment(f, 0, "Store quad's depth/stencil values in tile"); if (zs_format == PIPE_FORMAT_S8Z24_UNORM || zs_format == PIPE_FORMAT_X8Z24_UNORM) { - if (fbS_reg_set && fbZ_reg_set) { - spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ - } - else if (fbS_reg_set) { - spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ - } - else { - spe_move(f, fbZS_reg, fbZ_reg); - } + spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ } else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || zs_format == PIPE_FORMAT_Z24X8_UNORM) { - if (fbS_reg_set && fbZ_reg_set) { - spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ - spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ - } - else if (fbS_reg_set) { - spe_move(f, fbZS_reg, fbS_reg); - } - else { - spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ - } + spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ } else if (zs_format == PIPE_FORMAT_Z32_UNORM) { - if (fbZ_reg_set) { - spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ - } + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_Z16_UNORM) { - if (fbZ_reg_set) { - spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ - } + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_S8_UNORM) { ASSERT(0); /* XXX to do */ @@ -2080,6 +2141,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); } + /* Don't need these any more */ release_optional_register(f, &fbZ_reg_set, fbZ_reg); release_optional_register(f, &fbS_reg_set, fbS_reg); } diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h index b633880c256..c93958a9ed5 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.h +++ b/src/gallium/drivers/cell/ppu/cell_spu.h @@ -30,7 +30,6 @@ #include <libspe2.h> -#include <libmisc.h> #include <pthread.h> #include "cell/common.h" diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c index 9ac2f3bbb96..ae88d069122 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.c +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -28,6 +28,7 @@ * Authors: * Keith Whitwell <[email protected]> * Michel Dänzer <[email protected]> + * Brian Paul */ #include "pipe/p_context.h" @@ -42,10 +43,9 @@ #include "cell_texture.h" -/* Simple, maximally packed layout. - */ -static unsigned minify( unsigned d ) +static unsigned +minify(unsigned d) { return MAX2(1, d>>1); } @@ -212,6 +212,89 @@ twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, /** + * For Cell. Basically, rearrange the pixels/quads from this layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|.... + * +--+--+--+--+ + * + * to this layout: + * +--+--+ + * |p0|p1|.... 
+ * +--+--+ + * |p2|p3| + * +--+--+ + */ +static void +twiddle_tile(const uint *tileIn, uint *tileOut) +{ + int y, x; + + for (y = 0; y < TILE_SIZE; y+=2) { + for (x = 0; x < TILE_SIZE; x+=2) { + int k = 4 * (y/2 * TILE_SIZE/2 + x/2); + tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; + tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; + tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; + tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; + } + } +} + + +/** + * Convert image from tiled layout to linear layout. 4-byte pixels. + */ +static void +untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, + uint dst_stride, const uint *src) +{ + const uint tile_size2 = tile_size * tile_size; + const uint h_t = (h + tile_size - 1) / tile_size; + const uint w_t = (w + tile_size - 1) / tile_size; + uint *tile_buf; + uint it, jt; /* tile counters */ + uint i, j; /* intra-tile counters */ + + dst_stride /= 4; /* convert from bytes to pixels */ + + tile_buf = align_malloc(tile_size * tile_size * 4, 16); + + /* loop over src tiles */ + for (it = 0; it < h_t; it++) { + for (jt = 0; jt < w_t; jt++) { + /* start of src tile: */ + const uint *tsrc = src + (it * w_t + jt) * tile_size2; + + twiddle_tile(tsrc, tile_buf); + tsrc = tile_buf; + + /* compute size of this tile (may be smaller than tile_size) */ + /* XXX note: a compiler bug was found here. That's why the code + * looks as it does. + */ + uint tile_width = w - jt * tile_size; + tile_width = MIN2(tile_width, tile_size); + uint tile_height = h - it * tile_size; + tile_height = MIN2(tile_height, tile_size); + + /* loop over texels in the tile */ + for (i = 0; i < tile_height; i++) { + for (j = 0; j < tile_width; j++) { + uint dsti = it * tile_size + i; + uint dstj = jt * tile_size + j; + ASSERT(dsti < h); + ASSERT(dstj < w); + dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j]; + } + } + } + } + + align_free(tile_buf); +} + + +/** * Convert linear texture image data to tiled format for SPU usage. */ static void @@ -230,6 +313,7 @@ cell_twiddle_texture(struct pipe_screen *screen, switch (ct->base.format) { case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: { int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; int offset = bufWidth * bufHeight * 4 * surface->face; @@ -261,6 +345,51 @@ cell_twiddle_texture(struct pipe_screen *screen, } +/** + * Convert SPU tiled texture image data to linear format for app usage. + */ +static void +cell_untwiddle_texture(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + const uint texWidth = ct->base.width[level]; + const uint texHeight = ct->base.height[level]; + const void *map = pipe_buffer_map(screen, surface->buffer, + PIPE_BUFFER_USAGE_CPU_READ); + const uint *src = (const uint *) ((const ubyte *) map + surface->offset); + + switch (ct->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: + { + int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 
6 : 1; + int offset = surface->stride * texHeight * 4 * surface->face; + uint *dst; + + if (!ct->untiled_data[level]) { + ct->untiled_data[level] = + align_malloc(surface->stride * texHeight * 4 * numFaces, 16); + } + + dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset); + + untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, + surface->stride, src); + } + break; + default: + { + ct->untiled_data[level] = NULL; + printf("Cell: untwiddle unsupported texture format\n"); + } + } + + pipe_buffer_unmap(screen, surface->buffer); +} + + static struct pipe_surface * cell_get_tex_surface(struct pipe_screen *screen, struct pipe_texture *pt, @@ -294,13 +423,18 @@ cell_get_tex_surface(struct pipe_screen *screen, ps->zslice = zslice; if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { - ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) * - ps->nblocksy * - ps->stride; + ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) * + ps->nblocksy * + ps->stride; } else { - assert(face == 0); - assert(zslice == 0); + assert(face == 0); + assert(zslice == 0); + } + + if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) { + /* convert from tiled to linear layout */ + cell_untwiddle_texture(screen, ps); } } return ps; @@ -311,6 +445,15 @@ static void cell_tex_surface_release(struct pipe_screen *screen, struct pipe_surface **s) { + struct cell_texture *ct = cell_texture((*s)->texture); + const uint level = (*s)->level; + + if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) + { + align_free(ct->untiled_data[level]); + ct->untiled_data[level] = NULL; + } + /* XXX if done rendering to teximage, re-tile */ pipe_texture_reference(&(*s)->texture, NULL); @@ -325,6 +468,10 @@ cell_surface_map(struct pipe_screen *screen, unsigned flags) { ubyte *map; + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + + assert(ct); if (flags & ~surface->usage) { assert(0); @@ -335,7 +482,14 @@ cell_surface_map(struct pipe_screen *screen, if (map == NULL) return NULL; else - return (void *) (map + surface->offset); + { + if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) { + return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset); + } + else { + return (void *) (map + surface->offset); + } + } } diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h index 2f5fe0dd1b7..7018b0c9bf7 100644 --- a/src/gallium/drivers/cell/ppu/cell_texture.h +++ b/src/gallium/drivers/cell/ppu/cell_texture.h @@ -52,6 +52,7 @@ struct cell_texture struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS]; /** Mapped, tiled texture data */ void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS]; + void *untiled_data[CELL_MAX_TEXTURE_LEVELS]; }; |
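
Notes on the larger hunks above, with illustrative sketches rather than the driver's actual code.

The cell_batch.c hunk fixes an ordering bug: emit_fence() now appends the fence command and bumps cell->buffer_size[batch], so cell_batch_flush() has to re-read the size after emitting the fence, otherwise the fence command would be cut off the end of the batch. A minimal, self-contained sketch of that ordering, assuming a simplified batch struct and a made-up send() callback:

#include <assert.h>
#include <string.h>

#define FENCE_CMD_SIZE 16           /* stand-in; the real command struct is a multiple of 8 bytes */

struct batch {
   unsigned size;                   /* bytes used in buf[] */
   unsigned char buf[4096];
   int has_fenced_buffers;
};

/* Append a fence command and grow the batch, as emit_fence() now does. */
static void
emit_fence_cmd(struct batch *b)
{
   assert(b->size + FENCE_CMD_SIZE <= sizeof(b->buf));
   memset(b->buf + b->size, 0, FENCE_CMD_SIZE);   /* placeholder command payload */
   b->size += FENCE_CMD_SIZE;                     /* the batch just got bigger */
}

static void
flush_batch(struct batch *b, void (*send)(const void *, unsigned))
{
   unsigned size = b->size;

   if (b->has_fenced_buffers) {
      emit_fence_cmd(b);
      size = b->size;               /* re-read: the fence is part of this batch now */
   }

   send(b->buf, size);              /* transmit the whole batch, fence included */
}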
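
In cell_fence.c, the PPU now waits for every SPU to write CELL_FENCE_SIGNALLED into its slot of the fence's status array (emit_fence() marks the slots EMITTED up front, and cell_fence_signalled() no longer treats an IDLE slot as done). A minimal sketch of that polling wait, assuming the 16-byte-per-SPU status layout the patch asserts with ASSERT_ALIGN16:

#include <stdint.h>
#include <unistd.h>

#define CELL_MAX_SPUS        8
#define CELL_FENCE_IDLE      0
#define CELL_FENCE_EMITTED   1
#define CELL_FENCE_SIGNALLED 2

struct cell_fence_sketch {
   /* one 16-byte (4 x uint32) slot per SPU so each SPU can DMA its own word;
    * volatile stands in for the DMA visibility the real driver relies on */
   volatile uint32_t status[CELL_MAX_SPUS][4] __attribute__((aligned(16)));
};

static int
fence_signalled(const struct cell_fence_sketch *fence, unsigned num_spus)
{
   unsigned i;
   for (i = 0; i < num_spus; i++) {
      if (fence->status[i][0] != CELL_FENCE_SIGNALLED)
         return 0;                  /* at least one SPU hasn't reached the fence yet */
   }
   return 1;
}

static void
fence_finish(const struct cell_fence_sketch *fence, unsigned num_spus)
{
   while (!fence_signalled(fence, num_spus))
      usleep(10);                   /* cheap poll, same idea as cell_fence_finish() */
}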
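
In cell_gen_fp.c, emit_function_call() gains a scalar mode used for SIN, COS, POW, EXPBASE2 and LOGBASE2: the SPU math routine is called once on the X component of the source, and the single return value is copied into every write-enabled channel of the destination, instead of branching to the function once per channel. The equivalent semantics in plain C, with cosf() standing in for the SPU-side spu_cos and an illustrative vec4 type:

#include <math.h>

struct vec4 { float x, y, z, w; };

static struct vec4
cos_scalar(struct vec4 src, struct vec4 dst, unsigned writemask)
{
   /* call the function once, on the X component only */
   const float r = cosf(src.x);

   /* replicate the single result into each write-enabled channel
    * (TGSI writemask bits: X=1, Y=2, Z=4, W=8) */
   if (writemask & 0x1) dst.x = r;
   if (writemask & 0x2) dst.y = r;
   if (writemask & 0x4) dst.z = r;
   if (writemask & 0x8) dst.w = r;
   return dst;
}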
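
Most of the cell_gen_fragment.c growth is gen_stencil_test() honoring a non-trivial value_mask: when the mask does not cover every stencil bit, both the stored stencil value and the reference are ANDed with it before the compare. Written as scalar C for one fragment (the generated SPE code evaluates the same predicate on four fragments at once; the names below are illustrative, not the driver's):

#include <stdbool.h>
#include <stdint.h>

enum stencil_func { FUNC_NEVER, FUNC_LESS, FUNC_EQUAL, FUNC_LEQUAL,
                    FUNC_GREATER, FUNC_NOTEQUAL, FUNC_GEQUAL, FUNC_ALWAYS };

static bool
stencil_pass(enum stencil_func func, uint8_t s, uint8_t ref,
             uint8_t value_mask, bool fragment_alive)
{
   /* when value_mask covers all bits, this reduces to the old unmasked test */
   const uint8_t ms = s   & value_mask;
   const uint8_t mr = ref & value_mask;
   bool pass;

   switch (func) {
   case FUNC_NEVER:    pass = false;    break;
   case FUNC_LESS:     pass = ms <  mr; break;
   case FUNC_EQUAL:    pass = ms == mr; break;
   case FUNC_LEQUAL:   pass = ms <= mr; break;
   case FUNC_GREATER:  pass = ms >  mr; break;
   case FUNC_NOTEQUAL: pass = ms != mr; break;
   case FUNC_GEQUAL:   pass = ms >= mr; break;
   default:            pass = true;     break;   /* FUNC_ALWAYS */
   }

   /* only fragments that were alive going in can pass */
   return fragment_alive && pass;
}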
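
The new untwiddle path in cell_texture.c converts the SPU tile layout back to linear for CPU reads: within a tile, each 2x2 quad of pixels is stored contiguously. A small helper showing the index mapping that twiddle_tile() applies in bulk (TILE_SIZE is assumed to be 32 here; the driver defines it in a shared header):

#define TILE_SIZE 32

/* Map a linear (x, y) position inside a tile to its index in the
 * quad-ordered storage that the SPUs write.  twiddle_tile() in the patch
 * is equivalent to reading every pixel of a tile through this mapping. */
static unsigned
quad_ordered_index(unsigned x, unsigned y)
{
   const unsigned quad   = (y / 2) * (TILE_SIZE / 2) + (x / 2);  /* which 2x2 block */
   const unsigned within = (y & 1) * 2 + (x & 1);                /* position inside the block */
   return quad * 4 + within;
}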