diff options
Diffstat (limited to 'src/gallium/drivers')
24 files changed, 2959 insertions, 995 deletions
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h index cb0631baf52..99329fd8e22 100644 --- a/src/gallium/drivers/cell/common.h +++ b/src/gallium/drivers/cell/common.h @@ -104,11 +104,11 @@ #define CELL_BUFFER_STATUS_FREE 10 #define CELL_BUFFER_STATUS_USED 20 - -#define CELL_DEBUG_CHECKER (1 << 0) -#define CELL_DEBUG_SYNC (1 << 1) - - +#define CELL_DEBUG_CHECKER (1 << 0) +#define CELL_DEBUG_ASM (1 << 1) +#define CELL_DEBUG_SYNC (1 << 2) +#define CELL_DEBUG_FRAGMENT_OPS (1 << 3) +#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4) /** Max instructions for doing per-fragment operations */ #define SPU_MAX_FRAGMENT_OPS_INSTS 64 @@ -130,7 +130,7 @@ struct cell_command_fragment_ops #define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128 /** - * Command to send a fragment progra to SPUs. + * Command to send a fragment program to SPUs. */ struct cell_command_fragment_program { @@ -267,6 +267,20 @@ struct cell_command } ALIGN16_ATTRIB; +#define MAX_SPU_FUNCTIONS 12 +/** + * Used to tell the PPU about the address of particular functions in the + * SPU's address space. + */ +struct cell_spu_function_info +{ + uint num; + char names[MAX_SPU_FUNCTIONS][16]; + uint addrs[MAX_SPU_FUNCTIONS]; + char pad[12]; /**< Pad struct to multiple of 16 bytes (256 currently) */ +}; + + /** This is the object passed to spe_create_thread() */ struct cell_init_info { @@ -278,6 +292,8 @@ struct cell_init_info /** Buffers for command batches, vertex/index data */ ubyte *buffers[CELL_NUM_BUFFERS]; uint *buffer_status; /**< points at cell_context->buffer_status */ + + struct cell_spu_function_info *spu_functions; } ALIGN16_ATTRIB; diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index 71f1a3049d1..62e213ea354 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -85,13 +85,14 @@ cell_draw_create(struct cell_context *cell) } -#ifdef DEBUG static const struct debug_named_value cell_debug_flags[] = { {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */ + {"asm", CELL_DEBUG_ASM}, /**< dump SPU asm code */ {"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */ + {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/ + {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/ {NULL, 0} }; -#endif struct pipe_context * diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h index 14914b9c6f8..3dc15c9233c 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.h +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -67,35 +67,6 @@ struct cell_fragment_shader_state /** - * Cell blend state atom, subclass of pipe_blend_state. - */ -struct cell_blend_state -{ - struct pipe_blend_state base; - - /** - * Generated code to perform alpha blending - */ - struct spe_function code; -}; - - -/** - * Cell depth/stencil/alpha state atom, subclass of - * pipe_depth_stencil_alpha_state. - */ -struct cell_depth_stencil_alpha_state -{ - struct pipe_depth_stencil_alpha_state base; - - /** - * Generated code to perform alpha, stencil, and depth testing on the SPE - */ - struct spe_function code; -}; - - -/** * Per-context state, subclass of pipe_context. */ struct cell_context @@ -104,10 +75,10 @@ struct cell_context struct cell_winsys *winsys; - const struct cell_blend_state *blend; + const struct pipe_blend_state *blend; const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS]; uint num_samplers; - const struct cell_depth_stencil_alpha_state *depth_stencil; + const struct pipe_depth_stencil_alpha_state *depth_stencil; const struct pipe_rasterizer_state *rasterizer; const struct cell_vertex_shader_state *vs; const struct cell_fragment_shader_state *fs; @@ -149,6 +120,7 @@ struct cell_context /** Mapped constant buffers */ void *mapped_constants[PIPE_SHADER_TYPES]; + struct cell_spu_function_info spu_functions ALIGN16_ATTRIB; uint num_spus; diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index 6ffe94eb14a..8d2d4f2a0f2 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -37,7 +37,7 @@ * \author Brian Paul */ - +#include <math.h> #include "pipe/p_defines.h" #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" @@ -51,25 +51,38 @@ #include "cell_gen_fp.h" -/** Set to 1 to enable debug/disassembly printfs */ -#define DISASSEM 01 +#define MAX_TEMPS 16 +#define MAX_IMMED 8 +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 /** * Context needed during code generation. */ struct codegen { + struct cell_context *cell; int inputs_reg; /**< 1st function parameter */ int outputs_reg; /**< 2nd function parameter */ int constants_reg; /**< 3rd function parameter */ - int temp_regs[8][4]; /**< maps TGSI temps to SPE registers */ + int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */ + int imm_regs[MAX_IMMED][4]; /**< maps TGSI immediates to SPE registers */ + + int num_imm; /**< number of immediates */ int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */ /** Per-instruction temps / intermediate temps */ int num_itemps; - int itemps[3]; + int itemps[10]; + + /** Current IF/ELSE/ENDIF nesting level */ + int if_nesting; + /** Index of execution mask register */ + int exec_mask_reg; struct spe_function *f; boolean error; @@ -112,19 +125,47 @@ get_const_one_reg(struct codegen *gen) { if (gen->one_reg <= 0) { gen->one_reg = spe_allocate_available_register(gen->f); - } - /* one = {1.0, 1.0, 1.0, 1.0} */ - spe_load_float(gen->f, gen->one_reg, 1.0f); -#if DISASSEM - printf("il\tr%d, 1.0f\n", gen->one_reg); -#endif + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "INIT CONSTANT 1.0:"); + + /* one = {1.0, 1.0, 1.0, 1.0} */ + spe_load_float(gen->f, gen->one_reg, 1.0f); + + spe_indent(gen->f, -4); + } return gen->one_reg; } /** + * Return index of the pixel execution mask. + * The register is allocated an initialized upon the first call. + * + * The pixel execution mask controls which pixels in a quad are + * modified, according to surrounding conditionals, loops, etc. + */ +static int +get_exec_mask_reg(struct codegen *gen) +{ + if (gen->exec_mask_reg <= 0) { + gen->exec_mask_reg = spe_allocate_available_register(gen->f); + + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:"); + + /* exec_mask = {~0, ~0, ~0, ~0} */ + spe_load_int(gen->f, gen->exec_mask_reg, ~0); + + spe_indent(gen->f, -4); + } + + return gen->exec_mask_reg; +} + + +/** * Return the index of the SPU temporary containing the named TGSI * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we * just return the corresponding SPE register. If the TGIS register @@ -136,37 +177,88 @@ get_src_reg(struct codegen *gen, int channel, const struct tgsi_full_src_register *src) { - int reg; - - /* XXX need to examine src swizzle info here. - * That will involve changing the channel var... - */ + int reg = -1; + int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); + boolean reg_is_itemp = FALSE; + uint sign_op; + assert(swizzle >= TGSI_SWIZZLE_X); + assert(swizzle <= TGSI_EXTSWIZZLE_ONE); switch (src->SrcRegister.File) { case TGSI_FILE_TEMPORARY: - reg = gen->temp_regs[src->SrcRegister.Index][channel]; + reg = gen->temp_regs[src->SrcRegister.Index][swizzle]; break; case TGSI_FILE_INPUT: { - /* offset is measured in quadwords, not bytes */ - int offset = src->SrcRegister.Index * 4 + channel; - reg = get_itemp(gen); - /* Load: reg = memory[(machine_reg) + offset] */ - spe_lqd(gen->f, reg, gen->inputs_reg, offset); -#if DISASSEM - printf("lqd\tr%d, r%d + %d\n", reg, gen->inputs_reg, offset); -#endif + if(swizzle == TGSI_EXTSWIZZLE_ONE) + { + /* Load const one float and early out */ + reg = get_const_one_reg(gen); + } + else if(swizzle == TGSI_EXTSWIZZLE_ZERO) + { + /* Load const zero float and early out */ + reg = get_itemp(gen); + spe_xor(gen->f, reg, reg, reg); + } + else + { + /* offset is measured in quadwords, not bytes */ + int offset = src->SrcRegister.Index * 4 + swizzle; + reg = get_itemp(gen); + reg_is_itemp = TRUE; + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->inputs_reg, offset); + } } break; case TGSI_FILE_IMMEDIATE: - /* xxx fall-through for now / fix */ + reg = gen->imm_regs[src->SrcRegister.Index][swizzle]; + break; case TGSI_FILE_CONSTANT: /* xxx fall-through for now / fix */ default: assert(0); } + /* + * Handle absolute value, negate or set-negative of src register. + */ + sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); + if (sign_op != TGSI_UTIL_SIGN_KEEP) { + /* + * All sign ops are done by manipulating bit 31, the IEEE float sign bit. + */ + const int bit31mask_reg = get_itemp(gen); + int result_reg; + + if (reg_is_itemp) { + /* re-use 'reg' for the result */ + result_reg = reg; + } + else { + /* alloc a new reg for the result */ + result_reg = get_itemp(gen); + } + + /* mask with bit 31 set, the rest cleared */ + spe_load_int(gen->f, bit31mask_reg, (1 << 31)); + + if (sign_op == TGSI_UTIL_SIGN_CLEAR) { + spe_andc(gen->f, result_reg, reg, bit31mask_reg); + } + else if (sign_op == TGSI_UTIL_SIGN_SET) { + spe_and(gen->f, result_reg, reg, bit31mask_reg); + } + else { + assert(sign_op == TGSI_UTIL_SIGN_TOGGLE); + spe_xor(gen->f, result_reg, reg, bit31mask_reg); + } + + reg = result_reg; + } + return reg; } @@ -183,11 +275,14 @@ get_dst_reg(struct codegen *gen, int channel, const struct tgsi_full_dst_register *dest) { - int reg; + int reg = -1; switch (dest->DstRegister.File) { case TGSI_FILE_TEMPORARY: - reg = gen->temp_regs[dest->DstRegister.Index][channel]; + if (gen->if_nesting > 0) + reg = get_itemp(gen); + else + reg = gen->temp_regs[dest->DstRegister.Index][channel]; break; case TGSI_FILE_OUTPUT: reg = get_itemp(gen); @@ -213,17 +308,40 @@ store_dest_reg(struct codegen *gen, { switch (dest->DstRegister.File) { case TGSI_FILE_TEMPORARY: - /* no-op */ + if (gen->if_nesting > 0) { + int d_reg = gen->temp_regs[dest->DstRegister.Index][channel]; + int exec_reg = get_exec_mask_reg(gen); + /* Mix d with new value according to exec mask: + * d[i] = mask_reg[i] ? value_reg : d_reg + */ + spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg); + } + else { + /* we're not inside a condition or loop: do nothing special */ + } break; case TGSI_FILE_OUTPUT: { /* offset is measured in quadwords, not bytes */ int offset = dest->DstRegister.Index * 4 + channel; - /* Store: memory[(machine_reg) + offset] = reg */ - spe_stqd(gen->f, value_reg, gen->outputs_reg, offset); -#if DISASSEM - printf("stqd\tr%d, r%d + %d\n", value_reg, gen->outputs_reg, offset); -#endif + if (gen->if_nesting > 0) { + int exec_reg = get_exec_mask_reg(gen); + int curval_reg = get_itemp(gen); + /* First read the current value from memory: + * Load: curval = memory[(machine_reg) + offset] + */ + spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset); + /* Mix curval with newvalue according to exec mask: + * d[i] = mask_reg[i] ? value_reg : d_reg + */ + spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg); + /* Store: memory[(machine_reg) + offset] = curval */ + spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset); + } + else { + /* Store: memory[(machine_reg) + offset] = reg */ + spe_stqd(gen->f, value_reg, gen->outputs_reg, offset); + } } break; default: @@ -236,15 +354,13 @@ static boolean emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + spe_comment(gen->f, -4, "MOV:"); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); /* XXX we don't always need to actually emit a mov instruction here */ spe_move(gen->f, dst_reg, src_reg); -#if DISASSEM - printf("mov\tr%d, r%d\n", dst_reg, src_reg); -#endif store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]); free_itemps(gen); } @@ -252,7 +368,6 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) return true; } - /** * Emit addition instructions. Recall that a single TGSI_OPCODE_ADD * becomes (up to) four SPU "fa" instructions because we're doing SOA @@ -262,6 +377,7 @@ static boolean emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + spe_comment(gen->f, -4, "ADD:"); /* Loop over Red/Green/Blue/Alpha channels */ for (ch = 0; ch < 4; ch++) { /* If the dest R, G, B or A writemask is enabled... */ @@ -273,9 +389,6 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) /* Emit actual SPE instruction: d = s1 + s2 */ spe_fa(gen->f, d_reg, s1_reg, s2_reg); -#if DISASSEM - printf("fa\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); -#endif /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); @@ -286,6 +399,82 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) return true; } +/** + * Emit subtract. See emit_ADD for comments. + */ +static boolean +emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "SUB:"); + /* Loop over Red/Green/Blue/Alpha channels */ + for (ch = 0; ch < 4; ch++) { + /* If the dest R, G, B or A writemask is enabled... */ + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + /* get indexes of the two src, one dest SPE registers */ + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* Emit actual SPE instruction: d = s1 - s2 */ + spe_fs(gen->f, d_reg, s1_reg, s2_reg); + + /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + /* Free any intermediate temps we allocated */ + free_itemps(gen); + } + } + return true; +} + +/** + * Emit multiply add. See emit_ADD for comments. + */ +static boolean +emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "MAD:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = s1 * s2 + s3 */ + spe_fma(gen->f, d_reg, s1_reg, s2_reg, s3_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + + +/** + * Emit linear interpolate. See emit_ADD for comments. + */ +static boolean +emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "LERP:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = s3 + s1(s2 - s3) */ + spe_fs(gen->f, d_reg, s2_reg, s3_reg); + spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} /** * Emit multiply. See emit_ADD for comments. @@ -294,6 +483,7 @@ static boolean emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + spe_comment(gen->f, -4, "MUL:"); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); @@ -301,9 +491,6 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); /* d = s1 * s2 */ spe_fm(gen->f, d_reg, s1_reg, s2_reg); -#if DISASSEM - printf("fm\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); -#endif store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); free_itemps(gen); } @@ -311,6 +498,247 @@ emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) return true; } +/** + * Emit reciprocal. See emit_ADD for comments. + */ +static boolean +emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "RCP:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = 1/s1 */ + spe_frest(gen->f, d_reg, s1_reg); + spe_fi(gen->f, d_reg, s1_reg, d_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit reciprocal sqrt. See emit_ADD for comments. + */ +static boolean +emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "RSQ:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = 1/s1 */ + spe_frsqest(gen->f, d_reg, s1_reg); + spe_fi(gen->f, d_reg, s1_reg, d_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit absolute value. See emit_ADD for comments. + */ +static boolean +emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "ABS:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + const int bit31mask_reg = get_itemp(gen); + + /* mask with bit 31 set, the rest cleared */ + spe_load_int(gen->f, bit31mask_reg, (1 << 31)); + + /* d = sign bit cleared in s1 */ + spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit 3 component dot product. See emit_ADD for comments. + */ +static boolean +emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "DP3:"); + + int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + int tmp_reg = get_itemp(gen); + /* t = x0 * x1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* t = y0 * y1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = z0 * z1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} + +/** + * Emit 4 component dot product. See emit_ADD for comments. + */ +static boolean +emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "DP4:"); + + int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + int tmp_reg = get_itemp(gen); + /* t = x0 * x1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* t = y0 * y1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = z0 * z1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); + /* t = w0 * w1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} + +/** + * Emit homogeneous dot product. See emit_ADD for comments. + */ +static boolean +emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "DPH:"); + + int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + int tmp_reg = get_itemp(gen); + + /* t = x0 * x1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* t = y0 * y1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = z0 * z1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); + /* t = w1 + t */ + spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} + +/** + * Emit cross product. See emit_ADD for comments. + */ +static boolean +emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + spe_comment(gen->f, -4, "XPD:"); + + int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + int tmp_reg = get_itemp(gen); + + /* t = z0 * y1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = y0 * z1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) { + store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]); + } + + s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = x0 * z1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + /* t = z0 * x1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) { + store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]); + } + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + /* t = y0 * x1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* t = x0 * y1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) { + store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return true; +} /** * Emit set-if-greater-than. @@ -323,6 +751,8 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + spe_comment(gen->f, -4, "SGT:"); + for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); @@ -331,17 +761,473 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) /* d = (s1 > s2) */ spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); -#if DISASSEM - printf("fcgt\tr%d, r%d, r%d\n", d_reg, s1_reg, s2_reg); -#endif /* convert d from 0x0/0xffffffff to 0.0/1.0 */ /* d = d & one_reg */ spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); -#if DISASSEM - printf("and\tr%d, r%d, r%d\n", d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_less-then. See emit_SGT for comments. + */ +static boolean +emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SLT:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 < s2) */ + spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_greater-then-or-equal. See emit_SGT for comments. + */ +static boolean +emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SGE:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 >= s2) */ + spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = ~d & one_reg */ + spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_less-then-or-equal. See emit_SGT for comments. + */ +static boolean +emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SLE:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 <= s2) */ + spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = ~d & one_reg */ + spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_equal. See emit_SGT for comments. + */ +static boolean +emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SEQ:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 == s2) */ + spe_fceq(gen->f, d_reg, s1_reg, s2_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_not_equal. See emit_SGT for comments. + */ +static boolean +emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SNE:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 != s2) */ + spe_fceq(gen->f, d_reg, s1_reg, s2_reg); + spe_nor(gen->f, d_reg, d_reg, d_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit compare. See emit_SGT for comments. + */ +static boolean +emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "CMP:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int zero_reg = get_itemp(gen); + + spe_xor(gen->f, zero_reg, zero_reg, zero_reg); + + /* d = (s1 < 0) ? s2 : s3 */ + spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); + spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit trunc. + * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "TRUNC:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* Convert float to int */ + spe_cflts(gen->f, d_reg, s1_reg, 0); + + /* Convert int to float */ + spe_csflt(gen->f, d_reg, d_reg, 0); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit floor. + * If negative int subtract one + * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "FLR:"); + + int zero_reg = get_itemp(gen); + spe_xor(gen->f, zero_reg, zero_reg, zero_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int tmp_reg = get_itemp(gen); + + /* If negative, subtract 1.0 */ + spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); + spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg); + spe_fs(gen->f, d_reg, s1_reg, tmp_reg); + + /* Convert float to int */ + spe_cflts(gen->f, d_reg, d_reg, 0); + + /* Convert int to float */ + spe_csflt(gen->f, d_reg, d_reg, 0); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit frac. + * Input - FLR(Input) + */ +static boolean +emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "FLR:"); + + int zero_reg = get_itemp(gen); + spe_xor(gen->f, zero_reg, zero_reg, zero_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int tmp_reg = get_itemp(gen); + + /* If negative, subtract 1.0 */ + spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); + spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg); + spe_fs(gen->f, d_reg, s1_reg, tmp_reg); + + /* Convert float to int */ + spe_cflts(gen->f, d_reg, d_reg, 0); + + /* Convert int to float */ + spe_csflt(gen->f, d_reg, d_reg, 0); + + /* d = s1 - FLR(s1) */ + spe_fs(gen->f, d_reg, s1_reg, d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + + +#if 0 +static void +print_functions(struct cell_context *cell) +{ + struct cell_spu_function_info *funcs = &cell->spu_functions; + uint i; + for (i = 0; i < funcs->num; i++) { + printf("SPU func %u: %s at %u\n", + i, funcs->names[i], funcs->addrs[i]); + } +} #endif + +/** + * Emit code to call a SPU function. + * Used to implement instructions like SIN/COS/POW/TEX/etc. + */ +static boolean +emit_function_call(struct codegen *gen, + const struct tgsi_full_instruction *inst, + char *funcname, uint num_args) +{ + const struct cell_spu_function_info *funcs = &gen->cell->spu_functions; + char comment[100]; + uint addr; + int ch; + + /* XXX temporary value */ + const int frameSize = 64; /* stack frame (activation record) size */ + + assert(num_args <= 3); + + /* lookup function address */ + { + uint i; + addr = 0; + for (i = 0; i < funcs->num; i++) { + if (strcmp(funcs->names[i], funcname) == 0) { + addr = funcs->addrs[i]; + } + } + assert(addr && "spu function not found"); + } + + addr /= 4; /* discard 2 least significant bits */ + + snprintf(comment, sizeof(comment), "CALL %s:", funcname); + spe_comment(gen->f, -4, comment); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int s_regs[3]; + uint a; + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + } + + /* Basically: + * save registers on stack + * move parameters to registers 3, 4, 5... + * call function + * save return value (reg 3) + * restore registers from stack + */ + + /* XXX hack: load first function param */ + spe_move(gen->f, 3, s_regs[0]); + + /* save $lr on stack # stqd $lr,16($sp) */ + spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + /* save stack pointer # stqd $sp,-frameSize($sp) */ + spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize); + + /* XXX save registers to stack here */ + + /* adjust stack pointer # ai $sp,$sp,-frameSize */ + spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize); + + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); + + /* restore stack pointer # ai $sp,$sp,frameSize */ + spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, frameSize); + + /* XXX restore registers from stack here */ + + /* restore $lr # lqd $lr,16($sp) */ + spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + + /* XXX hack: save function's return value */ + spe_move(gen->f, d_reg, 3); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + + +/** + * Emit max. See emit_SGT for comments. + */ +static boolean +emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "MAX:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 > s2) ? s1 : s2 */ + spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); + spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit max. See emit_SGT for comments. + */ +static boolean +emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "MIN:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s2 > s1) ? s1 : s2 */ + spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); + spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); free_itemps(gen); } @@ -350,6 +1236,97 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) return true; } +static boolean +emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int channel = 0; + const int exec_reg = get_exec_mask_reg(gen); + + spe_comment(gen->f, -4, "IF:"); + + /* update execution mask with the predicate register */ + int tmp_reg = get_itemp(gen); + int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]); + + /* tmp = (s1_reg == 0) */ + spe_ceqi(gen->f, tmp_reg, s1_reg, 0); + /* tmp = !tmp */ + spe_complement(gen->f, tmp_reg, tmp_reg); + /* exec_mask = exec_mask & tmp */ + spe_and(gen->f, exec_reg, exec_reg, tmp_reg); + + gen->if_nesting++; + + free_itemps(gen); + + return true; +} + + +static boolean +emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int exec_reg = get_exec_mask_reg(gen); + + spe_comment(gen->f, -4, "ELSE:"); + + /* exec_mask = !exec_mask */ + spe_complement(gen->f, exec_reg, exec_reg); + + return true; +} + + +static boolean +emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int exec_reg = get_exec_mask_reg(gen); + + spe_comment(gen->f, -4, "ENDIF:"); + + /* XXX todo: pop execution mask */ + + spe_load_int(gen->f, exec_reg, ~0x0); + + gen->if_nesting--; + return true; +} + + +static boolean +emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst, + boolean ddx) +{ + int ch; + + spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + int t1_reg = get_itemp(gen); + int t2_reg = get_itemp(gen); + + spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */ + if (ddx) { + spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */ + } + else { + spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */ + } + spe_fs(gen->f, d_reg, t2_reg, t1_reg); + + free_itemps(gen); + } + } + + return true; +} + + + /** * Emit END instruction. @@ -361,11 +1338,9 @@ emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_END(struct codegen *gen) { + spe_comment(gen->f, -4, "END:"); /* return from function call */ spe_bi(gen->f, SPE_REG_RA, 0, 0); -#if DISASSEM - printf("bi\trRA\n"); -#endif return true; } @@ -379,19 +1354,83 @@ emit_instruction(struct codegen *gen, { switch (inst->Instruction.Opcode) { case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: return emit_MOV(gen, inst); case TGSI_OPCODE_MUL: return emit_MUL(gen, inst); case TGSI_OPCODE_ADD: return emit_ADD(gen, inst); + case TGSI_OPCODE_SUB: + return emit_SUB(gen, inst); + case TGSI_OPCODE_MAD: + return emit_MAD(gen, inst); + case TGSI_OPCODE_LERP: + return emit_LERP(gen, inst); + case TGSI_OPCODE_DP3: + return emit_DP3(gen, inst); + case TGSI_OPCODE_DP4: + return emit_DP4(gen, inst); + case TGSI_OPCODE_DPH: + return emit_DPH(gen, inst); + case TGSI_OPCODE_XPD: + return emit_XPD(gen, inst); + case TGSI_OPCODE_RCP: + return emit_RCP(gen, inst); + case TGSI_OPCODE_RSQ: + return emit_RSQ(gen, inst); + case TGSI_OPCODE_ABS: + return emit_ABS(gen, inst); case TGSI_OPCODE_SGT: return emit_SGT(gen, inst); + case TGSI_OPCODE_SLT: + return emit_SLT(gen, inst); + case TGSI_OPCODE_SGE: + return emit_SGE(gen, inst); + case TGSI_OPCODE_SLE: + return emit_SLE(gen, inst); + case TGSI_OPCODE_SEQ: + return emit_SEQ(gen, inst); + case TGSI_OPCODE_SNE: + return emit_SNE(gen, inst); + case TGSI_OPCODE_CMP: + return emit_CMP(gen, inst); + case TGSI_OPCODE_MAX: + return emit_MAX(gen, inst); + case TGSI_OPCODE_MIN: + return emit_MIN(gen, inst); + case TGSI_OPCODE_TRUNC: + return emit_TRUNC(gen, inst); + case TGSI_OPCODE_FLR: + return emit_FLR(gen, inst); + case TGSI_OPCODE_FRC: + return emit_FRC(gen, inst); case TGSI_OPCODE_END: return emit_END(gen); + case TGSI_OPCODE_COS: + return emit_function_call(gen, inst, "spu_cos", 1); + case TGSI_OPCODE_SIN: + return emit_function_call(gen, inst, "spu_sin", 1); + case TGSI_OPCODE_POW: + return emit_function_call(gen, inst, "spu_pow", 2); + + case TGSI_OPCODE_IF: + return emit_IF(gen, inst); + case TGSI_OPCODE_ELSE: + return emit_ELSE(gen, inst); + case TGSI_OPCODE_ENDIF: + return emit_ENDIF(gen, inst); + + case TGSI_OPCODE_DDX: + return emit_DDX_DDY(gen, inst, true); + case TGSI_OPCODE_DDY: + return emit_DDX_DDY(gen, inst, false); + /* XXX lots more cases to do... */ default: + fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n", + inst->Instruction.Opcode); return false; } @@ -401,45 +1440,88 @@ emit_instruction(struct codegen *gen, /** + * Emit code for a TGSI immediate value (vector of four floats). + * This involves register allocation and initialization. + * XXX the initialization should be done by a "prepare" stage, not + * per quad execution! + */ +static boolean +emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) +{ + int ch; + + assert(gen->num_imm < MAX_TEMPS); + + spe_comment(gen->f, -4, "IMMEDIATE:"); + + for (ch = 0; ch < 4; ch++) { + float val = immed->u.ImmediateFloat32[ch].Float; + int reg = spe_allocate_available_register(gen->f); + + if (reg < 0) + return false; + + /* update immediate map */ + gen->imm_regs[gen->num_imm][ch] = reg; + + /* emit initializer instruction */ + spe_load_float(gen->f, reg, val); + } + + gen->num_imm++; + + return true; +} + + + +/** * Emit "code" for a TGSI declaration. * We only care about TGSI TEMPORARY register declarations at this time. * For each TGSI TEMPORARY we allocate four SPE registers. */ -static void -emit_declaration(struct codegen *gen, const struct tgsi_full_declaration *decl) +static boolean +emit_declaration(struct cell_context *cell, + struct codegen *gen, const struct tgsi_full_declaration *decl) { int i, ch; switch (decl->Declaration.File) { case TGSI_FILE_TEMPORARY: -#if DISASSEM - printf("Declare temp reg %d .. %d\n", - decl->DeclarationRange.First, - decl->DeclarationRange.Last); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf("Declare temp reg %d .. %d\n", + decl->DeclarationRange.First, + decl->DeclarationRange.Last); + } + for (i = decl->DeclarationRange.First; i <= decl->DeclarationRange.Last; i++) { + assert(i < MAX_TEMPS); for (ch = 0; ch < 4; ch++) { gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); + if (gen->temp_regs[i][ch] < 0) + return false; /* out of regs */ } /* XXX if we run out of SPE registers, we need to spill * to SPU memory. someday... */ -#if DISASSEM - printf(" SPE regs: %d %d %d %d\n", - gen->temp_regs[i][0], - gen->temp_regs[i][1], - gen->temp_regs[i][2], - gen->temp_regs[i][3]); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf(" SPE regs: %d %d %d %d\n", + gen->temp_regs[i][0], + gen->temp_regs[i][1], + gen->temp_regs[i][2], + gen->temp_regs[i][3]); + } } break; default: ; /* ignore */ } + + return true; } @@ -460,6 +1542,7 @@ cell_gen_fragment_program(struct cell_context *cell, struct codegen gen; memset(&gen, 0, sizeof(gen)); + gen.cell = cell; gen.f = f; /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ @@ -472,10 +1555,12 @@ cell_gen_fragment_program(struct cell_context *cell, spe_allocate_register(f, gen.outputs_reg); spe_allocate_register(f, gen.constants_reg); -#if DISASSEM - printf("Begin %s\n", __FUNCTION__); - tgsi_dump(tokens, 0); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, true); + spe_indent(f, 8); + printf("Begin %s\n", __FUNCTION__); + tgsi_dump(tokens, 0); + } tgsi_parse_init(&parse, tokens); @@ -484,25 +1569,22 @@ cell_gen_fragment_program(struct cell_context *cell, switch (parse.FullToken.Token.Type) { case TGSI_TOKEN_TYPE_IMMEDIATE: -#if 0 - if (!note_immediate(&gen, &parse.FullToken.FullImmediate )) - goto fail; -#endif + if (!emit_immediate(&gen, &parse.FullToken.FullImmediate)) + gen.error = true; break; case TGSI_TOKEN_TYPE_DECLARATION: - emit_declaration(&gen, &parse.FullToken.FullDeclaration); + if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration)) + gen.error = true; break; case TGSI_TOKEN_TYPE_INSTRUCTION: - if (!emit_instruction(&gen, &parse.FullToken.FullInstruction )) { + if (!emit_instruction(&gen, &parse.FullToken.FullInstruction)) gen.error = true; - } break; default: assert(0); - } } @@ -512,10 +1594,10 @@ cell_gen_fragment_program(struct cell_context *cell, return emit_END(&gen); } -#if DISASSEM - printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); - printf("End %s\n", __FUNCTION__); -#endif + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); + printf("End %s\n", __FUNCTION__); + } tgsi_parse_free( &parse ); diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c index 06219d4e980..653afc235df 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -60,6 +60,9 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, struct spe_function *f, int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg) { + /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_ + * quantities. This only makes a difference for 32-bit Z values though. + */ ASSERT(dsa->depth.enabled); switch (dsa->depth.func) { @@ -79,28 +82,28 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa, case PIPE_FUNC_GREATER: /* zmask = (ifragZ > ref) */ - spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); /* mask = (mask & zmask) */ spe_and(f, mask_reg, mask_reg, zmask_reg); break; case PIPE_FUNC_LESS: /* zmask = (ref > ifragZ) */ - spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); /* mask = (mask & zmask) */ spe_and(f, mask_reg, mask_reg, zmask_reg); break; case PIPE_FUNC_LEQUAL: /* zmask = (ifragZ > ref) */ - spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); /* mask = (mask & ~zmask) */ spe_andc(f, mask_reg, mask_reg, zmask_reg); break; case PIPE_FUNC_GEQUAL: /* zmask = (ref > ifragZ) */ - spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); /* mask = (mask & ~zmask) */ spe_andc(f, mask_reg, mask_reg, zmask_reg); break; @@ -229,7 +232,27 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, spe_release_register(f, amask_reg); } +/* This pair of functions is used inline to allocate and deallocate + * optional constant registers. Once a constant is discovered to be + * needed, we will likely need it again, so we don't want to deallocate + * it and have to allocate and load it again unnecessarily. + */ +static inline void +setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value) +{ + if (*is_already_set) return; + *r = spe_allocate_available_register(f); + spe_load_float(f, *r, value); + *is_already_set = true; +} +static inline void +release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r) +{ + if (!*is_already_set) return; + spe_release_register(f, r); + *is_already_set = false; +} /** * Generate SPE code to implement the given blend mode for a quad of pixels. @@ -242,6 +265,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, */ static void gen_blend(const struct pipe_blend_state *blend, + const struct pipe_blend_color *blend_color, struct spe_function *f, enum pipe_format color_format, int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg, @@ -262,10 +286,17 @@ gen_blend(const struct pipe_blend_state *blend, int fbB_reg = spe_allocate_available_register(f); int fbA_reg = spe_allocate_available_register(f); - int one_reg = spe_allocate_available_register(f); int tmp_reg = spe_allocate_available_register(f); - boolean one_reg_set = false; /* avoid setting one_reg more than once */ + /* Optional constant registers we might or might not end up using; + * if we do use them, make sure we only allocate them once by + * keeping a flag on each one. + */ + boolean one_reg_set = false; + unsigned int one_reg; + boolean constR_reg_set = false, constG_reg_set = false, + constB_reg_set = false, constA_reg_set = false; + unsigned int constR_reg, constG_reg, constB_reg, constA_reg; ASSERT(blend->blend_enable); @@ -346,127 +377,449 @@ gen_blend(const struct pipe_blend_state *blend, spe_release_register(f, mask_reg); } - /* - * Compute Src RGB terms + * Compute Src RGB terms. We're actually looking for the value + * of (the appropriate RGB factors) * (the incoming source RGB color), + * because in some cases (like PIPE_BLENDFACTOR_ONE and + * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math. */ switch (blend->rgb_src_factor) { case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (R,G,B) */ spe_move(f, term1R_reg, fragR_reg); spe_move(f, term1G_reg, fragG_reg); spe_move(f, term1B_reg, fragB_reg); break; case PIPE_BLENDFACTOR_ZERO: - spe_zero(f, term1R_reg); - spe_zero(f, term1G_reg); - spe_zero(f, term1B_reg); + /* factors = (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term1R_reg, 0.0f); + spe_load_float(f, term1G_reg, 0.0f); + spe_load_float(f, term1B_reg, 0.0f); break; case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*R, G*G, B*B) */ spe_fm(f, term1R_reg, fragR_reg, fragR_reg); spe_fm(f, term1G_reg, fragG_reg, fragG_reg); spe_fm(f, term1B_reg, fragB_reg, fragB_reg); break; case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (R*A, G*A, B*A) */ spe_fm(f, term1R_reg, fragR_reg, fragA_reg); spe_fm(f, term1G_reg, fragG_reg, fragA_reg); spe_fm(f, term1B_reg, fragB_reg, fragA_reg); break; - /* XXX more cases */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) + * or in other words term = (R-R*R, G-G*G, B-B*B) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */ + spe_fm(f, term1R_reg, fragR_reg, fbR_reg); + spe_fm(f, term1G_reg, fragG_reg, fbG_reg); + spe_fm(f, term1B_reg, fragB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) + * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) + * or term = (R-R*A,G-G*A,B-B*A) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */ + spe_fm(f, term1R_reg, fragR_reg, fbA_reg); + spe_fm(f, term1G_reg, fragG_reg, fbA_reg); + spe_fm(f, term1B_reg, fragB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) + * or term = (R-R*Afb,G-G*Afb,b-B*Afb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */ + spe_fm(f, term1R_reg, fragR_reg, constR_reg); + spe_fm(f, term1G_reg, fragG_reg, constG_reg); + spe_fm(f, term1B_reg, fragB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need the optional constant alpha register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */ + spe_fm(f, term1R_reg, fragR_reg, constA_reg); + spe_fm(f, term1G_reg, fragG_reg, constA_reg); + spe_fm(f, term1B_reg, fragB_reg, constA_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) + * or term = (R-R*Rc, G-G*Gc, B-B*Bc) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac)) + * or term = (R-R*Ac,G-G*Ac,B-B*Ac) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + /* We'll need the optional {1,1,1,1} register */ + setup_const_register(f, &one_reg_set, &one_reg, 1.0f); + /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so + * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb)) + * We could expand the term (as a*min(b,c) == min(a*b,a*c) + * as long as a is positive), but then we'd have to do three + * spe_float_min() functions instead of one, so this is simpler. + */ + /* tmp = 1 - Afb */ + spe_fs(f, tmp_reg, one_reg, fbA_reg); + /* tmp = min(A,tmp) */ + spe_float_min(f, tmp_reg, fragA_reg, tmp_reg); + /* term = R*tmp */ + spe_fm(f, term1R_reg, fragR_reg, tmp_reg); + spe_fm(f, term1G_reg, fragG_reg, tmp_reg); + spe_fm(f, term1B_reg, fragB_reg, tmp_reg); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: ASSERT(0); } /* - * Compute Src Alpha term + * Compute Src Alpha term. Like the above, we're looking for + * the full term A*factor, not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. */ switch (blend->alpha_src_factor) { + case PIPE_BLENDFACTOR_ZERO: + /* factor = 0, so term = 0 */ + spe_load_float(f, term1A_reg, 0.0f); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */ case PIPE_BLENDFACTOR_ONE: + /* factor = 1, so term = A */ spe_move(f, term1A_reg, fragA_reg); break; + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factor = A, so term = A*A */ spe_fm(f, term1A_reg, fragA_reg, fragA_reg); break; case PIPE_BLENDFACTOR_SRC_ALPHA: spe_fm(f, term1A_reg, fragA_reg, fragA_reg); break; - /* XXX more cases */ + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factor = 1-A, so term = A*(1-A) = A-A*A */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_DST_COLOR: + /* factor = Afb, so term = A*Afb */ + spe_fm(f, term1A_reg, fragA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = Ac, so term = A*Ac */ + spe_fm(f, term1A_reg, fragA_reg, constA_reg); + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: default: ASSERT(0); } /* - * Compute Dest RGB terms + * Compute Dest RGB term. Like the above, we're looking for + * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. */ switch (blend->rgb_dst_factor) { case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */ spe_move(f, term2R_reg, fbR_reg); spe_move(f, term2G_reg, fbG_reg); spe_move(f, term2B_reg, fbB_reg); break; case PIPE_BLENDFACTOR_ZERO: - spe_zero(f, term2R_reg); - spe_zero(f, term2G_reg); - spe_zero(f, term2B_reg); + /* factor s= (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term2R_reg, 0.0f); + spe_load_float(f, term2G_reg, 0.0f); + spe_load_float(f, term2B_reg, 0.0f); break; case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */ spe_fm(f, term2R_reg, fbR_reg, fragR_reg); spe_fm(f, term2G_reg, fbG_reg, fragG_reg); spe_fm(f, term2B_reg, fbB_reg, fragB_reg); break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) + * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg); + break; case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */ spe_fm(f, term2R_reg, fbR_reg, fragA_reg); spe_fm(f, term2G_reg, fbG_reg, fragA_reg); spe_fm(f, term2B_reg, fbB_reg, fragA_reg); break; case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - /* one = {1.0, 1.0, 1.0, 1.0} */ - if (!one_reg_set) { - spe_load_float(f, one_reg, 1.0f); - one_reg_set = true; - } - /* tmp = one - fragA */ - spe_fs(f, tmp_reg, one_reg, fragA_reg); - /* term = fb * tmp */ - spe_fm(f, term2R_reg, fbR_reg, tmp_reg); - spe_fm(f, term2G_reg, fbG_reg, tmp_reg); - spe_fm(f, term2B_reg, fbB_reg, tmp_reg); - break; - /* XXX more cases */ + /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */ + spe_fm(f, term2R_reg, fbR_reg, fbR_reg); + spe_fm(f, term2G_reg, fbG_reg, fbG_reg); + spe_fm(f, term2B_reg, fbB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb)) + * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */ + spe_fm(f, term2R_reg, fbR_reg, fbA_reg); + spe_fm(f, term2G_reg, fbG_reg, fbA_reg); + spe_fm(f, term2B_reg, fbB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) + * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */ + spe_fm(f, term2R_reg, fbR_reg, constR_reg); + spe_fm(f, term2G_reg, fbG_reg, constG_reg); + spe_fm(f, term2B_reg, fbB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need the optional constant alpha register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */ + spe_fm(f, term2R_reg, fbR_reg, constA_reg); + spe_fm(f, term2G_reg, fbG_reg, constA_reg); + spe_fm(f, term2B_reg, fbB_reg, constA_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) + * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac)) + * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */ + ASSERT(0); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: ASSERT(0); } /* - * Compute Dest Alpha term + * Compute Dest Alpha term. Like the above, we're looking for + * the full term Afb*factor, not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. */ switch (blend->alpha_dst_factor) { case PIPE_BLENDFACTOR_ONE: + /* factor = 1, so term = Afb */ spe_move(f, term2A_reg, fbA_reg); break; case PIPE_BLENDFACTOR_ZERO: - spe_zero(f, term2A_reg); + /* factor = 0, so term = 0 */ + spe_load_float(f, term2A_reg, 0.0f); break; - case PIPE_BLENDFACTOR_SRC_ALPHA: + + case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factor = A, so term = Afb*A */ spe_fm(f, term2A_reg, fbA_reg, fragA_reg); break; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - /* one = {1.0, 1.0, 1.0, 1.0} */ - if (!one_reg_set) { - spe_load_float(f, one_reg, 1.0f); - one_reg_set = true; - } - /* tmp = one - fragA */ - spe_fs(f, tmp_reg, one_reg, fragA_reg); - /* termA = fbA * tmp */ - spe_fm(f, term2A_reg, fbA_reg, tmp_reg); + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_DST_COLOR: + /* factor = Afb, so term = Afb*Afb */ + spe_fm(f, term2A_reg, fbA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = Ac, so term = Afb*Ac */ + spe_fm(f, term2A_reg, fbA_reg, constA_reg); break; - /* XXX more cases */ + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */ + ASSERT(0); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: default: ASSERT(0); } /* - * Combine Src/Dest RGB terms + * Combine Src/Dest RGB terms as per the blend equation. */ switch (blend->rgb_func) { case PIPE_BLEND_ADD: @@ -479,7 +832,21 @@ gen_blend(const struct pipe_blend_state *blend, spe_fs(f, fragG_reg, term1G_reg, term2G_reg); spe_fs(f, fragB_reg, term1B_reg, term2B_reg); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragR_reg, term2R_reg, term1R_reg); + spe_fs(f, fragG_reg, term2G_reg, term1G_reg); + spe_fs(f, fragB_reg, term2B_reg, term1B_reg); + break; + case PIPE_BLEND_MIN: + spe_float_min(f, fragR_reg, term1R_reg, term2R_reg); + spe_float_min(f, fragG_reg, term1G_reg, term2G_reg); + spe_float_min(f, fragB_reg, term1B_reg, term2B_reg); + break; + case PIPE_BLEND_MAX: + spe_float_max(f, fragR_reg, term1R_reg, term2R_reg); + spe_float_max(f, fragG_reg, term1G_reg, term2G_reg); + spe_float_max(f, fragB_reg, term1B_reg, term2B_reg); + break; default: ASSERT(0); } @@ -494,7 +861,15 @@ gen_blend(const struct pipe_blend_state *blend, case PIPE_BLEND_SUBTRACT: spe_fs(f, fragA_reg, term1A_reg, term2A_reg); break; - /* XXX more cases */ + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragA_reg, term2A_reg, term1A_reg); + break; + case PIPE_BLEND_MIN: + spe_float_min(f, fragA_reg, term1A_reg, term2A_reg); + break; + case PIPE_BLEND_MAX: + spe_float_max(f, fragA_reg, term1A_reg, term2A_reg); + break; default: ASSERT(0); } @@ -514,8 +889,14 @@ gen_blend(const struct pipe_blend_state *blend, spe_release_register(f, fbB_reg); spe_release_register(f, fbA_reg); - spe_release_register(f, one_reg); spe_release_register(f, tmp_reg); + + /* Free any optional registers that actually got used */ + release_const_register(f, &one_reg_set, one_reg); + release_const_register(f, &constR_reg_set, constR_reg); + release_const_register(f, &constG_reg_set, constG_reg); + release_const_register(f, &constB_reg_set, constB_reg); + release_const_register(f, &constA_reg_set, constA_reg); } @@ -524,24 +905,74 @@ gen_logicop(const struct pipe_blend_state *blend, struct spe_function *f, int fragRGBA_reg, int fbRGBA_reg) { - /* XXX to-do */ - /* operate on 32-bit packed pixels, not float colors */ -} - - -static void -gen_colormask(uint colormask, - struct spe_function *f, - int fragRGBA_reg, int fbRGBA_reg) -{ - /* XXX to-do */ - /* operate on 32-bit packed pixels, not float colors */ + /* We've got four 32-bit RGBA packed pixels in each of + * fragRGBA_reg and fbRGBA_reg, not sets of floating-point + * reds, greens, blues, and alphas. + * */ + ASSERT(blend->logicop_enable); + + switch(blend->logicop_func) { + case PIPE_LOGICOP_CLEAR: /* 0 */ + spe_zero(f, fragRGBA_reg); + break; + case PIPE_LOGICOP_NOR: /* ~(s | d) */ + spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */ + /* andc R, A, B computes R = A & ~B */ + spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_COPY_INVERTED: /* ~s */ + spe_complement(f, fragRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */ + /* andc R, A, B computes R = A & ~B */ + spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_INVERT: /* ~d */ + /* Note that (A nor A) == ~(A|A) == ~A */ + spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_XOR: /* s ^ d */ + spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_NAND: /* ~(s & d) */ + spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_AND: /* s & d */ + spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */ + spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + spe_complement(f, fragRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_NOOP: /* d */ + spe_move(f, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */ + /* orc R, A, B computes R = A | ~B */ + spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_COPY: /* s */ + break; + case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */ + /* orc R, A, B computes R = A | ~B */ + spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_OR: /* s | d */ + spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_SET: /* 1 */ + spe_load_int(f, fragRGBA_reg, 0xffffffff); + break; + default: + ASSERT(0); + } } - /** - * Generate code to pack a quad of float colors into a four 32-bit integers. + * Generate code to pack a quad of float colors into four 32-bit integers. * * \param f SPE function to append instruction onto. * \param color_format the dest color packing format @@ -557,13 +988,16 @@ gen_pack_colors(struct spe_function *f, int r_reg, int g_reg, int b_reg, int a_reg, int rgba_reg) { + int rg_reg = spe_allocate_available_register(f); + int ba_reg = spe_allocate_available_register(f); + /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ spe_cfltu(f, r_reg, r_reg, 32); spe_cfltu(f, g_reg, g_reg, 32); spe_cfltu(f, b_reg, b_reg, 32); spe_cfltu(f, a_reg, a_reg, 32); - /* Shift the most significant bytes to least the significant positions. + /* Shift the most significant bytes to the least significant positions. * I.e.: reg = reg >> 24 */ spe_rotmi(f, r_reg, r_reg, -24); @@ -595,13 +1029,93 @@ gen_pack_colors(struct spe_function *f, * OR-ing all those together gives us four packed colors: * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} */ - spe_or(f, rgba_reg, r_reg, g_reg); - spe_or(f, rgba_reg, rgba_reg, b_reg); - spe_or(f, rgba_reg, rgba_reg, a_reg); + spe_or(f, rg_reg, r_reg, g_reg); + spe_or(f, ba_reg, a_reg, b_reg); + spe_or(f, rgba_reg, rg_reg, ba_reg); + + spe_release_register(f, rg_reg); + spe_release_register(f, ba_reg); } +static void +gen_colormask(struct spe_function *f, + uint colormask, + enum pipe_format color_format, + int fragRGBA_reg, int fbRGBA_reg) +{ + /* We've got four 32-bit RGBA packed pixels in each of + * fragRGBA_reg and fbRGBA_reg, not sets of floating-point + * reds, greens, blues, and alphas. Further, the pixels + * are packed according to the given color format, not + * necessarily RGBA... + */ + unsigned int r_mask; + unsigned int g_mask; + unsigned int b_mask; + unsigned int a_mask; + + /* Calculate exactly where the bits for any particular color + * end up, so we can mask them correctly. + */ + switch(color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + /* ARGB */ + a_mask = 0xff000000; + r_mask = 0x00ff0000; + g_mask = 0x0000ff00; + b_mask = 0x000000ff; + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + /* BGRA */ + b_mask = 0xff000000; + g_mask = 0x00ff0000; + r_mask = 0x0000ff00; + a_mask = 0x000000ff; + break; + default: + ASSERT(0); + } + + /* For each R, G, B, and A component we're supposed to mask out, + * clear its bits. Then our mask operation later will work + * as expected. + */ + if (!(colormask & PIPE_MASK_R)) { + r_mask = 0; + } + if (!(colormask & PIPE_MASK_G)) { + g_mask = 0; + } + if (!(colormask & PIPE_MASK_B)) { + b_mask = 0; + } + if (!(colormask & PIPE_MASK_A)) { + a_mask = 0; + } + + /* Get a temporary register to hold the mask that will be applied to the fragment */ + int colormask_reg = spe_allocate_available_register(f); + /* The actual mask we're going to use is an OR of the remaining R, G, B, and A + * masks. Load the result value into our temporary register. + */ + spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask); + + /* Use the mask register to select between the fragment color + * values and the frame buffer color values. Wherever the + * mask has a 0 bit, the current frame buffer color should override + * the fragment color. Wherever the mask has a 1 bit, the + * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM) + * instruction will select bits from its first operand rA wherever the + * the mask bits rM are 0, and from its second operand rB wherever the + * mask bits rM are 1. That means that the frame buffer color is the + * first operand, and the fragment color the second. + */ + spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg); + /* Release the temporary register and we're done */ + spe_release_register(f, colormask_reg); +} /** * Generate SPE code to implement the fragment operations (alpha test, @@ -626,9 +1140,9 @@ gen_pack_colors(struct spe_function *f, void cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) { - const struct pipe_depth_stencil_alpha_state *dsa = - &cell->depth_stencil->base; - const struct pipe_blend_state *blend = &cell->blend->base; + const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil; + const struct pipe_blend_state *blend = cell->blend; + const struct pipe_blend_color *blend_color = &cell->blend_color; const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ @@ -652,6 +1166,13 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */ spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, true); + spe_indent(f, 8); + spe_comment(f, -4, "Begin per-fragment ops"); + } + spe_allocate_register(f, x_reg); spe_allocate_register(f, y_reg); spe_allocate_register(f, color_tile_reg); @@ -674,8 +1195,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) ASSERT(TILE_SIZE == 32); - spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */ spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */ + spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */ spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */ spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */ spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */ @@ -710,33 +1231,49 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_release_register(f, mask_reg); /* OK, fbZ_reg has four 24-bit Z values now */ } + else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + spe_rotmi(f, fbZ_reg, fbZS_reg, -8); /* fbZ = fbZS >> 8 */ + /* OK, fbZ_reg has four 24-bit Z values now */ + } + else if (zs_format == PIPE_FORMAT_Z32_UNORM) { + spe_move(f, fbZ_reg, fbZS_reg); + /* OK, fbZ_reg has four 32-bit Z values now */ + } + else if (zs_format == PIPE_FORMAT_Z16_UNORM) { + spe_move(f, fbZ_reg, fbZS_reg); + /* OK, fbZ_reg has four 16-bit Z values now */ + } else { - /* XXX handle other z/stencil formats */ - ASSERT(0); + ASSERT(0); /* invalid format */ } - /* Convert fragZ values from float[4] to uint[4] */ + /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */ if (zs_format == PIPE_FORMAT_S8Z24_UNORM || zs_format == PIPE_FORMAT_X8Z24_UNORM || zs_format == PIPE_FORMAT_Z24S8_UNORM || zs_format == PIPE_FORMAT_Z24X8_UNORM) { - /* 24-bit Z values */ - int scale_reg = spe_allocate_available_register(f); - - /* scale_reg[0,1,2,3] = float(2^24-1) */ - spe_load_float(f, scale_reg, (float) 0xffffff); - - /* XXX these two instructions might be combined */ - spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */ - spe_cfltu(f, fragZ_reg, fragZ_reg, 0); /* fragZ = (int) fragZ */ - - spe_release_register(f, scale_reg); + /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + /* fragZ = fragZ >> 8 */ + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); } - else { - /* XXX handle 16-bit Z format */ - ASSERT(0); + else if (zs_format == PIPE_FORMAT_Z32_UNORM) { + /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + } + else if (zs_format == PIPE_FORMAT_Z16_UNORM) { + /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + /* fragZ = fragZ >> 16 */ + spe_rotmi(f, fragZ_reg, fragZ_reg, -16); } } + else { + /* no Z test, but set Z to zero so we don't OR-in garbage below */ + spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */ + } + if (dsa->stencil[0].enabled) { /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */ @@ -751,7 +1288,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) ASSERT(0); } } - + else { + /* no stencil test, but set to zero so we don't OR-in garbage below */ + spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */ + } if (dsa->stencil[0].enabled) { /* XXX this may involve depth testing too */ @@ -779,22 +1319,22 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ } - else if (zs_format == PIPE_FORMAT_S8Z24_UNORM || - zs_format == PIPE_FORMAT_X8Z24_UNORM) { - /* XXX to do */ - ASSERT(0); + else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z32_UNORM) { + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_Z16_UNORM) { - /* XXX to do */ - ASSERT(0); + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ } else if (zs_format == PIPE_FORMAT_S8_UNORM) { - /* XXX to do */ - ASSERT(0); + ASSERT(0); /* XXX to do */ } else { - /* bad zs_format */ - ASSERT(0); + ASSERT(0); /* bad zs_format */ } /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */ @@ -816,7 +1356,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) if (blend->blend_enable) { - gen_blend(blend, f, color_format, + gen_blend(blend, blend_color, f, color_format, fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg); } @@ -837,8 +1377,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) gen_logicop(blend, f, rgba_reg, fbRGBA_reg); } - if (blend->colormask != 0xf) { - gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg); + if (blend->colormask != PIPE_MASK_RGBA) { + gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg); } @@ -862,9 +1402,11 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f) spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */ - spe_release_register(f, fbRGBA_reg); spe_release_register(f, fbZS_reg); spe_release_register(f, quad_offset_reg); -} + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_comment(f, -4, "End per-fragment ops"); + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c index 475c6ef0ce6..8c55b8e0933 100644 --- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c @@ -37,7 +37,6 @@ #include "cell_flush.h" #include "cell_state.h" #include "cell_texture.h" -#include "cell_state_per_fragment.h" @@ -45,24 +44,18 @@ static void * cell_create_blend_state(struct pipe_context *pipe, const struct pipe_blend_state *blend) { - struct cell_blend_state *cb = MALLOC(sizeof(struct cell_blend_state)); - - (void) memcpy(cb, blend, sizeof(*blend)); -#if 0 - cell_generate_alpha_blend(cb); -#endif - return cb; + return mem_dup(blend, sizeof(*blend)); } static void -cell_bind_blend_state(struct pipe_context *pipe, void *state) +cell_bind_blend_state(struct pipe_context *pipe, void *blend) { struct cell_context *cell = cell_context(pipe); draw_flush(cell->draw); - cell->blend = (struct cell_blend_state *) state; + cell->blend = (struct pipe_blend_state *) blend; cell->dirty |= CELL_NEW_BLEND; } @@ -70,10 +63,7 @@ cell_bind_blend_state(struct pipe_context *pipe, void *state) static void cell_delete_blend_state(struct pipe_context *pipe, void *blend) { - struct cell_blend_state *cb = (struct cell_blend_state *) blend; - - spe_release_func(& cb->code); - FREE(cb); + FREE(blend); } @@ -95,41 +85,29 @@ cell_set_blend_color(struct pipe_context *pipe, static void * cell_create_depth_stencil_alpha_state(struct pipe_context *pipe, - const struct pipe_depth_stencil_alpha_state *depth_stencil) + const struct pipe_depth_stencil_alpha_state *dsa) { - struct cell_depth_stencil_alpha_state *cdsa = - MALLOC(sizeof(struct cell_depth_stencil_alpha_state)); - - (void) memcpy(cdsa, depth_stencil, sizeof(*depth_stencil)); -#if 0 - cell_generate_depth_stencil_test(cdsa); -#endif - return cdsa; + return mem_dup(dsa, sizeof(*dsa)); } static void cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe, - void *depth_stencil) + void *dsa) { struct cell_context *cell = cell_context(pipe); draw_flush(cell->draw); - cell->depth_stencil = - (struct cell_depth_stencil_alpha_state *) depth_stencil; + cell->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa; cell->dirty |= CELL_NEW_DEPTH_STENCIL; } static void -cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth) +cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa) { - struct cell_depth_stencil_alpha_state *cdsa = - (struct cell_depth_stencil_alpha_state *) depth; - - spe_release_func(& cdsa->code); - FREE(cdsa); + FREE(dsa); } @@ -191,24 +169,23 @@ cell_set_polygon_stipple( struct pipe_context *pipe, static void * cell_create_rasterizer_state(struct pipe_context *pipe, - const struct pipe_rasterizer_state *setup) + const struct pipe_rasterizer_state *rasterizer) { - struct pipe_rasterizer_state *state - = MALLOC(sizeof(struct pipe_rasterizer_state)); - memcpy(state, setup, sizeof(struct pipe_rasterizer_state)); - return state; + return mem_dup(rasterizer, sizeof(*rasterizer)); } static void -cell_bind_rasterizer_state(struct pipe_context *pipe, void *setup) +cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast) { + struct pipe_rasterizer_state *rasterizer = + (struct pipe_rasterizer_state *) rast; struct cell_context *cell = cell_context(pipe); /* pass-through to draw module */ - draw_set_rasterizer_state(cell->draw, setup); + draw_set_rasterizer_state(cell->draw, rasterizer); - cell->rasterizer = (struct pipe_rasterizer_state *)setup; + cell->rasterizer = rasterizer; cell->dirty |= CELL_NEW_RASTERIZER; } diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c index 139b3719b62..47ba6fa2909 100644 --- a/src/gallium/drivers/cell/ppu/cell_screen.c +++ b/src/gallium/drivers/cell/ppu/cell_screen.c @@ -58,9 +58,9 @@ cell_get_param(struct pipe_screen *screen, int param) case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: return CELL_MAX_SAMPLERS; case PIPE_CAP_NPOT_TEXTURES: - return 0; + return 1; case PIPE_CAP_TWO_SIDED_STENCIL: - return 0; + return 1; case PIPE_CAP_GLSL: return 1; case PIPE_CAP_S3TC: @@ -68,13 +68,13 @@ cell_get_param(struct pipe_screen *screen, int param) case PIPE_CAP_ANISOTROPIC_FILTER: return 0; case PIPE_CAP_POINT_SPRITE: - return 0; + return 1; case PIPE_CAP_MAX_RENDER_TARGETS: return 1; case PIPE_CAP_OCCLUSION_QUERY: - return 0; + return 1; case PIPE_CAP_TEXTURE_SHADOW_MAP: - return 0; + return 10; case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: return 12; /* max 2Kx2K */ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: @@ -82,7 +82,7 @@ cell_get_param(struct pipe_screen *screen, int param) case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: return 12; /* max 2Kx2K */ default: - return 0; + return 10; } } @@ -108,7 +108,7 @@ cell_get_paramf(struct pipe_screen *screen, int param) return 16.0; /* arbitrary */ default: - return 0; + return 10; } } diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c index 9508227e298..df020c4146d 100644 --- a/src/gallium/drivers/cell/ppu/cell_spu.c +++ b/src/gallium/drivers/cell/ppu/cell_spu.c @@ -36,6 +36,7 @@ #include "cell_spu.h" #include "pipe/p_format.h" #include "pipe/p_state.h" +#include "util/u_memory.h" #include "cell/common.h" @@ -131,6 +132,11 @@ cell_start_spus(struct cell_context *cell) ASSERT_ALIGN16(&cell_global.inits[0]); ASSERT_ALIGN16(&cell_global.inits[1]); + /* + * Initialize the global 'inits' structure for each SPU. + * A pointer to the init struct will be passed to each SPU. + * The SPUs will then each grab their init info with mfc_get(). + */ for (i = 0; i < cell->num_spus; i++) { cell_global.inits[i].id = i; cell_global.inits[i].num_spus = cell->num_spus; @@ -141,6 +147,8 @@ cell_start_spus(struct cell_context *cell) } cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0]; + cell_global.inits[i].spu_functions = &cell->spu_functions; + cell_global.spe_contexts[i] = spe_context_create(0, NULL); if (!cell_global.spe_contexts[i]) { fprintf(stderr, "spe_context_create() failed\n"); diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c index 2da3097983c..f35893537bf 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_emit.c +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -30,7 +30,6 @@ #include "cell_gen_fragment.h" #include "cell_state.h" #include "cell_state_emit.h" -#include "cell_state_per_fragment.h" #include "cell_batch.h" #include "cell_texture.h" #include "draw/draw_context.h" @@ -100,14 +99,19 @@ cell_emit_state(struct cell_context *cell) = cell_batch_alloc(cell, sizeof(*fops)); struct spe_function spe_code; + /* Prepare the buffer that will hold the generated code. */ + spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); + /* generate new code */ cell_gen_fragment_function(cell, &spe_code); + /* put the new code into the batch buffer */ fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; memcpy(&fops->code, spe_code.store, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE); - fops->dsa = cell->depth_stencil->base; - fops->blend = cell->blend->base; + fops->dsa = *cell->depth_stencil; + fops->blend = *cell->blend; + /* free codegen buffer */ spe_release_func(&spe_code); } diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore new file mode 100644 index 00000000000..2be9a2d3242 --- /dev/null +++ b/src/gallium/drivers/cell/spu/.gitignore @@ -0,0 +1 @@ +g3d_spu diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile index 1ae0dfb8c10..116453b79c5 100644 --- a/src/gallium/drivers/cell/spu/Makefile +++ b/src/gallium/drivers/cell/spu/Makefile @@ -16,8 +16,10 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o SOURCES = \ - spu_main.c \ + spu_command.c \ spu_dcache.c \ + spu_funcs.c \ + spu_main.c \ spu_per_fragment_op.c \ spu_render.c \ spu_texture.c \ diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c new file mode 100644 index 00000000000..ec9da5d8870 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -0,0 +1,599 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * SPU command processing code + */ + + +#include <stdio.h> +#include <libmisc.h> + +#include "pipe/p_defines.h" + +#include "spu_command.h" +#include "spu_main.h" +#include "spu_render.h" +#include "spu_per_fragment_op.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_vertex_shader.h" +#include "spu_dcache.h" +#include "spu_debug.h" +#include "cell/common.h" + + +struct spu_vs_context draw; + + +/** + * Buffers containing dynamically generated SPU code: + */ +static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS] + ALIGN16_ATTRIB; + + + +/** + * Tell the PPU that this SPU has finished copying a buffer to + * local store and that it may be reused by the PPU. + * This is done by writting a 16-byte batch-buffer-status block back into + * main memory (in cell_context->buffer_status[]). + */ +static void +release_buffer(uint buffer) +{ + /* Evidently, using less than a 16-byte status doesn't work reliably */ + static const uint status[4] ALIGN16_ATTRIB + = {CELL_BUFFER_STATUS_FREE, 0, 0, 0}; + + const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer); + uint *dst = spu.init.buffer_status + index; + + ASSERT(buffer < CELL_NUM_BUFFERS); + + mfc_put((void *) &status, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + sizeof(status), /* size */ + TAG_MISC, /* tag is unimportant */ + 0, /* tid */ + 0 /* rid */); +} + + +static void +cmd_clear_surface(const struct cell_command_clear_surface *clear) +{ + DEBUG_PRINTF("CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); + + if (clear->surface == 0) { + spu.fb.color_clear_value = clear->value; + if (spu.init.debug_flags & CELL_DEBUG_CHECKER) { + uint x = (spu.init.id << 4) | (spu.init.id << 12) | + (spu.init.id << 20) | (spu.init.id << 28); + spu.fb.color_clear_value ^= x; + } + } + else { + spu.fb.depth_clear_value = clear->value; + } + +#define CLEAR_OPT 1 +#if CLEAR_OPT + + /* Simply set all tiles' status to CLEAR. + * When we actually begin rendering into a tile, we'll initialize it to + * the clear value. If any tiles go untouched during the frame, + * really_clear_tiles() will set them to the clear value. + */ + if (clear->surface == 0) { + memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status)); + } + else { + memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status)); + } + +#else + + /* + * This path clears the whole framebuffer to the clear color right now. + */ + + /* + printf("SPU: %s num=%d w=%d h=%d\n", + __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles); + */ + + /* init a single tile to the clear value */ + if (clear->surface == 0) { + clear_c_tile(&spu.ctile); + } + else { + clear_z_tile(&spu.ztile); + } + + /* walk over my tiles, writing the 'clear' tile's data */ + { + const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + uint i; + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (clear->surface == 0) + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); + else + put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); + } + } + + if (spu.init.debug_flags & CELL_DEBUG_SYNC) { + wait_on_mask(1 << TAG_SURFACE_CLEAR); + } + +#endif /* CLEAR_OPT */ + + DEBUG_PRINTF("CLEAR SURF done\n"); +} + + +static void +cmd_release_verts(const struct cell_command_release_verts *release) +{ + DEBUG_PRINTF("RELEASE VERTS %u\n", release->vertex_buf); + ASSERT(release->vertex_buf != ~0U); + release_buffer(release->vertex_buf); +} + + +/** + * Process a CELL_CMD_STATE_FRAGMENT_OPS command. + * This involves installing new fragment ops SPU code. + * If this function is never called, we'll use a regular C fallback function + * for fragment processing. + */ +static void +cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) +{ + static int warned = 0; + + DEBUG_PRINTF("CMD_STATE_FRAGMENT_OPS\n"); + /* Copy SPU code from batch buffer to spu buffer */ + memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); + /* Copy state info (for fallback case only) */ + memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); + memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); + + /* Parity twist! For now, always use the fallback code by default, + * only switching to codegen when specifically requested. This + * allows us to develop freely without risking taking down the + * branch. + * + * Later, the parity of this check will be reversed, so that + * codegen is *always* used, unless we specifically indicate that + * we don't want it. + * + * Eventually, the option will be removed completely, because in + * final code we'll always use codegen and won't even provide the + * raw state records that the fallback code requires. + */ + if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) { + spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code; + } + else { + /* otherwise, the default fallback code remains in place */ + if (!warned) { + fprintf(stderr, "Cell Warning: using fallback per-fragment code\n"); + warned = 1; + } + } + + spu.read_depth = spu.depth_stencil_alpha.depth.enabled; + spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; +} + + +static void +cmd_state_fragment_program(const struct cell_command_fragment_program *fp) +{ + DEBUG_PRINTF("CMD_STATE_FRAGMENT_PROGRAM\n"); + /* Copy SPU code from batch buffer to spu buffer */ + memcpy(spu.fragment_program_code, fp->code, + SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); +#if 01 + /* Point function pointer at new code */ + spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code; +#endif +} + + +static void +cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) +{ + DEBUG_PRINTF("FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", + cmd->width, + cmd->height, + cmd->color_start, + cmd->color_format, + cmd->depth_format); + + ASSERT_ALIGN16(cmd->color_start); + ASSERT_ALIGN16(cmd->depth_start); + + spu.fb.color_start = cmd->color_start; + spu.fb.depth_start = cmd->depth_start; + spu.fb.color_format = cmd->color_format; + spu.fb.depth_format = cmd->depth_format; + spu.fb.width = cmd->width; + spu.fb.height = cmd->height; + spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE; + spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE; + + switch (spu.fb.depth_format) { + case PIPE_FORMAT_Z32_UNORM: + spu.fb.zsize = 4; + spu.fb.zscale = (float) 0xffffffffu; + break; + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + spu.fb.zsize = 4; + spu.fb.zscale = (float) 0x00ffffffu; + break; + case PIPE_FORMAT_Z16_UNORM: + spu.fb.zsize = 2; + spu.fb.zscale = (float) 0xffffu; + break; + default: + spu.fb.zsize = 0; + break; + } +} + + +static void +cmd_state_sampler(const struct cell_command_sampler *sampler) +{ + DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit); + + spu.sampler[sampler->unit] = sampler->state; + if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) + spu.sample_texture[sampler->unit] = sample_texture_bilinear; + else + spu.sample_texture[sampler->unit] = sample_texture_nearest; +} + + +static void +cmd_state_texture(const struct cell_command_texture *texture) +{ + const uint unit = texture->unit; + const uint width = texture->width; + const uint height = texture->height; + + DEBUG_PRINTF("TEXTURE [%u] at %p size %u x %u\n", + texture->unit, texture->start, + texture->width, texture->height); + + spu.texture[unit].start = texture->start; + spu.texture[unit].width = width; + spu.texture[unit].height = height; + + spu.texture[unit].tiles_per_row = width / TILE_SIZE; + + spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0}; + spu.texture[unit].tex_size_mask = (vector unsigned int) + { width - 1, height - 1, 0, 0 }; + spu.texture[unit].tex_size_x_mask = spu_splats(width - 1); + spu.texture[unit].tex_size_y_mask = spu_splats(height - 1); +} + + +static void +cmd_state_vertex_info(const struct vertex_info *vinfo) +{ + DEBUG_PRINTF("VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); + ASSERT(vinfo->num_attribs >= 1); + ASSERT(vinfo->num_attribs <= 8); + memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); +} + + +static void +cmd_state_vs_array_info(const struct cell_array_info *vs_info) +{ + const unsigned attr = vs_info->attr; + + ASSERT(attr < PIPE_MAX_ATTRIBS); + draw.vertex_fetch.src_ptr[attr] = vs_info->base; + draw.vertex_fetch.pitch[attr] = vs_info->pitch; + draw.vertex_fetch.size[attr] = vs_info->size; + draw.vertex_fetch.code_offset[attr] = vs_info->function_offset; + draw.vertex_fetch.dirty = 1; +} + + +static void +cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) +{ + mfc_get(attribute_fetch_code_buffer, + (unsigned int) code->base, /* src */ + code->size, + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + + draw.vertex_fetch.code = attribute_fetch_code_buffer; +} + + +static void +cmd_finish(void) +{ + DEBUG_PRINTF("FINISH\n"); + really_clear_tiles(0); + /* wait for all outstanding DMAs to finish */ + mfc_write_tag_mask(~0); + mfc_read_tag_status_all(); + /* send mbox message to PPU */ + spu_write_out_mbox(CELL_CMD_FINISH); +} + + +/** + * Execute a batch of commands which was sent to us by the PPU. + * See the cell_emit_state.c code to see where the commands come from. + * + * The opcode param encodes the location of the buffer and its size. + */ +static void +cmd_batch(uint opcode) +{ + const uint buf = (opcode >> 8) & 0xff; + uint size = (opcode >> 16); + uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB; + const unsigned usize = size / sizeof(buffer[0]); + uint pos; + + DEBUG_PRINTF("BATCH buffer %u, len %u, from %p\n", + buf, size, spu.init.buffers[buf]); + + ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); + + ASSERT_ALIGN16(spu.init.buffers[buf]); + + size = ROUNDUP16(size); + + ASSERT_ALIGN16(spu.init.buffers[buf]); + + mfc_get(buffer, /* dest */ + (unsigned int) spu.init.buffers[buf], /* src */ + size, + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + + /* Tell PPU we're done copying the buffer to local store */ + DEBUG_PRINTF("release batch buf %u\n", buf); + release_buffer(buf); + + /* + * Loop over commands in the batch buffer + */ + for (pos = 0; pos < usize; /* no incr */) { + switch (buffer[pos]) { + /* + * rendering commands + */ + case CELL_CMD_CLEAR_SURFACE: + { + struct cell_command_clear_surface *clr + = (struct cell_command_clear_surface *) &buffer[pos]; + cmd_clear_surface(clr); + pos += sizeof(*clr) / 8; + } + break; + case CELL_CMD_RENDER: + { + struct cell_command_render *render + = (struct cell_command_render *) &buffer[pos]; + uint pos_incr; + cmd_render(render, &pos_incr); + pos += pos_incr; + } + break; + /* + * state-update commands + */ + case CELL_CMD_STATE_FRAMEBUFFER: + { + struct cell_command_framebuffer *fb + = (struct cell_command_framebuffer *) &buffer[pos]; + cmd_state_framebuffer(fb); + pos += sizeof(*fb) / 8; + } + break; + case CELL_CMD_STATE_FRAGMENT_OPS: + { + struct cell_command_fragment_ops *fops + = (struct cell_command_fragment_ops *) &buffer[pos]; + cmd_state_fragment_ops(fops); + pos += sizeof(*fops) / 8; + } + break; + case CELL_CMD_STATE_FRAGMENT_PROGRAM: + { + struct cell_command_fragment_program *fp + = (struct cell_command_fragment_program *) &buffer[pos]; + cmd_state_fragment_program(fp); + pos += sizeof(*fp) / 8; + } + break; + case CELL_CMD_STATE_SAMPLER: + { + struct cell_command_sampler *sampler + = (struct cell_command_sampler *) &buffer[pos]; + cmd_state_sampler(sampler); + pos += sizeof(*sampler) / 8; + } + break; + case CELL_CMD_STATE_TEXTURE: + { + struct cell_command_texture *texture + = (struct cell_command_texture *) &buffer[pos]; + cmd_state_texture(texture); + pos += sizeof(*texture) / 8; + } + break; + case CELL_CMD_STATE_VERTEX_INFO: + cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8); + break; + case CELL_CMD_STATE_VIEWPORT: + (void) memcpy(& draw.viewport, &buffer[pos+1], + sizeof(struct pipe_viewport_state)); + pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8); + break; + case CELL_CMD_STATE_UNIFORMS: + draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1]; + pos += 2; + break; + case CELL_CMD_STATE_VS_ARRAY_INFO: + cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); + break; + case CELL_CMD_STATE_BIND_VS: +#if 0 + spu_bind_vertex_shader(&draw, + (struct cell_shader_info *) &buffer[pos+1]); +#endif + pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); + break; + case CELL_CMD_STATE_ATTRIB_FETCH: + cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) + &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); + break; + /* + * misc commands + */ + case CELL_CMD_FINISH: + cmd_finish(); + pos += 1; + break; + case CELL_CMD_RELEASE_VERTS: + { + struct cell_command_release_verts *release + = (struct cell_command_release_verts *) &buffer[pos]; + cmd_release_verts(release); + pos += sizeof(*release) / 8; + } + break; + case CELL_CMD_FLUSH_BUFFER_RANGE: { + struct cell_buffer_range *br = (struct cell_buffer_range *) + &buffer[pos+1]; + + spu_dcache_mark_dirty((unsigned) br->base, br->size); + pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8); + break; + } + default: + printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]); + ASSERT(0); + break; + } + } + + DEBUG_PRINTF("BATCH complete\n"); +} + + + +/** + * Main loop for SPEs: Get a command, execute it, repeat. + */ +void +command_loop(void) +{ + struct cell_command cmd; + int exitFlag = 0; + + DEBUG_PRINTF("Enter command loop\n"); + + ASSERT((sizeof(struct cell_command) & 0xf) == 0); + ASSERT_ALIGN16(&cmd); + + while (!exitFlag) { + unsigned opcode; + int tag = 0; + + DEBUG_PRINTF("Wait for cmd...\n"); + + /* read/wait from mailbox */ + opcode = (unsigned int) spu_read_in_mbox(); + + DEBUG_PRINTF("got cmd 0x%x\n", opcode); + + /* command payload */ + mfc_get(&cmd, /* dest */ + (unsigned int) spu.init.cmd, /* src */ + sizeof(struct cell_command), /* bytes */ + tag, + 0, /* tid */ + 0 /* rid */); + wait_on_mask( 1 << tag ); + + /* + * NOTE: most commands should be contained in a batch buffer + */ + + switch (opcode & CELL_CMD_OPCODE_MASK) { + case CELL_CMD_EXIT: + DEBUG_PRINTF("EXIT\n"); + exitFlag = 1; + break; + case CELL_CMD_VS_EXECUTE: +#if 0 + spu_execute_vertex_shader(&draw, &cmd.vs); +#endif + break; + case CELL_CMD_BATCH: + cmd_batch(opcode); + break; + default: + printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK); + } + + } + + DEBUG_PRINTF("Exit command loop\n"); + + spu_dcache_report(); +} diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h new file mode 100644 index 00000000000..853e9aa5498 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_command.h @@ -0,0 +1,7 @@ + + + +extern void +command_loop(void); + + diff --git a/src/gallium/drivers/cell/spu/spu_debug.h b/src/gallium/drivers/cell/spu/spu_debug.h new file mode 100644 index 00000000000..eeec0526558 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_debug.h @@ -0,0 +1,60 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef SPU_DEBUG_H +#define SPU_DEBUG_H + + +/* Set to 0 to disable all extraneous debugging code */ +#define DEBUG 1 + +#if DEBUG +extern boolean Debug; +extern boolean force_fragment_ops_fallback; + +/* These debug macros use the unusual construction ", ##__VA_ARGS__" + * which expands to the expected comma + args if variadic arguments + * are supplied, but swallows the comma if there are no variadic + * arguments (which avoids syntax errors that would otherwise occur). + */ +#define DEBUG_PRINTF(format,...) \ + if (Debug) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) +#define D_PRINTF(flag, format,...) \ + if (spu.init.debug_flags & (flag)) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) + +#else + +#define DEBUG_PRINTF(...) +#define D_PRINTF(...) + +#endif + + +#endif /* SPU_DEBUG_H */ diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c new file mode 100644 index 00000000000..b57ad3f3b81 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -0,0 +1,116 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * SPU functions accessed by shaders. + * + * Authors: Brian Paul + */ + + +#include <string.h> +#include <libmisc.h> +#include <cos8_v.h> +#include <sin8_v.h> + +#include "cell/common.h" +#include "spu_main.h" +#include "spu_funcs.h" + + +#define M_PI 3.1415926 + + +static vector float +spu_cos(vector float x) +{ +#if 0 + static const float scale = 1.0 / (2.0 * M_PI); + x = x * spu_splats(scale); /* normalize */ + return _cos8_v(x); +#else + /* just pass-through to avoid trashing caller's stack */ + return x; +#endif +} + +static vector float +spu_sin(vector float x) +{ +#if 0 + static const float scale = 1.0 / (2.0 * M_PI); + x = x * spu_splats(scale); /* normalize */ + return _sin8_v(x); /* 8-bit accuracy enough?? */ +#else + /* just pass-through to avoid trashing caller's stack */ + return x; +#endif +} + + +static void +add_func(struct cell_spu_function_info *spu_functions, + const char *name, void *addr) +{ + uint n = spu_functions->num; + ASSERT(strlen(name) < 16); + strcpy(spu_functions->names[n], name); + spu_functions->addrs[n] = (uint) addr; + spu_functions->num++; +} + + +/** + * Return info about the SPU's function to the PPU / main memory. + * The PPU needs to know the address of some SPU-side functions so + * that we can generate shader code with function calls. + */ +void +return_function_info(void) +{ + struct cell_spu_function_info funcs ALIGN16_ATTRIB; + int tag = TAG_MISC; + + ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */ + + funcs.num = 0; + add_func(&funcs, "spu_cos", &spu_cos); + add_func(&funcs, "spu_sin", &spu_sin); + + /* Send the function info back to the PPU / main memory */ + mfc_put((void *) &funcs, /* src in local store */ + (unsigned int) spu.init.spu_functions, /* dst in main memory */ + sizeof(funcs), /* bytes */ + tag, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << tag); +} + + + diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h new file mode 100644 index 00000000000..3adb6ae99f9 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.h @@ -0,0 +1,35 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SPU_FUNCS_H +#define SPU_FUNCS_H + +extern void +return_function_info(void); + +#endif + diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c index 78260c4259c..4becd0f92a4 100644 --- a/src/gallium/drivers/cell/spu/spu_main.c +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -32,16 +32,16 @@ #include <stdio.h> #include <libmisc.h> +#include "pipe/p_defines.h" + +#include "spu_funcs.h" +#include "spu_command.h" #include "spu_main.h" -#include "spu_render.h" #include "spu_per_fragment_op.h" #include "spu_texture.h" -#include "spu_tile.h" //#include "spu_test.h" -#include "spu_vertex_shader.h" -#include "spu_dcache.h" +#include "spu_debug.h" #include "cell/common.h" -#include "pipe/p_defines.h" /* @@ -50,599 +50,13 @@ helpful headers: /opt/cell/sdk/usr/include/libmisc.h */ -boolean Debug = FALSE; - struct spu_global spu; -struct spu_vs_context draw; - - -/** - * Buffers containing dynamically generated SPU code: - */ -static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS] - ALIGN16_ATTRIB; - - - -/** - * Tell the PPU that this SPU has finished copying a buffer to - * local store and that it may be reused by the PPU. - * This is done by writting a 16-byte batch-buffer-status block back into - * main memory (in cell_context->buffer_status[]). - */ -static void -release_buffer(uint buffer) -{ - /* Evidently, using less than a 16-byte status doesn't work reliably */ - static const uint status[4] ALIGN16_ATTRIB - = {CELL_BUFFER_STATUS_FREE, 0, 0, 0}; - - const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer); - uint *dst = spu.init.buffer_status + index; - - ASSERT(buffer < CELL_NUM_BUFFERS); - - mfc_put((void *) &status, /* src in local memory */ - (unsigned int) dst, /* dst in main memory */ - sizeof(status), /* size */ - TAG_MISC, /* tag is unimportant */ - 0, /* tid */ - 0 /* rid */); -} - -/** - * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled - * tiles back to the main framebuffer. - */ -static void -really_clear_tiles(uint surfaceIndex) -{ - const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; - uint i; - - if (surfaceIndex == 0) { - clear_c_tile(&spu.ctile); - - for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { - uint tx = i % spu.fb.width_tiles; - uint ty = i / spu.fb.width_tiles; - if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) { - put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); - } - } - } - else { - clear_z_tile(&spu.ztile); - - for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { - uint tx = i % spu.fb.width_tiles; - uint ty = i / spu.fb.width_tiles; - if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR) - put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1); - } - } - -#if 0 - wait_on_mask(1 << TAG_SURFACE_CLEAR); -#endif -} - - -static void -cmd_clear_surface(const struct cell_command_clear_surface *clear) -{ - if (Debug) - printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id, - clear->surface, clear->value); - - if (clear->surface == 0) { - spu.fb.color_clear_value = clear->value; - if (spu.init.debug_flags & CELL_DEBUG_CHECKER) { - uint x = (spu.init.id << 4) | (spu.init.id << 12) | - (spu.init.id << 20) | (spu.init.id << 28); - spu.fb.color_clear_value ^= x; - } - } - else { - spu.fb.depth_clear_value = clear->value; - } - -#define CLEAR_OPT 1 -#if CLEAR_OPT - - /* Simply set all tiles' status to CLEAR. - * When we actually begin rendering into a tile, we'll initialize it to - * the clear value. If any tiles go untouched during the frame, - * really_clear_tiles() will set them to the clear value. - */ - if (clear->surface == 0) { - memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status)); - } - else { - memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status)); - } - -#else - - /* - * This path clears the whole framebuffer to the clear color right now. - */ - - /* - printf("SPU: %s num=%d w=%d h=%d\n", - __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles); - */ - - /* init a single tile to the clear value */ - if (clear->surface == 0) { - clear_c_tile(&spu.ctile); - } - else { - clear_z_tile(&spu.ztile); - } - - /* walk over my tiles, writing the 'clear' tile's data */ - { - const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; - uint i; - for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { - uint tx = i % spu.fb.width_tiles; - uint ty = i / spu.fb.width_tiles; - if (clear->surface == 0) - put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); - else - put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); - } - } - - if (spu.init.debug_flags & CELL_DEBUG_SYNC) { - wait_on_mask(1 << TAG_SURFACE_CLEAR); - } - -#endif /* CLEAR_OPT */ - - if (Debug) - printf("SPU %u: CLEAR SURF done\n", spu.init.id); -} - - -static void -cmd_release_verts(const struct cell_command_release_verts *release) -{ - if (Debug) - printf("SPU %u: RELEASE VERTS %u\n", - spu.init.id, release->vertex_buf); - ASSERT(release->vertex_buf != ~0U); - release_buffer(release->vertex_buf); -} - - -/** - * Process a CELL_CMD_STATE_FRAGMENT_OPS command. - * This involves installing new fragment ops SPU code. - * If this function is never called, we'll use a regular C fallback function - * for fragment processing. - */ -static void -cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) -{ - if (Debug) - printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id); - /* Copy SPU code from batch buffer to spu buffer */ - memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4); - /* Copy state info (for fallback case only) */ - memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); - memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); - - /* Point function pointer at new code */ - spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code; - - spu.read_depth = spu.depth_stencil_alpha.depth.enabled; - spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled; -} - - -static void -cmd_state_fragment_program(const struct cell_command_fragment_program *fp) -{ - if (Debug) - printf("SPU %u: CMD_STATE_FRAGMENT_PROGRAM\n", spu.init.id); - /* Copy SPU code from batch buffer to spu buffer */ - memcpy(spu.fragment_program_code, fp->code, - SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); -#if 01 - /* Point function pointer at new code */ - spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code; -#endif -} - - -static void -cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) -{ - if (Debug) - printf("SPU %u: FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", - spu.init.id, - cmd->width, - cmd->height, - cmd->color_start, - cmd->color_format, - cmd->depth_format); - - ASSERT_ALIGN16(cmd->color_start); - ASSERT_ALIGN16(cmd->depth_start); - - spu.fb.color_start = cmd->color_start; - spu.fb.depth_start = cmd->depth_start; - spu.fb.color_format = cmd->color_format; - spu.fb.depth_format = cmd->depth_format; - spu.fb.width = cmd->width; - spu.fb.height = cmd->height; - spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE; - spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE; - - switch (spu.fb.depth_format) { - case PIPE_FORMAT_Z32_UNORM: - spu.fb.zsize = 4; - spu.fb.zscale = (float) 0xffffffffu; - break; - case PIPE_FORMAT_Z24S8_UNORM: - case PIPE_FORMAT_S8Z24_UNORM: - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: - spu.fb.zsize = 4; - spu.fb.zscale = (float) 0x00ffffffu; - break; - case PIPE_FORMAT_Z16_UNORM: - spu.fb.zsize = 2; - spu.fb.zscale = (float) 0xffffu; - break; - default: - spu.fb.zsize = 0; - break; - } -} - - -static void -cmd_state_sampler(const struct cell_command_sampler *sampler) -{ - if (Debug) - printf("SPU %u: SAMPLER [%u]\n", - spu.init.id, sampler->unit); - - spu.sampler[sampler->unit] = sampler->state; - if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) - spu.sample_texture[sampler->unit] = sample_texture_bilinear; - else - spu.sample_texture[sampler->unit] = sample_texture_nearest; -} - - -static void -cmd_state_texture(const struct cell_command_texture *texture) -{ - const uint unit = texture->unit; - const uint width = texture->width; - const uint height = texture->height; - - if (Debug) { - printf("SPU %u: TEXTURE [%u] at %p size %u x %u\n", spu.init.id, - texture->unit, texture->start, - texture->width, texture->height); - } - - spu.texture[unit].start = texture->start; - spu.texture[unit].width = width; - spu.texture[unit].height = height; - - spu.texture[unit].tiles_per_row = width / TILE_SIZE; - - spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0}; - spu.texture[unit].tex_size_mask = (vector unsigned int) - { width - 1, height - 1, 0, 0 }; - spu.texture[unit].tex_size_x_mask = spu_splats(width - 1); - spu.texture[unit].tex_size_y_mask = spu_splats(height - 1); -} - - -static void -cmd_state_vertex_info(const struct vertex_info *vinfo) -{ - if (Debug) { - printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id, - vinfo->num_attribs); - } - ASSERT(vinfo->num_attribs >= 1); - ASSERT(vinfo->num_attribs <= 8); - memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); -} - - -static void -cmd_state_vs_array_info(const struct cell_array_info *vs_info) -{ - const unsigned attr = vs_info->attr; - - ASSERT(attr < PIPE_MAX_ATTRIBS); - draw.vertex_fetch.src_ptr[attr] = vs_info->base; - draw.vertex_fetch.pitch[attr] = vs_info->pitch; - draw.vertex_fetch.size[attr] = vs_info->size; - draw.vertex_fetch.code_offset[attr] = vs_info->function_offset; - draw.vertex_fetch.dirty = 1; -} - - -static void -cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) -{ - mfc_get(attribute_fetch_code_buffer, - (unsigned int) code->base, /* src */ - code->size, - TAG_BATCH_BUFFER, - 0, /* tid */ - 0 /* rid */); - wait_on_mask(1 << TAG_BATCH_BUFFER); - - draw.vertex_fetch.code = attribute_fetch_code_buffer; -} - - -static void -cmd_finish(void) -{ - if (Debug) - printf("SPU %u: FINISH\n", spu.init.id); - really_clear_tiles(0); - /* wait for all outstanding DMAs to finish */ - mfc_write_tag_mask(~0); - mfc_read_tag_status_all(); - /* send mbox message to PPU */ - spu_write_out_mbox(CELL_CMD_FINISH); -} - - -/** - * Execute a batch of commands which was sent to us by the PPU. - * See the cell_emit_state.c code to see where the commands come from. - * - * The opcode param encodes the location of the buffer and its size. - */ -static void -cmd_batch(uint opcode) -{ - const uint buf = (opcode >> 8) & 0xff; - uint size = (opcode >> 16); - uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB; - const unsigned usize = size / sizeof(buffer[0]); - uint pos; - - if (Debug) - printf("SPU %u: BATCH buffer %u, len %u, from %p\n", - spu.init.id, buf, size, spu.init.buffers[buf]); - - ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); - - ASSERT_ALIGN16(spu.init.buffers[buf]); - - size = ROUNDUP16(size); - - ASSERT_ALIGN16(spu.init.buffers[buf]); - - mfc_get(buffer, /* dest */ - (unsigned int) spu.init.buffers[buf], /* src */ - size, - TAG_BATCH_BUFFER, - 0, /* tid */ - 0 /* rid */); - wait_on_mask(1 << TAG_BATCH_BUFFER); - - /* Tell PPU we're done copying the buffer to local store */ - if (Debug) - printf("SPU %u: release batch buf %u\n", spu.init.id, buf); - release_buffer(buf); - - /* - * Loop over commands in the batch buffer - */ - for (pos = 0; pos < usize; /* no incr */) { - switch (buffer[pos]) { - /* - * rendering commands - */ - case CELL_CMD_CLEAR_SURFACE: - { - struct cell_command_clear_surface *clr - = (struct cell_command_clear_surface *) &buffer[pos]; - cmd_clear_surface(clr); - pos += sizeof(*clr) / 8; - } - break; - case CELL_CMD_RENDER: - { - struct cell_command_render *render - = (struct cell_command_render *) &buffer[pos]; - uint pos_incr; - cmd_render(render, &pos_incr); - pos += pos_incr; - } - break; - /* - * state-update commands - */ - case CELL_CMD_STATE_FRAMEBUFFER: - { - struct cell_command_framebuffer *fb - = (struct cell_command_framebuffer *) &buffer[pos]; - cmd_state_framebuffer(fb); - pos += sizeof(*fb) / 8; - } - break; - case CELL_CMD_STATE_FRAGMENT_OPS: - { - struct cell_command_fragment_ops *fops - = (struct cell_command_fragment_ops *) &buffer[pos]; - cmd_state_fragment_ops(fops); - pos += sizeof(*fops) / 8; - } - break; - case CELL_CMD_STATE_FRAGMENT_PROGRAM: - { - struct cell_command_fragment_program *fp - = (struct cell_command_fragment_program *) &buffer[pos]; - cmd_state_fragment_program(fp); - pos += sizeof(*fp) / 8; - } - break; - case CELL_CMD_STATE_SAMPLER: - { - struct cell_command_sampler *sampler - = (struct cell_command_sampler *) &buffer[pos]; - cmd_state_sampler(sampler); - pos += sizeof(*sampler) / 8; - } - break; - case CELL_CMD_STATE_TEXTURE: - { - struct cell_command_texture *texture - = (struct cell_command_texture *) &buffer[pos]; - cmd_state_texture(texture); - pos += sizeof(*texture) / 8; - } - break; - case CELL_CMD_STATE_VERTEX_INFO: - cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8); - break; - case CELL_CMD_STATE_VIEWPORT: - (void) memcpy(& draw.viewport, &buffer[pos+1], - sizeof(struct pipe_viewport_state)); - pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8); - break; - case CELL_CMD_STATE_UNIFORMS: - draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1]; - pos += 2; - break; - case CELL_CMD_STATE_VS_ARRAY_INFO: - cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); - break; - case CELL_CMD_STATE_BIND_VS: -#if 0 - spu_bind_vertex_shader(&draw, - (struct cell_shader_info *) &buffer[pos+1]); -#endif - pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); - break; - case CELL_CMD_STATE_ATTRIB_FETCH: - cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) - &buffer[pos+1]); - pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); - break; - /* - * misc commands - */ - case CELL_CMD_FINISH: - cmd_finish(); - pos += 1; - break; - case CELL_CMD_RELEASE_VERTS: - { - struct cell_command_release_verts *release - = (struct cell_command_release_verts *) &buffer[pos]; - cmd_release_verts(release); - pos += sizeof(*release) / 8; - } - break; - case CELL_CMD_FLUSH_BUFFER_RANGE: { - struct cell_buffer_range *br = (struct cell_buffer_range *) - &buffer[pos+1]; - - spu_dcache_mark_dirty((unsigned) br->base, br->size); - pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8); - break; - } - default: - printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]); - ASSERT(0); - break; - } - } - - if (Debug) - printf("SPU %u: BATCH complete\n", spu.init.id); -} - - -/** - * Temporary/simple main loop for SPEs: Get a command, execute it, repeat. - */ -static void -main_loop(void) -{ - struct cell_command cmd; - int exitFlag = 0; - - if (Debug) - printf("SPU %u: Enter main loop\n", spu.init.id); - - ASSERT((sizeof(struct cell_command) & 0xf) == 0); - ASSERT_ALIGN16(&cmd); - - while (!exitFlag) { - unsigned opcode; - int tag = 0; - - if (Debug) - printf("SPU %u: Wait for cmd...\n", spu.init.id); - - /* read/wait from mailbox */ - opcode = (unsigned int) spu_read_in_mbox(); - - if (Debug) - printf("SPU %u: got cmd 0x%x\n", spu.init.id, opcode); - - /* command payload */ - mfc_get(&cmd, /* dest */ - (unsigned int) spu.init.cmd, /* src */ - sizeof(struct cell_command), /* bytes */ - tag, - 0, /* tid */ - 0 /* rid */); - wait_on_mask( 1 << tag ); - - /* - * NOTE: most commands should be contained in a batch buffer - */ - - switch (opcode & CELL_CMD_OPCODE_MASK) { - case CELL_CMD_EXIT: - if (Debug) - printf("SPU %u: EXIT\n", spu.init.id); - exitFlag = 1; - break; - case CELL_CMD_VS_EXECUTE: -#if 0 - spu_execute_vertex_shader(&draw, &cmd.vs); +#if DEBUG +boolean Debug = FALSE; +boolean force_fragment_ops_fallback = TRUE; #endif - break; - case CELL_CMD_BATCH: - cmd_batch(opcode); - break; - default: - printf("Bad opcode!\n"); - } - - } - - if (Debug) - printf("SPU %u: Exit main loop\n", spu.init.id); - - spu_dcache_report(); -} - static void @@ -653,7 +67,8 @@ one_time_init(void) invalidate_tex_cache(); /* Install default/fallback fragment processing function. - * This will normally be overriden by a code-gen'd function. + * This will normally be overriden by a code-gen'd function + * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set. */ spu.fragment_ops = spu_fallback_fragment_ops; } @@ -682,12 +97,15 @@ main(main_param_t speid, main_param_t argp) ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4); ASSERT(sizeof(struct cell_command_render) % 8 == 0); + ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0); + ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0); one_time_init(); - if (Debug) - printf("SPU: main() speid=%lu\n", (unsigned long) speid); + DEBUG_PRINTF("main() speid=%lu\n", (unsigned long) speid); + D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n"); + /* get initialization data */ mfc_get(&spu.init, /* dest */ (unsigned int) argp, /* src */ sizeof(struct cell_init_info), /* bytes */ @@ -696,12 +114,16 @@ main(main_param_t speid, main_param_t argp) 0 /* rid */); wait_on_mask( 1 << tag ); + if (spu.init.id == 0) { + return_function_info(); + } + #if 0 if (spu.init.id==0) - spu_test_misc(); + spu_test_misc(spu.init.id); #endif - main_loop(); + command_loop(); return 0; } diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h index 2c7b6258402..29a305232ec 100644 --- a/src/gallium/drivers/cell/spu/spu_main.h +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -143,13 +143,13 @@ struct spu_global ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; - /** Current fragment ops machine code */ - uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS]; + /** Current fragment ops machine code, at 8-byte boundary */ + uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB; /** Current fragment ops function */ spu_fragment_ops_func fragment_ops; - /** Current fragment program machine code */ - uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; + /** Current fragment program machine code, at 8-byte boundary */ + uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB; /** Current fragment ops function */ spu_fragment_program_func fragment_program; diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c index 03dd547845b..f107764fb28 100644 --- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -60,9 +60,12 @@ spu_fallback_fragment_ops(uint x, uint y, vector unsigned int mask) { vector float frag_aos[4]; - unsigned int c0, c1, c2, c3; + unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */ + unsigned int fragc0, fragc1, fragc2, fragc3; /* fragment colors */ - /* do alpha test */ + /* + * Do alpha test + */ if (spu.depth_stencil_alpha.alpha.enabled) { vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref); vector unsigned int amask; @@ -102,7 +105,10 @@ spu_fallback_fragment_ops(uint x, uint y, mask = spu_and(mask, amask); } - /* Z and/or stencil testing... */ + + /* + * Z and/or stencil testing... + */ if (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled) { @@ -178,6 +184,32 @@ spu_fallback_fragment_ops(uint x, uint y, } } + + /* + * If we'll need the current framebuffer/tile colors for blending + * or logicop or colormask, fetch them now. + */ + if (spu.blend.blend_enable || + spu.blend.logicop_enable || + spu.blend.colormask != 0xf) { + +#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ + fbc0 = colorTile->ui[y][x*2+0]; + fbc1 = colorTile->ui[y][x*2+1]; + fbc2 = colorTile->ui[y][x*2+2]; + fbc3 = colorTile->ui[y][x*2+3]; +#else + fbc0 = colorTile->ui[y+0][x+0]; + fbc1 = colorTile->ui[y+0][x+1]; + fbc2 = colorTile->ui[y+1][x+0]; + fbc3 = colorTile->ui[y+1][x+1]; +#endif + } + + + /* + * Do blending + */ if (spu.blend.blend_enable) { /* blending terms, misc regs */ vector float term1r, term1g, term1b, term1a; @@ -186,39 +218,26 @@ spu_fallback_fragment_ops(uint x, uint y, vector float fbRGBA[4]; /* current framebuffer colors */ - /* get colors from framebuffer/tile */ + /* convert framebuffer colors from packed int to vector float */ { - vector float fc[4]; - uint c0, c1, c2, c3; - -#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ - c0 = colorTile->ui[y][x*2+0]; - c1 = colorTile->ui[y][x*2+1]; - c2 = colorTile->ui[y][x*2+2]; - c3 = colorTile->ui[y][x*2+3]; -#else - c0 = colorTile->ui[y+0][x+0]; - c1 = colorTile->ui[y+0][x+1]; - c2 = colorTile->ui[y+1][x+0]; - c3 = colorTile->ui[y+1][x+1]; -#endif + vector float temp[4]; /* float colors in AOS form */ switch (spu.fb.color_format) { case PIPE_FORMAT_B8G8R8A8_UNORM: - fc[0] = spu_unpack_B8G8R8A8(c0); - fc[1] = spu_unpack_B8G8R8A8(c1); - fc[2] = spu_unpack_B8G8R8A8(c2); - fc[3] = spu_unpack_B8G8R8A8(c3); + temp[0] = spu_unpack_B8G8R8A8(fbc0); + temp[1] = spu_unpack_B8G8R8A8(fbc1); + temp[2] = spu_unpack_B8G8R8A8(fbc2); + temp[3] = spu_unpack_B8G8R8A8(fbc3); break; case PIPE_FORMAT_A8R8G8B8_UNORM: - fc[0] = spu_unpack_A8R8G8B8(c0); - fc[1] = spu_unpack_A8R8G8B8(c1); - fc[2] = spu_unpack_A8R8G8B8(c2); - fc[3] = spu_unpack_A8R8G8B8(c3); + temp[0] = spu_unpack_A8R8G8B8(fbc0); + temp[1] = spu_unpack_A8R8G8B8(fbc1); + temp[2] = spu_unpack_A8R8G8B8(fbc2); + temp[3] = spu_unpack_A8R8G8B8(fbc3); break; default: ASSERT(0); } - _transpose_matrix4x4(fbRGBA, fc); + _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */ } /* @@ -384,21 +403,20 @@ spu_fallback_fragment_ops(uint x, uint y, #endif /* - * Pack float colors into 32-bit RGBA words. + * Pack fragment float colors into 32-bit RGBA words. */ switch (spu.fb.color_format) { case PIPE_FORMAT_A8R8G8B8_UNORM: - c0 = spu_pack_A8R8G8B8(frag_aos[0]); - c1 = spu_pack_A8R8G8B8(frag_aos[1]); - c2 = spu_pack_A8R8G8B8(frag_aos[2]); - c3 = spu_pack_A8R8G8B8(frag_aos[3]); + fragc0 = spu_pack_A8R8G8B8(frag_aos[0]); + fragc1 = spu_pack_A8R8G8B8(frag_aos[1]); + fragc2 = spu_pack_A8R8G8B8(frag_aos[2]); + fragc3 = spu_pack_A8R8G8B8(frag_aos[3]); break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - c0 = spu_pack_B8G8R8A8(frag_aos[0]); - c1 = spu_pack_B8G8R8A8(frag_aos[1]); - c2 = spu_pack_B8G8R8A8(frag_aos[2]); - c3 = spu_pack_B8G8R8A8(frag_aos[3]); + fragc0 = spu_pack_B8G8R8A8(frag_aos[0]); + fragc1 = spu_pack_B8G8R8A8(frag_aos[1]); + fragc2 = spu_pack_B8G8R8A8(frag_aos[2]); + fragc3 = spu_pack_B8G8R8A8(frag_aos[3]); break; default: fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n"); @@ -407,20 +425,57 @@ spu_fallback_fragment_ops(uint x, uint y, /* - * Color masking + * Do color masking */ if (spu.blend.colormask != 0xf) { - /* XXX to do */ - /* apply color mask to 32-bit packed colors */ + uint cmask = 0x0; /* each byte corresponds to a color channel */ + + /* Form bitmask depending on color buffer format and colormask bits */ + switch (spu.fb.color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + if (spu.blend.colormask & (1<<0)) + cmask |= 0x00ff0000; /* red */ + if (spu.blend.colormask & (1<<1)) + cmask |= 0x0000ff00; /* green */ + if (spu.blend.colormask & (1<<2)) + cmask |= 0x000000ff; /* blue */ + if (spu.blend.colormask & (1<<3)) + cmask |= 0xff000000; /* alpha */ + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + if (spu.blend.colormask & (1<<0)) + cmask |= 0x0000ff00; /* red */ + if (spu.blend.colormask & (1<<1)) + cmask |= 0x00ff0000; /* green */ + if (spu.blend.colormask & (1<<2)) + cmask |= 0xff000000; /* blue */ + if (spu.blend.colormask & (1<<3)) + cmask |= 0x000000ff; /* alpha */ + break; + default: + ASSERT(0); + } + + /* + * Apply color mask to the 32-bit packed colors. + * if (cmask[i]) + * frag color[i] = frag color[i]; + * else + * frag color[i] = framebuffer color[i]; + */ + fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask); + fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask); + fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask); + fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask); } /* - * Logic Ops + * Do logic ops */ if (spu.blend.logicop_enable) { /* XXX to do */ - /* apply logicop to 32-bit packed colors */ + /* apply logicop to 32-bit packed colors (fragcx and fbcx) */ } @@ -431,45 +486,46 @@ spu_fallback_fragment_ops(uint x, uint y, spu.cur_ctile_status = TILE_STATUS_DIRTY; } else { + /* write no fragments */ return; } /* - * Write new quad colors to the framebuffer/tile. + * Write new fragment/quad colors to the framebuffer/tile. * Only write pixels where the corresponding mask word is set. */ #if LINEAR_QUAD_LAYOUT /* * Quad layout: * +--+--+--+--+ - * |p0|p1|p2|p3| + * |p0|p1|p2|p3|... * +--+--+--+--+ */ if (spu_extract(mask, 0)) - colorTile->ui[y][x*2] = c0; + colorTile->ui[y][x*2] = fragc0; if (spu_extract(mask, 1)) - colorTile->ui[y][x*2+1] = c1; + colorTile->ui[y][x*2+1] = fragc1; if (spu_extract(mask, 2)) - colorTile->ui[y][x*2+2] = c2; + colorTile->ui[y][x*2+2] = fragc2; if (spu_extract(mask, 3)) - colorTile->ui[y][x*2+3] = c3; + colorTile->ui[y][x*2+3] = fragc3; #else /* * Quad layout: * +--+--+ - * |p0|p1| + * |p0|p1|... * +--+--+ - * |p2|p3| + * |p2|p3|... * +--+--+ */ if (spu_extract(mask, 0)) - colorTile->ui[y+0][x+0] = c0; + colorTile->ui[y+0][x+0] = fragc0; if (spu_extract(mask, 1)) - colorTile->ui[y+0][x+1] = c1; + colorTile->ui[y+0][x+1] = fragc1; if (spu_extract(mask, 2)) - colorTile->ui[y+1][x+0] = c2; + colorTile->ui[y+1][x+0] = fragc2; if (spu_extract(mask, 3)) - colorTile->ui[y+1][x+1] = c3; + colorTile->ui[y+1][x+1] = fragc3; #endif } diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c index 216a33126b7..6905015a483 100644 --- a/src/gallium/drivers/cell/spu/spu_tile.c +++ b/src/gallium/drivers/cell/spu/spu_tile.c @@ -87,3 +87,40 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf) 0 /* rid */); } + +/** + * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled + * tiles back to the main framebuffer. + */ +void +really_clear_tiles(uint surfaceIndex) +{ + const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + uint i; + + if (surfaceIndex == 0) { + clear_c_tile(&spu.ctile); + + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) { + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); + } + } + } + else { + clear_z_tile(&spu.ztile); + + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR) + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1); + } + } + +#if 0 + wait_on_mask(1 << TAG_SURFACE_CLEAR); +#endif +} diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h index 1b5491112db..7bfb52be8f3 100644 --- a/src/gallium/drivers/cell/spu/spu_tile.h +++ b/src/gallium/drivers/cell/spu/spu_tile.h @@ -36,12 +36,14 @@ -void +extern void get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf); -void +extern void put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf); +extern void +really_clear_tiles(uint surfaceIndex); static INLINE void diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 8b938781920..0a8fb56a62c 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -241,6 +241,19 @@ eval_coeff(uint slot, float x, float y, vector float result[4]) } +/** + * As above, but return 4 vectors in SOA format. + * XXX this will all be re-written someday. + */ +static INLINE void +eval_coeff_soa(uint slot, float x, float y, vector float result[4]) +{ + eval_coeff(slot, x, y, result); + _transpose_matrix4x4(result, result); +} + + + static INLINE vector float eval_z(float x, float y) { @@ -267,14 +280,17 @@ emit_quad( int x, int y, mask_t mask ) if (spu_extract(spu_orx(mask), 0)) { const int ix = x - setup.cliprect_minx; const int iy = y - setup.cliprect_miny; - vector float colors[4]; spu.cur_ctile_status = TILE_STATUS_DIRTY; spu.cur_ztile_status = TILE_STATUS_DIRTY; if (spu.texture[0].start) { - /* texture mapping */ + /* + * Temporary texture mapping path + * This will go away when fragment programs support TEX inst. + */ const uint unit = 0; + vector float colors[4]; vector float texcoords[4]; eval_coeff(2, (float) x, (float) y, texcoords); @@ -311,70 +327,60 @@ emit_quad( int x, int y, mask_t mask ) colors[3] = spu_mul(colors[3], colors1[3]); } + { + /* Convert fragment data from AoS to SoA format. + * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) + * This is temporary! + */ + vector float soa_frag[4]; + _transpose_matrix4x4(soa_frag, colors); + + vector float fragZ = eval_z((float) x, (float) y); + + /* Do all per-fragment/quad operations here, including: + * alpha test, z test, stencil test, blend and framebuffer writing. + */ + spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile, + fragZ, + soa_frag[0], soa_frag[1], + soa_frag[2], soa_frag[3], + mask); + } + } else { - /* simple shading */ -#if 0 - eval_coeff(1, (float) x, (float) y, colors); + /* + * Run fragment shader, execute per-fragment ops, update fb/tile. + */ + vector float inputs[4*4], outputs[2*4]; + vector float fragZ = eval_z((float) x, (float) y); + /* setup inputs */ +#if 0 + eval_coeff_soa(1, (float) x, (float) y, inputs); #else - /* XXX new fragment program code */ - - if (spu.fragment_program) { - vector float inputs[4*4], outputs[2*4]; - - /* setup inputs */ - eval_coeff(1, (float) x, (float) y, inputs); - - /* Execute the current fragment program */ - spu.fragment_program(inputs, outputs, spu.constants); - - /* Copy outputs */ - colors[0] = outputs[0*4+0]; - colors[1] = outputs[0*4+1]; - colors[2] = outputs[0*4+2]; - colors[3] = outputs[0*4+3]; - - if (0 && spu.init.id==0 && y == 48) { - printf("colors[0] = %f %f %f %f\n", - spu_extract(colors[0], 0), - spu_extract(colors[0], 1), - spu_extract(colors[0], 2), - spu_extract(colors[0], 3)); - printf("colors[1] = %f %f %f %f\n", - spu_extract(colors[1], 0), - spu_extract(colors[1], 1), - spu_extract(colors[1], 2), - spu_extract(colors[1], 3)); - } - + uint i; + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4); } #endif - } + ASSERT(spu.fragment_program); + ASSERT(spu.fragment_ops); + /* Execute the current fragment program */ + spu.fragment_program(inputs, outputs, spu.constants); - { - /* Convert fragment data from AoS to SoA format. - * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA) - * This is temporary! - */ - vector float soa_frag[4]; - _transpose_matrix4x4(soa_frag, colors); - - float4 fragZ; - - fragZ.v = eval_z((float) x, (float) y); - - /* Do all per-fragment/quad operations here, including: - * alpha test, z test, stencil test, blend and framebuffer writing. + /* Execute per-fragment/quad operations, including: + * alpha test, z test, stencil test, blend and framebuffer writing. */ spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile, - fragZ.v, - soa_frag[0], soa_frag[1], - soa_frag[2], soa_frag[3], + fragZ, + outputs[0*4+0], + outputs[0*4+1], + outputs[0*4+2], + outputs[0*4+3], mask); } - } } diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c index 701ee4c72f2..4fea71c3140 100644 --- a/src/gallium/drivers/softpipe/sp_fs_exec.c +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@ -39,11 +39,20 @@ #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_parse.h" -struct sp_exec_fragment_shader { +struct sp_exec_fragment_shader +{ struct sp_fragment_shader base; + const struct tgsi_token *machine_tokens; }; +/** cast wrapper */ +static INLINE struct sp_exec_fragment_shader * +sp_exec_fragment_shader(const struct sp_fragment_shader *base) +{ + return (struct sp_exec_fragment_shader *) base; +} + /** * Compute quad X,Y,Z,W for the four fragments in a quad. @@ -86,10 +95,20 @@ exec_prepare( const struct sp_fragment_shader *base, struct tgsi_exec_machine *machine, struct tgsi_sampler *samplers ) { - tgsi_exec_machine_bind_shader( machine, - base->shader.tokens, - PIPE_MAX_SAMPLERS, - samplers ); + struct sp_exec_fragment_shader *spefs = + sp_exec_fragment_shader(base); + + /* + * Bind tokens/shader to the interpreter's machine state. + * Avoid redundant binding. + */ + if (spefs->machine_tokens != base->shader.tokens) { + tgsi_exec_machine_bind_shader( machine, + base->shader.tokens, + PIPE_MAX_SAMPLERS, + samplers ); + spefs->machine_tokens = base->shader.tokens; + } } diff --git a/src/gallium/drivers/trace/README b/src/gallium/drivers/trace/README index e7a2f12b022..f0e1cd596d3 100644 --- a/src/gallium/drivers/trace/README +++ b/src/gallium/drivers/trace/README @@ -10,7 +10,7 @@ This directory contains a Gallium3D pipe driver which traces all incoming calls. To build, invoke scons on the top dir as - scons statetrackers=mesa drivers=softpipe,i915simple,trace winsys=xlib + scons statetrackers=mesa drivers=softpipe,i965simple,trace winsys=xlib = Usage = |