diff options
author | Keith Whitwell <[email protected]> | 2009-01-09 10:08:06 +0000 |
---|---|---|
committer | Keith Whitwell <[email protected]> | 2009-01-09 10:08:06 +0000 |
commit | e3734593aea202e99e77febea7b86c904080939f (patch) | |
tree | 69856674e2062c55ec5eec95eb0e31225a943084 /src/gallium/drivers | |
parent | 221352bbd79a0ea92ce31cffb65537f62ee5668e (diff) | |
parent | 5cad143e545775acacee294049345c6a3868c51d (diff) |
Merge commit 'origin/gallium-0.2' into gallium-xlib-rework
Conflicts:
progs/glsl/Makefile
Diffstat (limited to 'src/gallium/drivers')
20 files changed, 1143 insertions, 816 deletions
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c index 22d552d8e3d..8f502823f99 100644 --- a/src/gallium/drivers/cell/ppu/cell_context.c +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -162,7 +162,7 @@ cell_create_context(struct pipe_screen *screen, */ /* This call only works with SDK 3.0. Anyone still using 2.1??? */ cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1); - cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0); + cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); if (cell->debug_flags) { printf("Cell: found %d Cell(s) with %u SPUs\n", cell->num_cells, cell->num_spus); diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index 96a1743fc10..8f3deb482e6 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -2,6 +2,7 @@ * * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. + * Copyright 2009 VMware, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the @@ -75,13 +76,25 @@ struct codegen int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */ + int addr_reg; /**< address register, integer values */ + /** Per-instruction temps / intermediate temps */ int num_itemps; int itemps[12]; /** Current IF/ELSE/ENDIF nesting level */ int if_nesting; - /** Index of execution mask register */ + /** Current BGNLOOP/ENDLOOP nesting level */ + int loop_nesting; + /** Location of start of current loop */ + int loop_start; + + /** Index of if/conditional mask register */ + int cond_mask_reg; + /** Index of loop mask register */ + int loop_mask_reg; + + /** Index of master execution mask register */ int exec_mask_reg; /** KIL mask: indicates which fragments have been killed */ @@ -145,10 +158,33 @@ get_const_one_reg(struct codegen *gen) /** - * Return index of the pixel execution mask. + * Return index of the address register. + * Used for indirect register loads/stores. + */ +static int +get_address_reg(struct codegen *gen) +{ + if (gen->addr_reg <= 0) { + gen->addr_reg = spe_allocate_available_register(gen->f); + + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "INIT CONSTANT 1.0:"); + + /* init addr = {0, 0, 0, 0} */ + spe_zero(gen->f, gen->addr_reg); + + spe_indent(gen->f, -4); + } + + return gen->addr_reg; +} + + +/** + * Return index of the master execution mask. * The register is allocated an initialized upon the first call. * - * The pixel execution mask controls which pixels in a quad are + * The master execution mask controls which pixels in a quad are * modified, according to surrounding conditionals, loops, etc. */ static int @@ -157,19 +193,40 @@ get_exec_mask_reg(struct codegen *gen) if (gen->exec_mask_reg <= 0) { gen->exec_mask_reg = spe_allocate_available_register(gen->f); - spe_indent(gen->f, 4); - spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:"); - - /* exec_mask = {~0, ~0, ~0, ~0} */ + /* XXX this may not be needed */ + spe_comment(gen->f, 0*-4, "initialize master execution mask = ~0"); spe_load_int(gen->f, gen->exec_mask_reg, ~0); - - spe_indent(gen->f, -4); } return gen->exec_mask_reg; } +/** Return index of the conditional (if/else) execution mask register */ +static int +get_cond_mask_reg(struct codegen *gen) +{ + if (gen->cond_mask_reg <= 0) { + gen->cond_mask_reg = spe_allocate_available_register(gen->f); + } + + return gen->cond_mask_reg; +} + + +/** Return index of the loop execution mask register */ +static int +get_loop_mask_reg(struct codegen *gen) +{ + if (gen->loop_mask_reg <= 0) { + gen->loop_mask_reg = spe_allocate_available_register(gen->f); + } + + return gen->loop_mask_reg; +} + + + static boolean is_register_src(struct codegen *gen, int channel, const struct tgsi_full_src_register *src) @@ -231,16 +288,22 @@ get_src_reg(struct codegen *gen, spe_xor(gen->f, reg, reg, reg); } else { + int index = src->SrcRegister.Index; + assert(swizzle < 4); + if (src->SrcRegister.Indirect) { + /* XXX unfinished */ + } + switch (src->SrcRegister.File) { case TGSI_FILE_TEMPORARY: - reg = gen->temp_regs[src->SrcRegister.Index][swizzle]; + reg = gen->temp_regs[index][swizzle]; break; case TGSI_FILE_INPUT: { /* offset is measured in quadwords, not bytes */ - int offset = src->SrcRegister.Index * 4 + swizzle; + int offset = index * 4 + swizzle; reg = get_itemp(gen); reg_is_itemp = TRUE; /* Load: reg = memory[(machine_reg) + offset] */ @@ -248,12 +311,12 @@ get_src_reg(struct codegen *gen, } break; case TGSI_FILE_IMMEDIATE: - reg = gen->imm_regs[src->SrcRegister.Index][swizzle]; + reg = gen->imm_regs[index][swizzle]; break; case TGSI_FILE_CONSTANT: { /* offset is measured in quadwords, not bytes */ - int offset = src->SrcRegister.Index * 4 + swizzle; + int offset = index * 4 + swizzle; reg = get_itemp(gen); reg_is_itemp = TRUE; /* Load: reg = memory[(machine_reg) + offset] */ @@ -322,7 +385,7 @@ get_dst_reg(struct codegen *gen, switch (dest->DstRegister.File) { case TGSI_FILE_TEMPORARY: - if (gen->if_nesting > 0) + if (gen->if_nesting > 0 || gen->loop_nesting > 0) reg = get_itemp(gen); else reg = gen->temp_regs[dest->DstRegister.Index][channel]; @@ -367,7 +430,7 @@ store_dest_reg(struct codegen *gen, switch (dest->DstRegister.File) { case TGSI_FILE_TEMPORARY: - if (gen->if_nesting > 0) { + if (gen->if_nesting > 0 || gen->loop_nesting > 0) { int d_reg = gen->temp_regs[dest->DstRegister.Index][channel]; int exec_reg = get_exec_mask_reg(gen); /* Mix d with new value according to exec mask: @@ -384,7 +447,7 @@ store_dest_reg(struct codegen *gen, { /* offset is measured in quadwords, not bytes */ int offset = dest->DstRegister.Index * 4 + channel; - if (gen->if_nesting > 0) { + if (gen->if_nesting > 0 || gen->loop_nesting > 0) { int exec_reg = get_exec_mask_reg(gen); int curval_reg = get_itemp(gen); /* First read the current value from memory: @@ -488,96 +551,118 @@ emit_epilogue(struct codegen *gen) } +#define FOR_EACH_ENABLED_CHANNEL(inst, ch) \ + for (ch = 0; ch < 4; ch++) \ + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) + + +static boolean +emit_ARL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch = 0, src_reg, addr_reg; + + spe_comment(gen->f, -4, "ARL:"); + + src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + addr_reg = get_address_reg(gen); + + /* convert float to int */ + spe_cflts(gen->f, addr_reg, src_reg, 0); + + free_itemps(gen); + + return TRUE; +} + + static boolean emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, src_reg[4], dst_reg[4]; spe_comment(gen->f, -4, "MOV:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) && - is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) { - /* special-case: register to memory store */ - store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]); - } - else { - spe_move(gen->f, dst_reg[ch], src_reg[ch]); - store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]); - } - free_itemps(gen); + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) && + is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) { + /* special-case: register to memory store */ + store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]); + } + else { + spe_move(gen->f, dst_reg[ch], src_reg[ch]); + store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]); } } - return true; + + free_itemps(gen); + + return TRUE; } /** - * Emit addition instructions. Recall that a single TGSI_OPCODE_ADD - * becomes (up to) four SPU "fa" instructions because we're doing SOA - * processing. + * Emit binary operation */ static boolean -emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) +emit_binop(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], s2_reg[4], d_reg[4]; - spe_comment(gen->f, -4, "ADD:"); + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ADD: + spe_comment(gen->f, -4, "ADD:"); + break; + case TGSI_OPCODE_SUB: + spe_comment(gen->f, -4, "SUB:"); + break; + case TGSI_OPCODE_MUL: + spe_comment(gen->f, -4, "MUL:"); + break; + default: + assert(0); + } + /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */ - for (ch = 0; ch < 4; ch++) { - /* If the dest R, G, B or A writemask is enabled... */ - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); } - /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */ - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - /* Emit actual SPE instruction: d = s1 + s2 */ + + /* Loop over Red/Green/Blue/Alpha channels, do the op, store results */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + /* Emit actual SPE instruction: d = s1 + s2 */ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ADD: spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); - /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ - store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); - /* Free any intermediate temps we allocated */ - free_itemps(gen); + break; + case TGSI_OPCODE_SUB: + spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_MUL: + spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + default: + ; } } - return true; -} -/** - * Emit subtract. See emit_ADD for comments. - */ -static boolean -emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst) -{ - int ch, s1_reg[4], s2_reg[4], d_reg[4]; - spe_comment(gen->f, -4, "SUB:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - } + /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - /* d = s1 - s2 */ - spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); - store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } - } - return true; + + /* Free any intermediate temps we allocated */ + free_itemps(gen); + + return TRUE; } + /** * Emit multiply add. See emit_ADD for comments. */ @@ -586,23 +671,20 @@ emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4]; spe_comment(gen->f, -4, "MAD:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); - d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - /* d = s1 * s2 + s3 */ - spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]); - store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); } - return true; + free_itemps(gen); + return TRUE; } @@ -615,132 +697,108 @@ emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst) int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4]; spe_comment(gen->f, -4, "LERP:"); /* setup/get src/dst/temp regs */ - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); - d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - tmp_reg[ch] = get_itemp(gen); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); } /* d = s3 + s1(s2 - s3) */ /* do all subtracts, then all fma, then all stores to better pipeline */ - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]); } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]); } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); } free_itemps(gen); - return true; + return TRUE; } + + /** - * Emit multiply. See emit_ADD for comments. + * Emit reciprocal or recip sqrt. */ static boolean -emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) +emit_RCP_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch, s1_reg[4], s2_reg[4], d_reg[4]; - spe_comment(gen->f, -4, "MUL:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - } + int ch, s1_reg[4], d_reg[4], tmp_reg[4]; + + if (inst->Instruction.Opcode == TGSI_OPCODE_RCP) { + spe_comment(gen->f, -4, "RCP:"); } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - /* d = s1 * s2 */ - spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); - store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + else { + assert(inst->Instruction.Opcode == TGSI_OPCODE_RSQ); + spe_comment(gen->f, -4, "RSQ:"); } - return true; -} -/** - * Emit reciprocal. See emit_ADD for comments. - */ -static boolean -emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst) -{ - int ch; - spe_comment(gen->f, -4, "RCP:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* d = 1/s1 */ - spe_frest(gen->f, d_reg, s1_reg); - spe_fi(gen->f, d_reg, s1_reg, d_reg); - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); } - return true; -} -/** - * Emit reciprocal sqrt. See emit_ADD for comments. - */ -static boolean -emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst) -{ - int ch; - spe_comment(gen->f, -4, "RSQ:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* d = 1/s1 */ - spe_frsqest(gen->f, d_reg, s1_reg); - spe_fi(gen->f, d_reg, s1_reg, d_reg); - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (inst->Instruction.Opcode == TGSI_OPCODE_RCP) { + /* tmp = 1/s1 */ + spe_frest(gen->f, tmp_reg[ch], s1_reg[ch]); + } + else { + /* tmp = 1/sqrt(s1) */ + spe_frsqest(gen->f, tmp_reg[ch], s1_reg[ch]); } } - return true; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + /* d = float_interp(s1, tmp) */ + spe_fi(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]); + } + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; } + /** * Emit absolute value. See emit_ADD for comments. */ static boolean emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s1_reg[4], d_reg[4]; + const int bit31mask_reg = get_itemp(gen); + spe_comment(gen->f, -4, "ABS:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - const int bit31mask_reg = get_itemp(gen); - /* mask with bit 31 set, the rest cleared */ - spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); + /* mask with bit 31 set, the rest cleared */ + spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); - /* d = sign bit cleared in s1 */ - spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg); + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + /* d = sign bit cleared in s1 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_andc(gen->f, d_reg[ch], s1_reg[ch], bit31mask_reg); } - return true; + + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return TRUE; } /** @@ -775,16 +833,14 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst) /* t0 = t0 + t1 */ spe_fa(gen->f, t0_reg, t0_reg, t1_reg); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - spe_move(gen->f, d_reg, t0_reg); - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, t0_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); } free_itemps(gen); - return true; + return TRUE; } /** @@ -824,16 +880,14 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst) /* t0 = t0 + t1 */ spe_fa(gen->f, t0_reg, t0_reg, t1_reg); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - spe_move(gen->f, d_reg, t0_reg); - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, t0_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); } free_itemps(gen); - return true; + return TRUE; } /** @@ -867,16 +921,14 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst) /* t = w1 + t */ spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - spe_move(gen->f, d_reg, tmp_reg); - store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, tmp_reg); + store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); } free_itemps(gen); - return true; + return TRUE; } /** @@ -911,17 +963,15 @@ emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst) spe_frsqest(gen->f, t1_reg, t0_reg); spe_fi(gen->f, t1_reg, t0_reg, t1_reg); - for (ch = 0; ch < 3; ch++) { /* NOTE: omit W channel */ - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* dst = src[ch] * t1 */ - spe_fm(gen->f, d_reg, src_reg[ch], t1_reg); - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* dst = src[ch] * t1 */ + spe_fm(gen->f, d_reg, src_reg[ch], t1_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); } free_itemps(gen); - return true; + return TRUE; } @@ -978,201 +1028,101 @@ emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst) } free_itemps(gen); - return true; + return TRUE; } + /** - * Emit set-if-greater-than. + * Emit inequality instruction. * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as * the result but OpenGL/TGSI needs 0.0 and 1.0 results. * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND. */ static boolean -emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) +emit_inequality(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s1_reg[4], s2_reg[4], d_reg[4], one_reg; + bool complement = FALSE; - spe_comment(gen->f, -4, "SGT:"); - - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - - /* d = (s1 > s2) */ - spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); - - /* convert d from 0x0/0xffffffff to 0.0/1.0 */ - /* d = d & one_reg */ - spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); - - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_SGT: + spe_comment(gen->f, -4, "SGT:"); + break; + case TGSI_OPCODE_SLT: + spe_comment(gen->f, -4, "SLT:"); + break; + case TGSI_OPCODE_SGE: + spe_comment(gen->f, -4, "SGE:"); + complement = TRUE; + break; + case TGSI_OPCODE_SLE: + spe_comment(gen->f, -4, "SLE:"); + complement = TRUE; + break; + case TGSI_OPCODE_SEQ: + spe_comment(gen->f, -4, "SEQ:"); + break; + case TGSI_OPCODE_SNE: + spe_comment(gen->f, -4, "SNE:"); + complement = TRUE; + break; + default: + ; } - return true; -} + one_reg = get_const_one_reg(gen); -/** - * Emit set-if_less-then. See emit_SGT for comments. - */ -static boolean -emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst) -{ - int ch; - - spe_comment(gen->f, -4, "SLT:"); - - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - - /* d = (s1 < s2) */ - spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); - - /* convert d from 0x0/0xffffffff to 0.0/1.0 */ - /* d = d & one_reg */ - spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); - - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); } - return true; -} - -/** - * Emit set-if_greater-then-or-equal. See emit_SGT for comments. - */ -static boolean -emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst) -{ - int ch; - - spe_comment(gen->f, -4, "SGE:"); - - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - - /* d = (s1 >= s2) */ - spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); - - /* convert d from 0x0/0xffffffff to 0.0/1.0 */ - /* d = ~d & one_reg */ - spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg); - - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_SGT: + spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_SLT: + spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]); + break; + case TGSI_OPCODE_SGE: + spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]); + break; + case TGSI_OPCODE_SLE: + spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_SEQ: + spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + case TGSI_OPCODE_SNE: + spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + break; + default: + assert(0); } } - return true; -} - -/** - * Emit set-if_less-then-or-equal. See emit_SGT for comments. - */ -static boolean -emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst) -{ - int ch; - - spe_comment(gen->f, -4, "SLE:"); - - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - - /* d = (s1 <= s2) */ - spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); - - /* convert d from 0x0/0xffffffff to 0.0/1.0 */ - /* d = ~d & one_reg */ - spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg); - - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + /* d = d & one_reg */ + if (complement) + spe_andc(gen->f, d_reg[ch], one_reg, d_reg[ch]); + else + spe_and(gen->f, d_reg[ch], one_reg, d_reg[ch]); } - return true; -} - -/** - * Emit set-if_equal. See emit_SGT for comments. - */ -static boolean -emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst) -{ - int ch; - - spe_comment(gen->f, -4, "SEQ:"); - - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - - /* d = (s1 == s2) */ - spe_fceq(gen->f, d_reg, s1_reg, s2_reg); - - /* convert d from 0x0/0xffffffff to 0.0/1.0 */ - /* d = d & one_reg */ - spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); - - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); } - return true; + free_itemps(gen); + return TRUE; } -/** - * Emit set-if_not_equal. See emit_SGT for comments. - */ -static boolean -emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst) -{ - int ch; - - spe_comment(gen->f, -4, "SNE:"); - - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - - /* d = (s1 != s2) */ - spe_fceq(gen->f, d_reg, s1_reg, s2_reg); - spe_nor(gen->f, d_reg, d_reg, d_reg); - - /* convert d from 0x0/0xffffffff to 0.0/1.0 */ - /* d = d & one_reg */ - spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); - - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } - } - - return true; -} /** - * Emit compare. See emit_SGT for comments. + * Emit compare. */ static boolean emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst) @@ -1181,26 +1131,24 @@ emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst) spe_comment(gen->f, -4, "CMP:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - int zero_reg = get_itemp(gen); + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int zero_reg = get_itemp(gen); - spe_xor(gen->f, zero_reg, zero_reg, zero_reg); + spe_zero(gen->f, zero_reg); - /* d = (s1 < 0) ? s2 : s3 */ - spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); - spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg); + /* d = (s1 < 0) ? s2 : s3 */ + spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); + spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg); - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); } - return true; + return TRUE; } /** @@ -1211,29 +1159,34 @@ emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s1_reg[4], d_reg[4]; spe_comment(gen->f, -4, "TRUNC:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } - /* Convert float to int */ - spe_cflts(gen->f, d_reg, s1_reg, 0); + /* Convert float to int */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_cflts(gen->f, d_reg[ch], s1_reg[ch], 0); + } - /* Convert int to float */ - spe_csflt(gen->f, d_reg, d_reg, 0); + /* Convert int to float */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_csflt(gen->f, d_reg[ch], d_reg[ch], 0); + } - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); } - return true; + free_itemps(gen); + return TRUE; } + /** * Emit floor. * If negative int subtract one @@ -1243,77 +1196,103 @@ emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg; spe_comment(gen->f, -4, "FLR:"); - int zero_reg = get_itemp(gen); - spe_xor(gen->f, zero_reg, zero_reg, zero_reg); + zero_reg = get_itemp(gen); + spe_zero(gen->f, zero_reg); + one_reg = get_const_one_reg(gen); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - int tmp_reg = get_itemp(gen); + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } - /* If negative, subtract 1.0 */ - spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg); - spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg); - spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg); + /* If negative, subtract 1.0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]); + } - /* Convert float to int */ - spe_cflts(gen->f, tmp_reg, tmp_reg, 0); + /* Convert float to int */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0); + } - /* Convert int to float */ - spe_csflt(gen->f, d_reg, tmp_reg, 0); + /* Convert int to float */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_csflt(gen->f, d_reg[ch], tmp_reg[ch], 0); + } - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); } - return true; + free_itemps(gen); + return TRUE; } + /** * Compute frac = Input - FLR(Input) */ static boolean emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg; spe_comment(gen->f, -4, "FRC:"); - int zero_reg = get_itemp(gen); - spe_xor(gen->f, zero_reg, zero_reg, zero_reg); + zero_reg = get_itemp(gen); + spe_zero(gen->f, zero_reg); + one_reg = get_const_one_reg(gen); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - int tmp_reg = get_itemp(gen); + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } - /* If negative, subtract 1.0 */ - spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg); - spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg); - spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg); + /* If negative, subtract 1.0 */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]); + } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]); + } - /* Convert float to int */ - spe_cflts(gen->f, tmp_reg, tmp_reg, 0); + /* Convert float to int */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0); + } - /* Convert int to float */ - spe_csflt(gen->f, tmp_reg, tmp_reg, 0); + /* Convert int to float */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_csflt(gen->f, tmp_reg[ch], tmp_reg[ch], 0); + } - /* d = s1 - FLR(s1) */ - spe_fs(gen->f, d_reg, s1_reg, tmp_reg); + /* d = s1 - FLR(s1) */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_fs(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]); + } - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + /* store result */ + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); } - return true; + free_itemps(gen); + return TRUE; } @@ -1379,73 +1358,71 @@ emit_function_call(struct codegen *gen, retval_reg = spe_allocate_available_register(gen->f); } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int d_reg; - ubyte usedRegs[SPE_NUM_REGS]; - uint i, numUsed; - - if (!scalar) { - for (a = 0; a < num_args; a++) { - s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int d_reg; + ubyte usedRegs[SPE_NUM_REGS]; + uint i, numUsed; + + if (!scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); } + } - d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - if (!scalar || !func_called) { - /* for a scalar function, we'll really only call the function once */ + if (!scalar || !func_called) { + /* for a scalar function, we'll really only call the function once */ - numUsed = spe_get_registers_used(gen->f, usedRegs); - assert(numUsed < gen->frame_size / 16 - 2); + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); - /* save registers to stack */ - for (i = 0; i < numUsed; i++) { - uint reg = usedRegs[i]; - int offset = 2 + i; - spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); - } + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } - /* setup function arguments */ - for (a = 0; a < num_args; a++) { - spe_move(gen->f, 3 + a, s_regs[a]); - } + /* setup function arguments */ + for (a = 0; a < num_args; a++) { + spe_move(gen->f, 3 + a, s_regs[a]); + } - /* branch to function, save return addr */ - spe_brasl(gen->f, SPE_REG_RA, addr); - - /* save function's return value */ - if (scalar) - spe_move(gen->f, retval_reg, 3); - else - spe_move(gen->f, d_reg, 3); - - /* restore registers from stack */ - for (i = 0; i < numUsed; i++) { - uint reg = usedRegs[i]; - if (reg != d_reg && reg != retval_reg) { - int offset = 2 + i; - spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); - } - } + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); - func_called = TRUE; - } + /* save function's return value */ + if (scalar) + spe_move(gen->f, retval_reg, 3); + else + spe_move(gen->f, d_reg, 3); - if (scalar) { - spe_move(gen->f, d_reg, retval_reg); + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_reg && reg != retval_reg) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } } - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); + func_called = TRUE; + } + + if (scalar) { + spe_move(gen->f, d_reg, retval_reg); } + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); } if (scalar) { spe_release_register(gen->f, retval_reg); } - return true; + return TRUE; } @@ -1525,11 +1502,9 @@ emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst) } } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]); - free_itemps(gen); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]); + free_itemps(gen); } return TRUE; @@ -1549,35 +1524,31 @@ emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst) /* zero = {0,0,0,0} */ zero_reg = get_itemp(gen); - spe_load_uint(gen->f, zero_reg, 0); + spe_zero(gen->f, zero_reg); cmp_reg = get_itemp(gen); /* get src regs */ - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); } /* test if any src regs are < 0 */ - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - if (kil_reg >= 0) { - /* cmp = 0 > src ? : ~0 : 0 */ - spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]); - /* kil = kil | cmp */ - spe_or(gen->f, kil_reg, kil_reg, cmp_reg); - } - else { - kil_reg = get_itemp(gen); - /* kil = 0 > src ? : ~0 : 0 */ - spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (kil_reg >= 0) { + /* cmp = 0 > src ? : ~0 : 0 */ + spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]); + /* kil = kil | cmp */ + spe_or(gen->f, kil_reg, kil_reg, cmp_reg); + } + else { + kil_reg = get_itemp(gen); + /* kil = 0 > src ? : ~0 : 0 */ + spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]); } } - if (gen->if_nesting) { + if (gen->if_nesting || gen->loop_nesting) { /* may have been a conditional kil */ spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg); } @@ -1599,96 +1570,92 @@ emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst) /** - * Emit max. See emit_SGT for comments. + * Emit min or max. */ static boolean -emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst) +emit_MIN_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4]; spe_comment(gen->f, -4, "MAX:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - tmp_reg[ch] = get_itemp(gen); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); } /* d = (s0 > s1) ? s0 : s1 */ - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + if (inst->Instruction.Opcode == TGSI_OPCODE_MAX) spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]); - } + else + spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]); } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]); } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); - } + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); } free_itemps(gen); - return true; + return TRUE; } + /** - * Emit max. See emit_SGT for comments. + * Emit code to update the execution mask. + * This needs to be done whenever the execution status of a conditional + * or loop is changed. */ -static boolean -emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst) +static void +emit_update_exec_mask(struct codegen *gen) { - int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4]; + const int exec_reg = get_exec_mask_reg(gen); + const int cond_reg = gen->cond_mask_reg; + const int loop_reg = gen->loop_mask_reg; - spe_comment(gen->f, -4, "MIN:"); + spe_comment(gen->f, 0, "Update master execution mask"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - tmp_reg[ch] = get_itemp(gen); - } + if (gen->if_nesting > 0 && gen->loop_nesting > 0) { + /* exec_mask = cond_mask & loop_mask */ + assert(cond_reg > 0); + assert(loop_reg > 0); + spe_and(gen->f, exec_reg, cond_reg, loop_reg); } - - /* d = (s1 > s0) ? s0 : s1 */ - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]); - } + else if (gen->if_nesting > 0) { + assert(cond_reg > 0); + spe_move(gen->f, exec_reg, cond_reg); } - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]); - } + else if (gen->loop_nesting > 0) { + assert(loop_reg > 0); + spe_move(gen->f, exec_reg, loop_reg); } - - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); - } + else { + spe_load_int(gen->f, exec_reg, ~0x0); } - - free_itemps(gen); - return true; } + static boolean emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst) { const int channel = 0; - const int exec_reg = get_exec_mask_reg(gen); + int cond_reg; spe_comment(gen->f, -4, "IF:"); - /* update execution mask with the predicate register */ + cond_reg = get_cond_mask_reg(gen); + + /* XXX push cond exec mask */ + + spe_comment(gen->f, 0, "init conditional exec mask = ~0:"); + spe_load_int(gen->f, cond_reg, ~0); + + /* update conditional execution mask with the predicate register */ int tmp_reg = get_itemp(gen); int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]); @@ -1696,44 +1663,126 @@ emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst) spe_ceqi(gen->f, tmp_reg, s1_reg, 0); /* tmp = !tmp */ spe_complement(gen->f, tmp_reg, tmp_reg); - /* exec_mask = exec_mask & tmp */ - spe_and(gen->f, exec_reg, exec_reg, tmp_reg); + /* cond_mask = cond_mask & tmp */ + spe_and(gen->f, cond_reg, cond_reg, tmp_reg); gen->if_nesting++; + /* update the master execution mask */ + emit_update_exec_mask(gen); + free_itemps(gen); - return true; + return TRUE; } static boolean emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst) { - const int exec_reg = get_exec_mask_reg(gen); + const int cond_reg = get_cond_mask_reg(gen); spe_comment(gen->f, -4, "ELSE:"); - /* exec_mask = !exec_mask */ - spe_complement(gen->f, exec_reg, exec_reg); + spe_comment(gen->f, 0, "cond exec mask = !cond exec mask"); + spe_complement(gen->f, cond_reg, cond_reg); + emit_update_exec_mask(gen); - return true; + return TRUE; } static boolean emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst) { + spe_comment(gen->f, -4, "ENDIF:"); + + /* XXX todo: pop cond exec mask */ + + gen->if_nesting--; + + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_BGNLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int exec_reg, loop_reg; + + spe_comment(gen->f, -4, "BGNLOOP:"); + + exec_reg = get_exec_mask_reg(gen); + loop_reg = get_loop_mask_reg(gen); + + /* XXX push loop_exec mask */ + + spe_comment(gen->f, 0*-4, "initialize loop exec mask = ~0"); + spe_load_int(gen->f, loop_reg, ~0x0); + + gen->loop_nesting++; + gen->loop_start = spe_code_size(gen->f); /* in bytes */ + + return TRUE; +} + + +static boolean +emit_ENDLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int loop_reg = get_loop_mask_reg(gen); + const int tmp_reg = get_itemp(gen); + int offset; + + spe_comment(gen->f, -4, "ENDLOOP:"); + + /* tmp_reg = exec[0] | exec[1] | exec[2] | exec[3] */ + spe_orx(gen->f, tmp_reg, loop_reg); + + offset = gen->loop_start - spe_code_size(gen->f); /* in bytes */ + + /* branch back to top of loop if tmp_reg != 0 */ + spe_brnz(gen->f, tmp_reg, offset / 4); + + /* XXX pop loop_exec mask */ + + gen->loop_nesting--; + + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_BRK(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ const int exec_reg = get_exec_mask_reg(gen); + const int loop_reg = get_loop_mask_reg(gen); - spe_comment(gen->f, -4, "ENDIF:"); + spe_comment(gen->f, -4, "BREAK:"); - /* XXX todo: pop execution mask */ + assert(gen->loop_nesting > 0); - spe_load_int(gen->f, exec_reg, ~0x0); + spe_comment(gen->f, 0, "loop exec mask &= ~master exec mask"); + spe_andc(gen->f, loop_reg, loop_reg, exec_reg); - gen->if_nesting--; - return true; + emit_update_exec_mask(gen); + + return TRUE; +} + + +static boolean +emit_CONT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + spe_comment(gen->f, -4, "CONT:"); + + assert(gen->loop_nesting > 0); + + return TRUE; } @@ -1745,28 +1794,26 @@ emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst, spe_comment(gen->f, -4, ddx ? "DDX:" : "DDY:"); - for (ch = 0; ch < 4; ch++) { - if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + FOR_EACH_ENABLED_CHANNEL(inst, ch) { + int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - int t1_reg = get_itemp(gen); - int t2_reg = get_itemp(gen); + int t1_reg = get_itemp(gen); + int t2_reg = get_itemp(gen); - spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */ - if (ddx) { - spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */ - } - else { - spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */ - } - spe_fs(gen->f, d_reg, t2_reg, t1_reg); - - free_itemps(gen); + spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */ + if (ddx) { + spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */ + } + else { + spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */ } + spe_fs(gen->f, d_reg, t2_reg, t1_reg); + + free_itemps(gen); } - return true; + return TRUE; } @@ -1784,7 +1831,7 @@ emit_END(struct codegen *gen) { spe_comment(gen->f, -4, "END:"); emit_epilogue(gen); - return true; + return TRUE; } @@ -1796,15 +1843,15 @@ emit_instruction(struct codegen *gen, const struct tgsi_full_instruction *inst) { switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: + return emit_ARL(gen, inst); case TGSI_OPCODE_MOV: case TGSI_OPCODE_SWZ: return emit_MOV(gen, inst); - case TGSI_OPCODE_MUL: - return emit_MUL(gen, inst); case TGSI_OPCODE_ADD: - return emit_ADD(gen, inst); case TGSI_OPCODE_SUB: - return emit_SUB(gen, inst); + case TGSI_OPCODE_MUL: + return emit_binop(gen, inst); case TGSI_OPCODE_MAD: return emit_MAD(gen, inst); case TGSI_OPCODE_LERP: @@ -1820,29 +1867,22 @@ emit_instruction(struct codegen *gen, case TGSI_OPCODE_XPD: return emit_XPD(gen, inst); case TGSI_OPCODE_RCP: - return emit_RCP(gen, inst); case TGSI_OPCODE_RSQ: - return emit_RSQ(gen, inst); + return emit_RCP_RSQ(gen, inst); case TGSI_OPCODE_ABS: return emit_ABS(gen, inst); case TGSI_OPCODE_SGT: - return emit_SGT(gen, inst); case TGSI_OPCODE_SLT: - return emit_SLT(gen, inst); case TGSI_OPCODE_SGE: - return emit_SGE(gen, inst); case TGSI_OPCODE_SLE: - return emit_SLE(gen, inst); case TGSI_OPCODE_SEQ: - return emit_SEQ(gen, inst); case TGSI_OPCODE_SNE: - return emit_SNE(gen, inst); + return emit_inequality(gen, inst); case TGSI_OPCODE_CMP: return emit_CMP(gen, inst); - case TGSI_OPCODE_MAX: - return emit_MAX(gen, inst); case TGSI_OPCODE_MIN: - return emit_MIN(gen, inst); + case TGSI_OPCODE_MAX: + return emit_MIN_MAX(gen, inst); case TGSI_OPCODE_TRUNC: return emit_TRUNC(gen, inst); case TGSI_OPCODE_FLR: @@ -1882,20 +1922,29 @@ emit_instruction(struct codegen *gen, case TGSI_OPCODE_ENDIF: return emit_ENDIF(gen, inst); + case TGSI_OPCODE_BGNLOOP2: + return emit_BGNLOOP(gen, inst); + case TGSI_OPCODE_ENDLOOP2: + return emit_ENDLOOP(gen, inst); + case TGSI_OPCODE_BRK: + return emit_BRK(gen, inst); + case TGSI_OPCODE_CONT: + return emit_CONT(gen, inst); + case TGSI_OPCODE_DDX: - return emit_DDX_DDY(gen, inst, true); + return emit_DDX_DDY(gen, inst, TRUE); case TGSI_OPCODE_DDY: - return emit_DDX_DDY(gen, inst, false); + return emit_DDX_DDY(gen, inst, FALSE); /* XXX lots more cases to do... */ default: fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n", inst->Instruction.Opcode); - return false; + return FALSE; } - return true; + return TRUE; } @@ -1923,10 +1972,14 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1]; } else { + char str[100]; int reg = spe_allocate_available_register(gen->f); if (reg < 0) - return false; + return FALSE; + + sprintf(str, "init $%d = %f", reg, val); + spe_comment(gen->f, 0, str); /* update immediate map */ gen->imm_regs[gen->num_imm][ch] = reg; @@ -1938,7 +1991,7 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) gen->num_imm++; - return true; + return TRUE; } @@ -1963,7 +2016,7 @@ emit_declaration(struct cell_context *cell, for (ch = 0; ch < 4; ch++) { gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); if (gen->temp_regs[i][ch] < 0) - return false; /* out of regs */ + return FALSE; /* out of regs */ } /* XXX if we run out of SPE registers, we need to spill @@ -1983,7 +2036,7 @@ emit_declaration(struct cell_context *cell, ; /* ignore */ } - return true; + return TRUE; } @@ -2019,7 +2072,7 @@ cell_gen_fragment_program(struct cell_context *cell, spe_allocate_register(f, gen.constants_reg); if (cell->debug_flags & CELL_DEBUG_ASM) { - spe_print_code(f, true); + spe_print_code(f, TRUE); spe_indent(f, 8); printf("Begin %s\n", __FUNCTION__); tgsi_dump(tokens, 0); @@ -2035,17 +2088,17 @@ cell_gen_fragment_program(struct cell_context *cell, switch (parse.FullToken.Token.Type) { case TGSI_TOKEN_TYPE_IMMEDIATE: if (!emit_immediate(&gen, &parse.FullToken.FullImmediate)) - gen.error = true; + gen.error = TRUE; break; case TGSI_TOKEN_TYPE_DECLARATION: if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration)) - gen.error = true; + gen.error = TRUE; break; case TGSI_TOKEN_TYPE_INSTRUCTION: if (!emit_instruction(&gen, &parse.FullToken.FullInstruction)) - gen.error = true; + gen.error = TRUE; break; default: diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c index d2235579507..6fc2257e2a3 100644 --- a/src/gallium/drivers/cell/ppu/cell_screen.c +++ b/src/gallium/drivers/cell/ppu/cell_screen.c @@ -81,8 +81,12 @@ cell_get_param(struct pipe_screen *screen, int param) return 8; /* max 128x128x128 */ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: return CELL_MAX_TEXTURE_LEVELS; + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; /* XXX not really true */ + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + return 0; /* XXX to do */ default: - return 10; + return 0; } } @@ -108,7 +112,7 @@ cell_get_paramf(struct pipe_screen *screen, int param) return 16.0; /* arbitrary */ default: - return 10; + return 0; } } diff --git a/src/gallium/drivers/cell/spu/spu_shuffle.h b/src/gallium/drivers/cell/spu/spu_shuffle.h new file mode 100644 index 00000000000..7cbdb814d28 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_shuffle.h @@ -0,0 +1,186 @@ +#ifndef SPU_SHUFFLE_H +#define SPU_SHUFFLE_H + +/* + * Generate shuffle patterns with minimal fuss. + * + * Based on ideas from + * http://www.insomniacgames.com/tech/articles/0408/files/shuffles.pdf + * + * A-P indicates 0-15th position in first vector + * a-p indicates 0-15th position in second vector + * + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * |00|01|02|03|04|05|06|07|08|09|0a|0b|0c|0d|0e|0f| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * | A| B| C| D| + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | A| B| C| D| E| F| G| H| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * | A| B| C| D| E| F| G| H| I| J| K| L| M| N| O| P| + * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ + * + * x or X indicates 0xff + * 8 indicates 0x80 + * 0 indicates 0x00 + * + * The macros SHUFFLE4() SHUFFLE8() and SHUFFLE16() provide a const vector + * unsigned char literal suitable for use with spu_shuffle(). + * + * The macros SHUFB4() SHUFB8() and SHUFB16() provide a const qword vector + * literal suitable for use with si_shufb(). + * + * + * For example : + * SHUFB4(A,A,A,A) + * expands to : + * ((const qword){0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3}) + * + * SHUFFLE8(A,B,a,b,C,c,8,8) + * expands to : + * ((const vector unsigned char){0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13, + * 0x04,0x05,0x14,0x15,0xe0,0xe0,0xe0,0xe0}) + * + */ + +#include <spu_intrinsics.h> + +#define SHUFFLE_PATTERN_4_A__ 0x00, 0x01, 0x02, 0x03 +#define SHUFFLE_PATTERN_4_B__ 0x04, 0x05, 0x06, 0x07 +#define SHUFFLE_PATTERN_4_C__ 0x08, 0x09, 0x0a, 0x0b +#define SHUFFLE_PATTERN_4_D__ 0x0c, 0x0d, 0x0e, 0x0f +#define SHUFFLE_PATTERN_4_a__ 0x10, 0x11, 0x12, 0x13 +#define SHUFFLE_PATTERN_4_b__ 0x14, 0x15, 0x16, 0x17 +#define SHUFFLE_PATTERN_4_c__ 0x18, 0x19, 0x1a, 0x1b +#define SHUFFLE_PATTERN_4_d__ 0x1c, 0x1d, 0x1e, 0x1f +#define SHUFFLE_PATTERN_4_X__ 0xc0, 0xc0, 0xc0, 0xc0 +#define SHUFFLE_PATTERN_4_x__ 0xc0, 0xc0, 0xc0, 0xc0 +#define SHUFFLE_PATTERN_4_0__ 0x80, 0x80, 0x80, 0x80 +#define SHUFFLE_PATTERN_4_8__ 0xe0, 0xe0, 0xe0, 0xe0 + +#define SHUFFLE_VECTOR_4__(A, B, C, D) \ + SHUFFLE_PATTERN_4_##A##__, \ + SHUFFLE_PATTERN_4_##B##__, \ + SHUFFLE_PATTERN_4_##C##__, \ + SHUFFLE_PATTERN_4_##D##__ + +#define SHUFFLE4(A, B, C, D) \ + ((const vector unsigned char){ \ + SHUFFLE_VECTOR_4__(A, B, C, D) \ + }) + +#define SHUFB4(A, B, C, D) \ + ((const qword){ \ + SHUFFLE_VECTOR_4__(A, B, C, D) \ + }) + + +#define SHUFFLE_PATTERN_8_A__ 0x00, 0x01 +#define SHUFFLE_PATTERN_8_B__ 0x02, 0x03 +#define SHUFFLE_PATTERN_8_C__ 0x04, 0x05 +#define SHUFFLE_PATTERN_8_D__ 0x06, 0x07 +#define SHUFFLE_PATTERN_8_E__ 0x08, 0x09 +#define SHUFFLE_PATTERN_8_F__ 0x0a, 0x0b +#define SHUFFLE_PATTERN_8_G__ 0x0c, 0x0d +#define SHUFFLE_PATTERN_8_H__ 0x0e, 0x0f +#define SHUFFLE_PATTERN_8_a__ 0x10, 0x11 +#define SHUFFLE_PATTERN_8_b__ 0x12, 0x13 +#define SHUFFLE_PATTERN_8_c__ 0x14, 0x15 +#define SHUFFLE_PATTERN_8_d__ 0x16, 0x17 +#define SHUFFLE_PATTERN_8_e__ 0x18, 0x19 +#define SHUFFLE_PATTERN_8_f__ 0x1a, 0x1b +#define SHUFFLE_PATTERN_8_g__ 0x1c, 0x1d +#define SHUFFLE_PATTERN_8_h__ 0x1e, 0x1f +#define SHUFFLE_PATTERN_8_X__ 0xc0, 0xc0 +#define SHUFFLE_PATTERN_8_x__ 0xc0, 0xc0 +#define SHUFFLE_PATTERN_8_0__ 0x80, 0x80 +#define SHUFFLE_PATTERN_8_8__ 0xe0, 0xe0 + + +#define SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ + SHUFFLE_PATTERN_8_##A##__, \ + SHUFFLE_PATTERN_8_##B##__, \ + SHUFFLE_PATTERN_8_##C##__, \ + SHUFFLE_PATTERN_8_##D##__, \ + SHUFFLE_PATTERN_8_##E##__, \ + SHUFFLE_PATTERN_8_##F##__, \ + SHUFFLE_PATTERN_8_##G##__, \ + SHUFFLE_PATTERN_8_##H##__ + +#define SHUFFLE8(A, B, C, D, E, F, G, H) \ + ((const vector unsigned char){ \ + SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ + }) + +#define SHUFB8(A, B, C, D, E, F, G, H) \ + ((const qword){ \ + SHUFFLE_VECTOR_8__(A, B, C, D, E, F, G, H) \ + }) + + +#define SHUFFLE_PATTERN_16_A__ 0x00 +#define SHUFFLE_PATTERN_16_B__ 0x01 +#define SHUFFLE_PATTERN_16_C__ 0x02 +#define SHUFFLE_PATTERN_16_D__ 0x03 +#define SHUFFLE_PATTERN_16_E__ 0x04 +#define SHUFFLE_PATTERN_16_F__ 0x05 +#define SHUFFLE_PATTERN_16_G__ 0x06 +#define SHUFFLE_PATTERN_16_H__ 0x07 +#define SHUFFLE_PATTERN_16_I__ 0x08 +#define SHUFFLE_PATTERN_16_J__ 0x09 +#define SHUFFLE_PATTERN_16_K__ 0x0a +#define SHUFFLE_PATTERN_16_L__ 0x0b +#define SHUFFLE_PATTERN_16_M__ 0x0c +#define SHUFFLE_PATTERN_16_N__ 0x0d +#define SHUFFLE_PATTERN_16_O__ 0x0e +#define SHUFFLE_PATTERN_16_P__ 0x0f +#define SHUFFLE_PATTERN_16_a__ 0x10 +#define SHUFFLE_PATTERN_16_b__ 0x11 +#define SHUFFLE_PATTERN_16_c__ 0x12 +#define SHUFFLE_PATTERN_16_d__ 0x13 +#define SHUFFLE_PATTERN_16_e__ 0x14 +#define SHUFFLE_PATTERN_16_f__ 0x15 +#define SHUFFLE_PATTERN_16_g__ 0x16 +#define SHUFFLE_PATTERN_16_h__ 0x17 +#define SHUFFLE_PATTERN_16_i__ 0x18 +#define SHUFFLE_PATTERN_16_j__ 0x19 +#define SHUFFLE_PATTERN_16_k__ 0x1a +#define SHUFFLE_PATTERN_16_l__ 0x1b +#define SHUFFLE_PATTERN_16_m__ 0x1c +#define SHUFFLE_PATTERN_16_n__ 0x1d +#define SHUFFLE_PATTERN_16_o__ 0x1e +#define SHUFFLE_PATTERN_16_p__ 0x1f +#define SHUFFLE_PATTERN_16_X__ 0xc0 +#define SHUFFLE_PATTERN_16_x__ 0xc0 +#define SHUFFLE_PATTERN_16_0__ 0x80 +#define SHUFFLE_PATTERN_16_8__ 0xe0 + +#define SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + SHUFFLE_PATTERN_16_##A##__, \ + SHUFFLE_PATTERN_16_##B##__, \ + SHUFFLE_PATTERN_16_##C##__, \ + SHUFFLE_PATTERN_16_##D##__, \ + SHUFFLE_PATTERN_16_##E##__, \ + SHUFFLE_PATTERN_16_##F##__, \ + SHUFFLE_PATTERN_16_##G##__, \ + SHUFFLE_PATTERN_16_##H##__, \ + SHUFFLE_PATTERN_16_##I##__, \ + SHUFFLE_PATTERN_16_##J##__, \ + SHUFFLE_PATTERN_16_##K##__, \ + SHUFFLE_PATTERN_16_##L##__, \ + SHUFFLE_PATTERN_16_##M##__, \ + SHUFFLE_PATTERN_16_##N##__, \ + SHUFFLE_PATTERN_16_##O##__, \ + SHUFFLE_PATTERN_16_##P + +#define SHUFFLE16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + ((const vector unsigned char){ \ + SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + }) + +#define SHUFB16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + ((const qword){ \ + SHUFFLE_VECTOR_16__(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + }) + +#endif diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c index 22e51a86ae5..322be1252e9 100644 --- a/src/gallium/drivers/cell/spu/spu_tri.c +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -35,6 +35,7 @@ #include "util/u_math.h" #include "spu_colorpack.h" #include "spu_main.h" +#include "spu_shuffle.h" #include "spu_texture.h" #include "spu_tile.h" #include "spu_tri.h" @@ -76,8 +77,13 @@ struct vertex_header { * Triangle edge info */ struct edge { - float dx; /**< X(v1) - X(v0), used only during setup */ - float dy; /**< Y(v1) - Y(v0), used only during setup */ + union { + struct { + float dx; /**< X(v1) - X(v0), used only during setup */ + float dy; /**< Y(v1) - Y(v0), used only during setup */ + }; + vec_float4 ds; /**< vector accessor for dx and dy */ + }; float dxdy; /**< dx/dy */ float sx, sy; /**< first sample point coord */ int lines; /**< number of lines on this edge */ @@ -102,10 +108,15 @@ struct setup_stage { * turn. Currently fixed at 4 floats, but should change in time. * Codegen will help cope with this. */ - const struct vertex_header *vmax; - const struct vertex_header *vmid; - const struct vertex_header *vmin; - const struct vertex_header *vprovoke; + union { + struct { + const struct vertex_header *vmin; + const struct vertex_header *vmid; + const struct vertex_header *vmax; + const struct vertex_header *vprovoke; + }; + qword vertex_headers; + }; struct edge ebot; struct edge etop; @@ -122,8 +133,7 @@ struct setup_stage { struct interp_coef coef[PIPE_MAX_SHADER_INPUTS]; struct { - int left[2]; /**< [0] = row0, [1] = row1 */ - int right[2]; + vec_int4 quad; /**< [0] = row0, [1] = row1; {left[0],left[1],right[0],right[1]} */ int y; unsigned y_flags; unsigned mask; /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */ @@ -306,52 +316,35 @@ block(int x) /** - * Compute mask which indicates which pixels in the 2x2 quad are actually inside - * the triangle's bounds. - * The mask is a uint4 vector and each element will be 0 or 0xffffffff. - */ -static INLINE mask_t -calculate_mask(int x) -{ - /* This is a little tricky. - * Use & instead of && to avoid branches. - * Use negation to convert true/false to ~0/0 values. - */ - mask_t mask; - mask = spu_insert(-((x >= setup.span.left[0]) & (x < setup.span.right[0])), mask, 0); - mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1); - mask = spu_insert(-((x >= setup.span.left[1]) & (x < setup.span.right[1])), mask, 2); - mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3); - return mask; -} - - -/** * Render a horizontal span of quads */ static void flush_spans(void) { int minleft, maxright; - int x; + + const int l0 = spu_extract(setup.span.quad, 0); + const int l1 = spu_extract(setup.span.quad, 1); + const int r0 = spu_extract(setup.span.quad, 2); + const int r1 = spu_extract(setup.span.quad, 3); switch (setup.span.y_flags) { case 0x3: /* both odd and even lines written (both quad rows) */ - minleft = MIN2(setup.span.left[0], setup.span.left[1]); - maxright = MAX2(setup.span.right[0], setup.span.right[1]); + minleft = MIN2(l0, l1); + maxright = MAX2(r0, r1); break; case 0x1: /* only even line written (quad top row) */ - minleft = setup.span.left[0]; - maxright = setup.span.right[0]; + minleft = l0; + maxright = r0; break; case 0x2: /* only odd line written (quad bottom row) */ - minleft = setup.span.left[1]; - maxright = setup.span.right[1]; + minleft = l1; + maxright = r1; break; default: @@ -389,17 +382,42 @@ flush_spans(void) ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED); } - /* XXX this loop could be moved into the above switch cases and - * calculate_mask() could be simplified a bit... - */ - for (x = block(minleft); x <= block(maxright); x += 2) { - emit_quad( x, setup.span.y, calculate_mask( x )); + /* XXX this loop could be moved into the above switch cases... */ + + /* Setup for mask calculation */ + const vec_int4 quad_LlRr = setup.span.quad; + const vec_int4 quad_RrLl = spu_rlqwbyte(quad_LlRr, 8); + const vec_int4 quad_LLll = spu_shuffle(quad_LlRr, quad_LlRr, SHUFFLE4(A,A,B,B)); + const vec_int4 quad_RRrr = spu_shuffle(quad_RrLl, quad_RrLl, SHUFFLE4(A,A,B,B)); + + const vec_int4 twos = spu_splats(2); + + const int x = block(minleft); + vec_int4 xs = {x, x+1, x, x+1}; + + for (; spu_extract(xs, 0) <= block(maxright); xs += twos) { + /** + * Computes mask to indicate which pixels in the 2x2 quad are actually + * inside the triangle's bounds. + */ + + /* Calculate ({x,x+1,x,x+1} >= {l[0],l[0],l[1],l[1]}) */ + const mask_t gt_LLll_xs = spu_cmpgt(quad_LLll, xs); + const mask_t gte_xs_LLll = spu_nand(gt_LLll_xs, gt_LLll_xs); + + /* Calculate ({r[0],r[0],r[1],r[1]} > {x,x+1,x,x+1}) */ + const mask_t gt_RRrr_xs = spu_cmpgt(quad_RRrr, xs); + + /* Combine results to create mask */ + const mask_t mask = spu_and(gte_xs_LLll, gt_RRrr_xs); + + emit_quad(spu_extract(xs, 0), setup.span.y, mask); } setup.span.y = 0; setup.span.y_flags = 0; - setup.span.right[0] = 0; - setup.span.right[1] = 0; + /* Zero right elements */ + setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); } @@ -444,55 +462,39 @@ setup_sort_vertices(const struct vertex_header *v0, /* determine bottom to top order of vertices */ { - float y0 = spu_extract(v0->data[0], 1); - float y1 = spu_extract(v1->data[0], 1); - float y2 = spu_extract(v2->data[0], 1); - if (y0 <= y1) { - if (y1 <= y2) { - /* y0<=y1<=y2 */ - setup.vmin = v0; - setup.vmid = v1; - setup.vmax = v2; - sign = -1.0f; - } - else if (y2 <= y0) { - /* y2<=y0<=y1 */ - setup.vmin = v2; - setup.vmid = v0; - setup.vmax = v1; - sign = -1.0f; - } - else { - /* y0<=y2<=y1 */ - setup.vmin = v0; - setup.vmid = v2; - setup.vmax = v1; - sign = 1.0f; - } - } - else { - if (y0 <= y2) { - /* y1<=y0<=y2 */ - setup.vmin = v1; - setup.vmid = v0; - setup.vmax = v2; - sign = 1.0f; - } - else if (y2 <= y1) { - /* y2<=y1<=y0 */ - setup.vmin = v2; - setup.vmid = v1; - setup.vmax = v0; - sign = 1.0f; - } - else { - /* y1<=y2<=y0 */ - setup.vmin = v1; - setup.vmid = v2; - setup.vmax = v0; - sign = -1.0f; - } - } + /* A table of shuffle patterns for putting vertex_header pointers into + correct order. Quite magical. */ + const vec_uchar16 sort_order_patterns[] = { + SHUFFLE4(A,B,C,C), + SHUFFLE4(C,A,B,C), + SHUFFLE4(A,C,B,C), + SHUFFLE4(B,C,A,C), + SHUFFLE4(B,A,C,C), + SHUFFLE4(C,B,A,C) }; + + /* The vertex_header pointers, packed for easy shuffling later */ + const vec_uint4 vs = {(unsigned)v0, (unsigned)v1, (unsigned)v2}; + + /* Collate y values into two vectors for comparison. + Using only one shuffle constant! ;) */ + const vec_float4 y_02_ = spu_shuffle(v0->data[0], v2->data[0], SHUFFLE4(0,B,b,C)); + const vec_float4 y_10_ = spu_shuffle(v1->data[0], v0->data[0], SHUFFLE4(0,B,b,C)); + const vec_float4 y_012 = spu_shuffle(y_02_, v1->data[0], SHUFFLE4(0,B,b,C)); + const vec_float4 y_120 = spu_shuffle(y_10_, v2->data[0], SHUFFLE4(0,B,b,C)); + + /* Perform comparison: {y0,y1,y2} > {y1,y2,y0} */ + const vec_uint4 compare = spu_cmpgt(y_012, y_120); + /* Compress the result of the comparison into 4 bits */ + const vec_uint4 gather = spu_gather(compare); + /* Subtract one to attain the index into the LUT. Magical. */ + const unsigned int index = spu_extract(gather, 0) - 1; + + /* Load the appropriate pattern and construct the desired vector. */ + setup.vertex_headers = (qword)spu_shuffle(vs, vs, sort_order_patterns[index]); + + /* Using the result of the comparison, set sign. + Very magical. */ + sign = ((si_to_uint(si_cntb((qword)gather)) == 2) ? 1.0f : -1.0f); } /* Check if triangle is completely outside the tile bounds */ @@ -509,12 +511,9 @@ setup_sort_vertices(const struct vertex_header *v0, spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx) return FALSE; - setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0); - setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1); - setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0); - setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1); - setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0); - setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1); + setup.ebot.ds = spu_sub(setup.vmid->data[0], setup.vmin->data[0]); + setup.emaj.ds = spu_sub(setup.vmax->data[0], setup.vmin->data[0]); + setup.etop.ds = spu_sub(setup.vmax->data[0], setup.vmid->data[0]); /* * Compute triangle's area. Use 1/area to compute partial @@ -535,8 +534,6 @@ setup_sort_vertices(const struct vertex_header *v0, setup.facing = (area * sign > 0.0f) ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW); - setup.vprovoke = v2; - return TRUE; } @@ -746,9 +743,11 @@ subtriangle(struct edge *eleft, struct edge *eright, unsigned lines) setup.span.y = block(_y); } - setup.span.left[_y&1] = left; - setup.span.right[_y&1] = right; - setup.span.y_flags |= 1<<(_y&1); + int offset = _y&1; + vec_int4 quad_LlRr = {left, left, right, right}; + /* Store left and right in 0 or 1 row of quad based on offset */ + setup.span.quad = spu_sel(quad_LlRr, setup.span.quad, spu_maskw(5<<offset)); + setup.span.y_flags |= 1<<offset; } } @@ -790,8 +789,8 @@ tri_draw(const float *v0, const float *v1, const float *v2, setup.span.y = 0; setup.span.y_flags = 0; - setup.span.right[0] = 0; - setup.span.right[1] = 0; + /* Zero right elements */ + setup.span.quad = spu_shuffle(setup.span.quad, setup.span.quad, SHUFFLE4(A,B,0,0)); if (setup.oneOverArea < 0.0) { /* emaj on left */ diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c index 4fda1ab64f5..a8e97e7c306 100644 --- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c +++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c @@ -197,9 +197,7 @@ i915_vbuf_render_set_primitive( struct vbuf_render *render, i915_render->fallback = 0; return TRUE; default: - assert((int)"Error unkown primtive type" & 0); - /* Actually, can handle a lot more just fine... Fixme. - */ + /* FIXME: Actually, can handle a lot more just fine... */ return FALSE; } } diff --git a/src/gallium/drivers/nv30/nv30_query.c b/src/gallium/drivers/nv30/nv30_query.c index d40d75f2640..2f974cf5c40 100644 --- a/src/gallium/drivers/nv30/nv30_query.c +++ b/src/gallium/drivers/nv30/nv30_query.c @@ -50,7 +50,7 @@ nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq) * the existing query to notify completion, but it could be better. */ if (q->object) { - uint64 tmp; + uint64_t tmp; pipe->get_query_result(pipe, pq, 1, &tmp); } @@ -80,7 +80,7 @@ nv30_query_end(struct pipe_context *pipe, struct pipe_query *pq) static boolean nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq, - boolean wait, uint64 *result) + boolean wait, uint64_t *result) { struct nv30_context *nv30 = nv30_context(pipe); struct nv30_query *q = nv30_query(pq); diff --git a/src/gallium/drivers/nv30/nv30_state_emit.c b/src/gallium/drivers/nv30/nv30_state_emit.c index 40fed621b24..9480695d6e5 100644 --- a/src/gallium/drivers/nv30/nv30_state_emit.c +++ b/src/gallium/drivers/nv30/nv30_state_emit.c @@ -49,7 +49,7 @@ nv30_state_emit(struct nv30_context *nv30) struct nv30_state *state = &nv30->state; struct nv30_screen *screen = nv30->screen; unsigned i, samplers; - uint64 states; + uint64_t states; if (nv30->pctx_id != screen->cur_pctx) { for (i = 0; i < NV30_STATE_MAX; i++) { diff --git a/src/gallium/drivers/nv40/nv40_query.c b/src/gallium/drivers/nv40/nv40_query.c index 57f39cfab0c..9b9a43f49df 100644 --- a/src/gallium/drivers/nv40/nv40_query.c +++ b/src/gallium/drivers/nv40/nv40_query.c @@ -50,7 +50,7 @@ nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq) * the existing query to notify completion, but it could be better. */ if (q->object) { - uint64 tmp; + uint64_t tmp; pipe->get_query_result(pipe, pq, 1, &tmp); } @@ -80,7 +80,7 @@ nv40_query_end(struct pipe_context *pipe, struct pipe_query *pq) static boolean nv40_query_result(struct pipe_context *pipe, struct pipe_query *pq, - boolean wait, uint64 *result) + boolean wait, uint64_t *result) { struct nv40_context *nv40 = nv40_context(pipe); struct nv40_query *q = nv40_query(pq); diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c index ab88dc416e5..52ec4c044b4 100644 --- a/src/gallium/drivers/nv40/nv40_state_emit.c +++ b/src/gallium/drivers/nv40/nv40_state_emit.c @@ -65,7 +65,7 @@ nv40_state_emit(struct nv40_context *nv40) struct nv40_state *state = &nv40->state; struct nv40_screen *screen = nv40->screen; unsigned i, samplers; - uint64 states; + uint64_t states; if (nv40->pctx_id != screen->cur_pctx) { for (i = 0; i < NV40_STATE_MAX; i++) { diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h index 5d377f2d067..0958bba334a 100644 --- a/src/gallium/drivers/nv50/nv50_context.h +++ b/src/gallium/drivers/nv50/nv50_context.h @@ -70,6 +70,10 @@ struct nv50_rasterizer_stateobj { struct nv50_miptree { struct pipe_texture base; struct pipe_buffer *buffer; + + int *image_offset; + int image_nr; + int total_size; }; static INLINE struct nv50_miptree * diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c index 28a8bdc0fab..24973712324 100644 --- a/src/gallium/drivers/nv50/nv50_miptree.c +++ b/src/gallium/drivers/nv50/nv50_miptree.c @@ -31,7 +31,8 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) { struct pipe_winsys *ws = pscreen->winsys; struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); - unsigned usage, pitch; + unsigned usage, width = pt->width[0], height = pt->height[0]; + int i; mt->base = *pt; mt->base.refcount = 1; @@ -47,11 +48,31 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) break; } - pitch = ((pt->width[0] + 63) & ~63) * pt->block.size; - /*XXX*/ - pitch *= 2; + switch (pt->target) { + case PIPE_TEXTURE_3D: + mt->image_nr = pt->depth[0]; + break; + case PIPE_TEXTURE_CUBE: + mt->image_nr = 6; + break; + default: + mt->image_nr = 1; + break; + } + mt->image_offset = CALLOC(mt->image_nr, sizeof(int)); - mt->buffer = ws->buffer_create(ws, 256, usage, pitch * pt->height[0]); + for (i = 0; i < mt->image_nr; i++) { + int image_size; + + image_size = align(width, 8) * pt->block.size; + image_size = align(image_size, 64); + image_size *= align(height, 8) * pt->block.size; + + mt->image_offset[i] = mt->total_size; + mt->total_size += image_size; + } + + mt->buffer = ws->buffer_create(ws, 256, usage, mt->total_size); if (!mt->buffer) { FREE(mt); return NULL; @@ -83,6 +104,15 @@ nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, struct nv50_miptree *mt = nv50_miptree(pt); struct nv50_surface *s; struct pipe_surface *ps; + int img; + + if (pt->target == PIPE_TEXTURE_CUBE) + img = face; + else + if (pt->target == PIPE_TEXTURE_3D) + img = zslice; + else + img = 0; s = CALLOC_STRUCT(nv50_surface); if (!s) @@ -98,7 +128,7 @@ nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, ps->nblocksx = pt->nblocksx[level]; ps->nblocksy = pt->nblocksy[level]; ps->stride = ps->width * ps->block.size; - ps->offset = 0; + ps->offset = mt->image_offset[img]; ps->usage = flags; ps->status = PIPE_SURFACE_STATUS_DEFINED; diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index d6fbdd18243..d66e1d0949d 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -179,6 +179,38 @@ free_temp(struct nv50_pc *pc, struct nv50_reg *r) } } +static int +alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) +{ + int i; + + if ((idx + 4) >= NV50_SU_MAX_TEMP) + return 1; + + if (pc->r_temp[idx] || pc->r_temp[idx + 1] || + pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) + return alloc_temp4(pc, dst, idx + 1); + + for (i = 0; i < 4; i++) { + dst[i] = CALLOC_STRUCT(nv50_reg); + dst[i]->type = P_TEMP; + dst[i]->index = -1; + dst[i]->hw = idx + i; + pc->r_temp[idx + i] = dst[i]; + } + + return 0; +} + +static void +free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) +{ + int i; + + for (i = 0; i < 4; i++) + free_temp(pc, reg[i]); +} + static struct nv50_reg * temp_temp(struct nv50_pc *pc) { @@ -902,7 +934,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) { const struct tgsi_full_instruction *inst = &tok->FullInstruction; struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp; - unsigned mask, sat; + unsigned mask, sat, unit; int i, c; mask = inst->FullDstRegisters[0].DstRegister.WriteMask; @@ -916,8 +948,13 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) } for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; + + if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) + unit = fs->SrcRegister.Index; + for (c = 0; c < 4; c++) - src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]); + src[i][c] = tgsi_src(pc, c, fs); } if (sat) { @@ -1155,35 +1192,30 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) } break; case TGSI_OPCODE_TEX: - { - struct nv50_reg *t0, *t1, *t2, *t3; - struct nv50_program_exec *e; + case TGSI_OPCODE_TXP: + { + struct nv50_reg *t[4]; + struct nv50_program_exec *e; - t0 = alloc_temp(pc, NULL); - t0 = alloc_temp(pc, NULL); - t1 = alloc_temp(pc, NULL); - t2 = alloc_temp(pc, NULL); - t3 = alloc_temp(pc, NULL); - emit_mov(pc, t0, src[0][0]); - emit_mov(pc, t1, src[0][1]); + alloc_temp4(pc, t, 0); + emit_mov(pc, t[0], src[0][0]); + emit_mov(pc, t[1], src[0][1]); - e = exec(pc); - e->inst[0] = 0xf6400000; - set_long(pc, e); - e->inst[1] |= 0x0000c004; - set_dst(pc, t0, e); - emit(pc, e); + e = exec(pc); + e->inst[0] = 0xf6400000; + e->inst[0] |= (unit << 9); + set_long(pc, e); + e->inst[1] |= 0x0000c004; + set_dst(pc, t[0], e); + emit(pc, e); - if (mask & (1 << 0)) emit_mov(pc, dst[0], t0); - if (mask & (1 << 1)) emit_mov(pc, dst[1], t1); - if (mask & (1 << 2)) emit_mov(pc, dst[2], t2); - if (mask & (1 << 3)) emit_mov(pc, dst[3], t3); + if (mask & (1 << 0)) emit_mov(pc, dst[0], t[0]); + if (mask & (1 << 1)) emit_mov(pc, dst[1], t[1]); + if (mask & (1 << 2)) emit_mov(pc, dst[2], t[2]); + if (mask & (1 << 3)) emit_mov(pc, dst[3], t[3]); - free_temp(pc, t0); - free_temp(pc, t1); - free_temp(pc, t2); - free_temp(pc, t3); - } + free_temp4(pc, t); + } break; case TGSI_OPCODE_XPD: temp = alloc_temp(pc, NULL); @@ -1570,8 +1602,13 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) if (!upload) return; + NOUVEAU_ERR("-------\n"); up = ptr = MALLOC(p->exec_size * 4); for (e = p->exec_head; e; e = e->next) { + NOUVEAU_ERR("0x%08x\n", e->inst[0]); + if (is_long(e)) + NOUVEAU_ERR("0x%08x\n", e->inst[1]); + *(ptr++) = e->inst[0]; if (is_long(e)) *(ptr++) = e->inst[1]; @@ -1687,7 +1724,7 @@ nv50_fragprog_validate(struct nv50_context *nv50) void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) { - struct pipe_winsys *ws = nv50->pipe.winsys; + struct pipe_screen *pscreen = nv50->pipe.screen; while (p->exec_head) { struct nv50_program_exec *e = p->exec_head; @@ -1699,7 +1736,7 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) p->exec_size = 0; if (p->buffer) - pipe_buffer_reference(ws, &p->buffer, NULL); + pipe_buffer_reference(pscreen, &p->buffer, NULL); nv50->screen->nvws->res_free(&p->data); diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c index 26bd90ccc5e..777e77906d5 100644 --- a/src/gallium/drivers/nv50/nv50_query.c +++ b/src/gallium/drivers/nv50/nv50_query.c @@ -51,7 +51,7 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *q) static boolean nv50_query_result(struct pipe_context *pipe, struct pipe_query *q, - boolean wait, uint64 *result) + boolean wait, uint64_t *result) { NOUVEAU_ERR("unimplemented\n"); *result = 0xdeadcafe; diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h index 7ab12a6d702..e2451c6ecb5 100644 --- a/src/gallium/drivers/softpipe/sp_context.h +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -94,7 +94,7 @@ struct softpipe_context { /* Counter for occlusion queries. Note this supports overlapping * queries. */ - uint64 occlusion_count; + uint64_t occlusion_count; /* * Mapped vertex buffers diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c index 40329a95627..5dacbbe55f8 100644 --- a/src/gallium/drivers/softpipe/sp_quad_fs.c +++ b/src/gallium/drivers/softpipe/sp_quad_fs.c @@ -171,7 +171,6 @@ static void shade_destroy(struct quad_stage *qs) struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ) { struct quad_shade_stage *qss = CALLOC_STRUCT(quad_shade_stage); - uint i; /* allocate storage for program inputs/outputs, aligned to 16 bytes */ qss->inputs = MALLOC(PIPE_MAX_ATTRIBS * sizeof(*qss->inputs) + 16); diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c index 2106ee1d235..b0d8e01426d 100644 --- a/src/gallium/drivers/softpipe/sp_query.c +++ b/src/gallium/drivers/softpipe/sp_query.c @@ -37,8 +37,8 @@ #include "sp_query.h" struct softpipe_query { - uint64 start; - uint64 end; + uint64_t start; + uint64_t end; }; @@ -87,7 +87,7 @@ static boolean softpipe_get_query_result(struct pipe_context *pipe, struct pipe_query *q, boolean wait, - uint64 *result ) + uint64_t *result ) { struct softpipe_query *sq = softpipe_query(q); *result = sq->end - sq->start; diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 12f98c32f53..11b08b3a82d 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -55,9 +55,9 @@ softpipe_get_param(struct pipe_screen *screen, int param) { switch (param) { case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: - return 8; + return PIPE_MAX_SAMPLERS; case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: - return 8; + return PIPE_MAX_SAMPLERS; case PIPE_CAP_NPOT_TEXTURES: return 1; case PIPE_CAP_TWO_SIDED_STENCIL: diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c index 0cb4b2f03c3..a64dc89f432 100644 --- a/src/gallium/drivers/softpipe/sp_texture.c +++ b/src/gallium/drivers/softpipe/sp_texture.c @@ -94,31 +94,50 @@ softpipe_texture_layout(struct pipe_screen *screen, return spt->buffer != NULL; } +/* Hack it up to use the old winsys->surface_alloc_storage() + * method for now: + */ static boolean softpipe_displaytarget_layout(struct pipe_screen *screen, struct softpipe_texture * spt) { struct pipe_winsys *ws = screen->winsys; - size_t tex_size; - unsigned cpp; - - switch (spt->base.format) { - case PIPE_FORMAT_R5G6B5_UNORM: - cpp = 2; - break; - case PIPE_FORMAT_Z24S8_UNORM: - case PIPE_FORMAT_A8R8G8B8_UNORM: - default: - cpp = 4; - break; + struct pipe_surface surf; + unsigned flags = (PIPE_BUFFER_USAGE_CPU_READ | + PIPE_BUFFER_USAGE_CPU_WRITE | + PIPE_BUFFER_USAGE_GPU_READ | + PIPE_BUFFER_USAGE_GPU_WRITE); + int ret; + + + memset(&surf, 0, sizeof(surf)); + + ret =ws->surface_alloc_storage( ws, + &surf, + spt->base.width[0], + spt->base.height[0], + spt->base.format, + flags, + spt->base.tex_usage); + if(ret != 0) + return FALSE; + + if (!surf.buffer) { + /* allocation failed */ + return FALSE; } - tex_size = spt->base.width[0] * cpp * spt->base.height[0]; - spt->buffer = ws->buffer_create(ws, 64, PIPE_BUFFER_USAGE_PIXEL, tex_size); + /* Now extract the goodies: */ spt->base.nblocksx[0] = pf_get_nblocksx(&spt->base.block, spt->base.width[0]); spt->base.nblocksy[0] = pf_get_nblocksy(&spt->base.block, spt->base.height[0]); - spt->stride[0] = spt->base.width[0] * cpp; + spt->stride[0] = surf.stride; + + /* Transfer the reference: + */ + spt->buffer = surf.buffer; + surf.buffer = NULL; + return spt->buffer != NULL; } @@ -220,10 +239,8 @@ softpipe_get_tex_surface(struct pipe_screen *screen, ps = CALLOC_STRUCT(pipe_surface); ps->refcount = 1; - ps->winsys = ws; if (ps) { assert(ps->refcount); - assert(ps->winsys); pipe_texture_reference(&ps->texture, pt); pipe_buffer_reference(screen, &ps->buffer, spt->buffer); ps->format = pt->format; diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c index 1dd77193791..f0d51ad82ef 100644 --- a/src/gallium/drivers/trace/tr_context.c +++ b/src/gallium/drivers/trace/tr_context.c @@ -274,11 +274,11 @@ static INLINE boolean trace_context_get_query_result(struct pipe_context *_pipe, struct pipe_query *query, boolean wait, - uint64 *presult) + uint64_t *presult) { struct trace_context *tr_ctx = trace_context(_pipe); struct pipe_context *pipe = tr_ctx->pipe; - uint64 result; + uint64_t result; boolean _result; trace_dump_call_begin("pipe_context", "get_query_result"); |