summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLuca Barbieri <[email protected]>2010-08-20 21:16:49 +0200
committerLuca Barbieri <[email protected]>2010-08-21 20:42:15 +0200
commitaf4a6eba55ad81e343db788a97f3eaa3cf184369 (patch)
tree312a9db5a6ceb2d8ef5318e6a4011d3b0b2df66b
parentcf0d15642289f0a4b8d40e61266c89fe09d6601b (diff)
nv40: add fragment program control flow
-rw-r--r--src/gallium/drivers/nvfx/nvfx_fragprog.c247
-rw-r--r--src/gallium/drivers/nvfx/nvfx_shader.h5
2 files changed, 247 insertions, 5 deletions
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index f8c4eae3873..91776371e46 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -15,6 +15,7 @@
#define MAX_CONSTS 128
#define MAX_IMM 32
+
struct nvfx_fpc {
struct nvfx_fragment_program *fp;
@@ -38,6 +39,10 @@ struct nvfx_fpc {
unsigned nr_imm;
unsigned char generic_to_slot[256]; /* semantic idx for each input semantic */
+
+ struct util_dynarray if_stack;
+ //struct util_dynarray loop_stack;
+ struct util_dynarray label_relocs;
};
static INLINE struct nvfx_reg
@@ -233,6 +238,134 @@ nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn)
nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \
(d), (m), (s0), none, none)
+/* IF src.x != 0, as TGSI specifies */
+static void
+nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src)
+{
+ const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+ struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none);
+ insn.cc_update = 1;
+ nvfx_fp_emit(fpc, insn);
+
+ fpc->inst_offset = fpc->fp->insn_len;
+ grow_insns(fpc, 4);
+ uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+ /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+ hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
+ NV40_FP_OP_OUT_NONE |
+ (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+ /* Use .xxxx swizzle so that we check only src[0].x*/
+ hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+ (0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+ (0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+ (0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) |
+ (NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT);
+ hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset */
+ hw[3] = 0; /* | endif_offset */
+ util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset);
+}
+
+/* IF src.x != 0, as TGSI specifies */
+static void
+nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
+{
+ struct nvfx_label_relocation reloc;
+ fpc->inst_offset = fpc->fp->insn_len;
+ grow_insns(fpc, 4);
+ uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+ /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+ hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT);
+ /* Use .xxxx swizzle so that we check only src[0].x*/
+ hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+ (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+ hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
+ hw[3] = 0;
+ reloc.target = target;
+ reloc.location = fpc->inst_offset + 2;
+ util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+}
+
+static void
+nv40_fp_ret(struct nvfx_fpc *fpc)
+{
+ fpc->inst_offset = fpc->fp->insn_len;
+ grow_insns(fpc, 4);
+ uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+ /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+ hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT);
+ /* Use .xxxx swizzle so that we check only src[0].x*/
+ hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+ (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+ hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
+ hw[3] = 0;
+}
+
+static void
+nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
+{
+ struct nvfx_label_relocation reloc;
+ fpc->inst_offset = fpc->fp->insn_len;
+ grow_insns(fpc, 4);
+ uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+ /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+ hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) |
+ NV40_FP_OP_OUT_NONE |
+ (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+ /* Use .xxxx swizzle so that we check only src[0].x*/
+ hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+ (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+ hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH |
+ (count << NV40_FP_OP_REP_COUNT1_SHIFT) |
+ (count << NV40_FP_OP_REP_COUNT2_SHIFT) |
+ (count << NV40_FP_OP_REP_COUNT3_SHIFT);
+ hw[3] = 0; /* | end_offset */
+ reloc.target = target;
+ reloc.location = fpc->inst_offset + 3;
+ util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+ //util_dynarray_append(&fpc->loop_stack, unsigned, target);
+}
+
+/* warning: this only works forward, and probably only if not inside any IF */
+static void
+nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
+{
+ struct nvfx_label_relocation reloc;
+ fpc->inst_offset = fpc->fp->insn_len;
+ grow_insns(fpc, 4);
+ uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+ /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+ hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
+ NV40_FP_OP_OUT_NONE |
+ (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+ /* Use .xxxx swizzle so that we check only src[0].x*/
+ hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+ (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT);
+ hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */
+ hw[3] = 0; /* | endif_offset */
+ reloc.target = target;
+ reloc.location = fpc->inst_offset + 2;
+ util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+ reloc.target = target;
+ reloc.location = fpc->inst_offset + 3;
+ util_dynarray_append(&fpc->label_relocs, struct nvfx_label_relocation, reloc);
+}
+
+static void
+nv40_fp_brk(struct nvfx_fpc *fpc)
+{
+ fpc->inst_offset = fpc->fp->insn_len;
+ grow_insns(fpc, 4);
+ uint32_t *hw = &fpc->fp->insn[fpc->inst_offset];
+ /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+ hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) |
+ NV40_FP_OP_OUT_NONE;
+ /* Use .xxxx swizzle so that we check only src[0].x*/
+ hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+ (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+ hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
+ hw[3] = 0;
+}
+
static INLINE struct nvfx_src
tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
{
@@ -516,9 +649,6 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
case TGSI_OPCODE_RCP:
nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none));
break;
- case TGSI_OPCODE_RET:
- assert(0);
- break;
case TGSI_OPCODE_RFL:
if(!nvfx->is_nv4x)
nvfx_fp_emit(fpc, arith(0, RFL_NV30, dst, mask, src[0], src[1], none));
@@ -604,13 +734,106 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
nvfx_fp_emit(fpc, arith(sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
break;
- default:
+
+ case TGSI_OPCODE_IF:
+ // MOVRC0 R31 (TR0.xyzw), R<src>:
+ // IF (NE.xxxx) ELSE <else> END <end>
+ if(!nvfx->is_nv4x)
+ goto nv3x_cflow;
+ nv40_fp_if(fpc, src[0]);
+ break;
+
+ case TGSI_OPCODE_ELSE:
+ {
+ if(!nvfx->is_nv4x)
+ goto nv3x_cflow;
+ assert(util_dynarray_contains(&fpc->if_stack, unsigned));
+ uint32_t *hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)];
+ hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
+ break;
+ }
+
+ case TGSI_OPCODE_ENDIF:
+ {
+ if(!nvfx->is_nv4x)
+ goto nv3x_cflow;
+ assert(util_dynarray_contains(&fpc->if_stack, unsigned));
+ uint32_t *hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)];
+ if(!hw[2])
+ hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
+ hw[3] = fpc->fp->insn_len;
+ break;
+ }
+
+ case TGSI_OPCODE_BRA:
+ /* This can in limited cases be implemented with an IF with the else and endif labels pointing to the target */
+ /* no state tracker uses this, so don't implement this for now */
+ assert(0);
+ nv40_fp_bra(fpc, finst->Label.Label);
+ break;
+
+ case TGSI_OPCODE_BGNSUB:
+ case TGSI_OPCODE_ENDSUB:
+ /* nothing to do here */
+ break;
+
+ case TGSI_OPCODE_CAL:
+ if(!nvfx->is_nv4x)
+ goto nv3x_cflow;
+ nv40_fp_cal(fpc, finst->Label.Label);
+ break;
+
+ case TGSI_OPCODE_RET:
+ if(!nvfx->is_nv4x)
+ goto nv3x_cflow;
+ nv40_fp_ret(fpc);
+ break;
+
+ case TGSI_OPCODE_BGNLOOP:
+ if(!nvfx->is_nv4x)
+ goto nv3x_cflow;
+ /* TODO: we should support using two nested REPs to allow a > 255 iteration count */
+ nv40_fp_rep(fpc, 255, finst->Label.Label);
+ break;
+
+ case TGSI_OPCODE_ENDLOOP:
+ break;
+
+ case TGSI_OPCODE_BRK:
+ if(!nvfx->is_nv4x)
+ goto nv3x_cflow;
+ nv40_fp_brk(fpc);
+ break;
+
+ case TGSI_OPCODE_CONT:
+ {
+ static int warned = 0;
+ if(!warned) {
+ NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n");
+ warned = 1;
+ }
+ break;
+ }
+
+ default:
NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
return FALSE;
}
+out:
release_temps(fpc);
return TRUE;
+nv3x_cflow:
+ {
+ static int warned = 0;
+ if(!warned) {
+ NOUVEAU_ERR(
+ "Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n"
+ "If rendering is incorrect, try to disable GLSL support in the application.\n");
+ warned = 1;
+ }
+ }
+ goto out;
}
static boolean
@@ -734,6 +957,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
{
struct tgsi_parse_context parse;
struct nvfx_fpc *fpc = NULL;
+ struct util_dynarray insns;
fpc = CALLOC(1, sizeof(struct nvfx_fpc));
if (!fpc)
@@ -748,6 +972,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
tgsi_parse_init(&parse, fp->pipe.tokens);
+ util_dynarray_init(&insns);
while (!tgsi_parse_end_of_tokens(&parse)) {
tgsi_parse_token(&parse);
@@ -756,6 +981,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
{
const struct tgsi_full_instruction *finst;
+ util_dynarray_append(&insns, unsigned, fp->insn_len);
finst = &parse.FullToken.FullInstruction;
if (!nvfx_fragprog_parse_instruction(nvfx, fpc, finst))
goto out_err;
@@ -765,6 +991,14 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
break;
}
}
+ util_dynarray_append(&insns, unsigned, fp->insn_len);
+
+ for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_label_relocation))
+ {
+ struct nvfx_label_relocation* label_reloc = (struct nvfx_label_relocation*)((char*)fpc->label_relocs.data + i);
+ fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target];
+ }
+ util_dynarray_fini(&insns);
if(!nvfx->is_nv4x)
fp->fp_control |= (fpc->num_regs-1)/2;
@@ -775,7 +1009,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx,
if(fp->insn)
fp->insn[fpc->inst_offset] |= 0x00000001;
- /* Append NOP + END instruction, may or may not be necessary. */
+ /* Append NOP + END instruction for branches to the end of the program */
fpc->inst_offset = fp->insn_len;
grow_insns(fpc, 4);
fp->insn[fpc->inst_offset + 0] = 0x00000001;
@@ -799,6 +1033,9 @@ out_err:
tgsi_parse_free(&parse);
if (fpc->r_temp)
FREE(fpc->r_temp);
+ util_dynarray_fini(&fpc->if_stack);
+ util_dynarray_fini(&fpc->label_relocs);
+ //util_dynarray_fini(&fpc->loop_stack);
FREE(fpc);
}
diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h
index fcf577ca74c..52e684aec3b 100644
--- a/src/gallium/drivers/nvfx/nvfx_shader.h
+++ b/src/gallium/drivers/nvfx/nvfx_shader.h
@@ -509,4 +509,9 @@ nvfx_src_abs(struct nvfx_src src)
return src;
}
+struct nvfx_label_relocation {
+ unsigned location;
+ unsigned target;
+};
+
#endif