From 57fc0dd8d5610a0a25cece53b172b0c992421db0 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 26 Nov 2015 12:26:53 -0500 Subject: freedreno/ir3: assign varying locations later Rather than assigning inloc up front, when we don't yet know if it will be unused, assign it last thing before the legalize pass. Also, realize when inputs are unused (since for frag shader's we can't rely on them being removed from ir->inputs[]). This doesn't make sense if we don't also dynamically assign the inloc's, since we could end up telling the hw the wrong # of varyings (since we currently assume that the # of varyings and max-inloc are related..) Signed-off-by: Rob Clark --- src/gallium/drivers/freedreno/a3xx/fd3_program.c | 2 +- src/gallium/drivers/freedreno/a4xx/fd4_program.c | 2 +- .../drivers/freedreno/ir3/ir3_compiler_nir.c | 52 +++++++++++----------- src/gallium/drivers/freedreno/ir3/ir3_shader.h | 10 ++++- 4 files changed, 37 insertions(+), 29 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index 01daa0f6f12..736151651b2 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -264,7 +264,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen + 1, 0))); OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid) | A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) | - A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(fp->total_in, 4) / 4)); + A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fp->varying_in)); for (i = 0, j = -1; (i < 8) && (j < (int)fp->inputs_count); i++) { uint32_t reg = 0; diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index ffa53f518f3..0e861b90b12 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -326,7 +326,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, A4XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(s[VS].v->total_in)); OUT_RING(ring, A4XX_SP_VS_PARAM_REG_POSREGID(pos_regid) | A4XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) | - A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(align(s[FS].v->total_in, 4) / 4)); + A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(s[FS].v->varying_in)); for (i = 0, j = -1; (i < 16) && (j < (int)s[FS].v->inputs_count); i++) { uint32_t reg = 0; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 156bb0be247..8617704307c 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -96,9 +96,6 @@ struct ir3_compile { */ struct hash_table *block_ht; - /* for calculating input/output positions/linkages: */ - unsigned next_inloc; - /* a4xx (at least patchlevel 0) cannot seem to flat-interpolate * so we need to use ldlv.u32 to load the varying directly: */ @@ -235,7 +232,6 @@ compile_init(struct ir3_compiler *compiler, ctx->compiler = compiler; ctx->ir = so->ir; ctx->so = so; - ctx->next_inloc = 8; ctx->def_ht = _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal); ctx->var_ht = _mesa_hash_table_create(ctx, @@ -722,11 +718,12 @@ create_input(struct ir3_block *block, unsigned n) } static struct ir3_instruction * -create_frag_input(struct ir3_compile *ctx, unsigned n, bool use_ldlv) +create_frag_input(struct ir3_compile *ctx, bool use_ldlv) { struct ir3_block *block = ctx->block; struct ir3_instruction *instr; - struct ir3_instruction *inloc = create_immed(block, n); + /* actual inloc is assigned and fixed up later: */ + struct ir3_instruction *inloc = create_immed(block, 0); if (use_ldlv) { instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0); @@ -2185,8 +2182,6 @@ setup_input(struct ir3_compile *ctx, nir_variable *in) so->inputs[n].slot = slot; so->inputs[n].compmask = (1 << ncomp) - 1; - so->inputs[n].inloc = ctx->next_inloc; - so->inputs[n].interpolate = INTERP_QUALIFIER_NONE; so->inputs_count = MAX2(so->inputs_count, n + 1); so->inputs[n].interpolate = in->data.interpolation; @@ -2231,8 +2226,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in) so->inputs[n].bary = true; - instr = create_frag_input(ctx, - so->inputs[n].inloc + i - 8, use_ldlv); + instr = create_frag_input(ctx, use_ldlv); } ctx->ir->inputs[idx] = instr; @@ -2247,7 +2241,6 @@ setup_input(struct ir3_compile *ctx, nir_variable *in) } if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) { - ctx->next_inloc += ncomp; so->total_in += ncomp; } } @@ -2471,7 +2464,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, struct ir3_compile *ctx; struct ir3 *ir; struct ir3_instruction **inputs; - unsigned i, j, actual_in; + unsigned i, j, actual_in, inloc; int ret = 0, max_bary; assert(!so->ir); @@ -2591,13 +2584,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_print(ir); } - ir3_legalize(ir, &so->has_samp, &max_bary); - - if (fd_mesa_debug & FD_DBG_OPTMSGS) { - printf("AFTER LEGALIZE:\n"); - ir3_print(ir); - } - /* fixup input/outputs: */ for (i = 0; i < so->outputs_count; i++) { so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num; @@ -2611,32 +2597,46 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, /* Note that some or all channels of an input may be unused: */ actual_in = 0; + inloc = 0; for (i = 0; i < so->inputs_count; i++) { unsigned j, regid = ~0, compmask = 0; so->inputs[i].ncomp = 0; + so->inputs[i].inloc = inloc + 8; for (j = 0; j < 4; j++) { struct ir3_instruction *in = inputs[(i*4) + j]; - if (in) { + if (in && !(in->flags & IR3_INSTR_UNUSED)) { compmask |= (1 << j); regid = in->regs[0]->num - j; actual_in++; so->inputs[i].ncomp++; + if ((so->type == SHADER_FRAGMENT) && so->inputs[i].bary) { + /* assign inloc: */ + assert(in->regs[1]->flags & IR3_REG_IMMED); + in->regs[1]->iim_val = inloc++; + } } } + if ((so->type == SHADER_FRAGMENT) && compmask && so->inputs[i].bary) + so->varying_in++; so->inputs[i].regid = regid; so->inputs[i].compmask = compmask; } - /* fragment shader always gets full vec4's even if it doesn't - * fetch all components, but vertex shader we need to update - * with the actual number of components fetch, otherwise thing - * will hang due to mismaptch between VFD_DECODE's and - * TOTALATTRTOVS + /* We need to do legalize after (for frag shader's) the "bary.f" + * offsets (inloc) have been assigned. */ + ir3_legalize(ir, &so->has_samp, &max_bary); + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER LEGALIZE:\n"); + ir3_print(ir); + } + + /* Note that actual_in counts inputs that are not bary.f'd for FS: */ if (so->type == SHADER_VERTEX) so->total_in = actual_in; else - so->total_in = align(max_bary + 1, 4); + so->total_in = max_bary + 1; out: if (ret) { diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index 5d1cccb0daa..cf99a4c05ed 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -195,7 +195,15 @@ struct ir3_shader_variant { enum glsl_interp_qualifier interpolate; } inputs[16 + 2]; /* +POSITION +FACE */ - unsigned total_in; /* sum of inputs (scalar) */ + /* sum of input components (scalar). For frag shaders, it only counts + * the varying inputs: + */ + unsigned total_in; + + /* For frag shaders, the total number of inputs (not scalar, + * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR) + */ + unsigned varying_in; /* do we have one or more texture sample instructions: */ bool has_samp; -- cgit v1.2.3