diff options
author | Rob Clark <[email protected]> | 2014-10-25 15:11:59 -0400 |
---|---|---|
committer | Rob Clark <[email protected]> | 2015-01-07 19:37:28 -0500 |
commit | 9a9f2a893b5e29a77d66671191653f0b4261f546 (patch) | |
tree | cb18b6fe28568d5c1f728632b93f60e50bd2b203 /src/gallium/drivers/freedreno/ir3/ir3_compiler.c | |
parent | dddfe6c21ee92f015b78060545f08239c331ceba (diff) |
freedreno/ir3: simplify RA
Group inputs/outputs, in addition to fanin/fanout, as they must also
exist in sequential scalar registers. This lets us simplify RA by
working in terms of neighbor groups.
NOTE: has the slight problem that it can't optimize out mov's for things
like:
MOV OUT[n], IN[m]
To avoid this, instead of trying to figure out what mov's we can
eliminate, we first remove all mov's prior to grouping, and then
re-insert mov's as needed while grouping inputs/outputs/fanins.
Eventually we'd prefer the frontend to not insert extra mov's in the
first place (so we don't have to bother removing them). This is the
plan for an eventual NIR based frontend, so separate out the instr
grouping (which will still be needed for NIR frontend) from the mov
elimination (which won't).
Signed-off-by: Rob Clark <[email protected]>
Diffstat (limited to 'src/gallium/drivers/freedreno/ir3/ir3_compiler.c')
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_compiler.c | 72 |
1 files changed, 58 insertions, 14 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c index b47aa1d14d8..209621bd013 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c @@ -571,23 +571,40 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx, } else if ((dst->File == TGSI_FILE_TEMPORARY) || (dst->File == TGSI_FILE_OUTPUT) || (dst->File == TGSI_FILE_ADDRESS)) { + struct ir3_instruction *prev = NULL; unsigned i; /* if instruction writes multiple, we need to create * some place-holder collect the registers: */ for (i = 0; i < 4; i++) { - if (wrmask & (1 << i)) { - struct ir3_instruction *collect = - ir3_instr_create(ctx->block, -1, OPC_META_FO); - collect->fo.off = i; - /* unused dst reg: */ - ir3_reg_create(collect, 0, 0); - /* and src reg used to hold original instr */ - ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr; - if (!ctx->atomic) - ssa_dst(ctx, collect, dst, chan+i); + /* NOTE: slightly ugly that we setup neighbor ptrs + * for FO here, but handle FI in CP pass.. we should + * probably just always setup neighbor ptrs in the + * frontend? + */ + struct ir3_instruction *split = + ir3_instr_create(ctx->block, -1, OPC_META_FO); + split->fo.off = i; + /* unused dst reg: */ + /* NOTE: set SSA flag on dst here, because unused FO's + * which don't get scheduled will end up not in the + * instruction list when RA sets SSA flag on each dst. + * Slight hack. We really should set SSA flag on + * every dst register in the frontend. + */ + ir3_reg_create(split, 0, IR3_REG_SSA); + /* and src reg used to hold original instr */ + ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr; + if (prev) { + split->cp.left = prev; + split->cp.left_cnt++; + prev->cp.right = split; + prev->cp.right_cnt++; } + if ((wrmask & (1 << i)) && !ctx->atomic) + ssa_dst(ctx, split, dst, chan+i); + prev = split; } } @@ -3120,6 +3137,17 @@ ir3_compile_shader(struct ir3_shader_variant *so, } } + /* if we want half-precision outputs, mark the output registers + * as half: + */ + if (key.half_precision) { + for (i = 0; i < block->noutputs; i++) { + if (!block->outputs[i]) + continue; + block->outputs[i]->regs[0]->flags |= IR3_REG_HALF; + } + } + /* at this point, we want the kill's in the outputs array too, * so that they get scheduled (since they have no dst).. we've * already ensured that the array is big enough in push_block(): @@ -3145,9 +3173,26 @@ ir3_compile_shader(struct ir3_shader_variant *so, ir3_dump_instr_list(block->head); } + ir3_block_depth(block); + + /* First remove all the extra mov's (which we could skip if the + * front-end was clever enough not to insert them in the first + * place). Then figure out left/right neighbors, re-inserting + * extra mov's when needed to avoid conflicts. + */ if (cp && !(fd_mesa_debug & FD_DBG_NOCP)) ir3_block_cp(block); + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("BEFORE GROUPING:\n"); + ir3_dump_instr_list(block->head); + } + + /* Group left/right neighbors, inserting mov's where needed to + * solve conflicts: + */ + ir3_block_group(block); + if (fd_mesa_debug & FD_DBG_OPTDUMP) compile_dump(&ctx); @@ -3169,20 +3214,19 @@ ir3_compile_shader(struct ir3_shader_variant *so, ir3_dump_instr_list(block->head); } - ret = ir3_block_ra(block, so->type, key.half_precision, - so->frag_coord, so->frag_face); + ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face); if (ret) { DBG("RA failed!"); goto out; } - ir3_block_legalize(block, &so->has_samp, &max_bary); - if (fd_mesa_debug & FD_DBG_OPTMSGS) { printf("AFTER RA:\n"); ir3_dump_instr_list(block->head); } + ir3_block_legalize(block, &so->has_samp, &max_bary); + /* fixup input/outputs: */ for (i = 0; i < so->outputs_count; i++) { so->outputs[i].regid = block->outputs[i*4]->regs[0]->num; |