diff options
author | Rob Clark <[email protected]> | 2014-01-29 17:18:49 -0500 |
---|---|---|
committer | Rob Clark <[email protected]> | 2014-02-03 18:26:53 -0500 |
commit | 554f1ac00c43f4503b923e1a129c0039468dcb82 (patch) | |
tree | 1586ab27f83c704013d9f48b96d93434cd9644cc /src/gallium/drivers/freedreno | |
parent | f0e2d7ab4615651b40e37205bed12c9ca92e84f3 (diff) |
freedreno/a3xx/compiler: new compiler
The new compiler generates a dependency graph of instructions, including
a few meta-instructions to handle PHI and preserve some extra
information needed for register assignment, etc.
The depth pass assigns a weight/depth to each node (based on the sum of
instruction cycles of a given node and all its dependent nodes), which
is used to schedule instructions. The scheduling takes into account the
minimum number of cycles/slots between dependent instructions, etc.
This is something that could not be handled properly with the original
compiler (which was more of a naive TGSI translator than an actual
compiler).
The register assignment is currently split out as a standalone pass. I
expect that it will be replaced at some point, once I figure out what to
do about relative addressing (which is currently the only thing that
should cause fallback to old compiler).
There are a couple new debug options for FD_MESA_DEBUG env var:
optmsgs - enable debug prints in optimizer
optdump - dump instruction graph in .dot format, for example:
http://people.freedesktop.org/~robclark/a3xx/frag-0000.dot.png
http://people.freedesktop.org/~robclark/a3xx/frag-0000.dot
At this point, thanks to proper handling of instruction scheduling, the
new compiler fixes a lot of things that were broken before, and does not
appear to break anything that was working before[1]. So even though it
is not finished, it seems useful to merge it in its current state.
[1] Not merged in this commit, because I'm not sure if it really belongs
in mesa tree, but the following commit implements a simple shader
emulator, which I've used to compare the output of the new compiler to
the original compiler (ie. run it on all the TGSI shaders dumped out via
ST_DEBUG=tgsi with various games/apps):
https://github.com/freedreno/mesa/commit/163b6306b1660e05ece2f00d264a8393d99b6f12
Signed-off-by: Rob Clark <[email protected]>
Diffstat (limited to 'src/gallium/drivers/freedreno')
17 files changed, 2777 insertions, 209 deletions
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources index bd5d92f1afc..05a12fb318c 100644 --- a/src/gallium/drivers/freedreno/Makefile.sources +++ b/src/gallium/drivers/freedreno/Makefile.sources @@ -43,4 +43,10 @@ a3xx_SOURCES := \ a3xx/fd3_util.c \ a3xx/fd3_zsa.c \ a3xx/disasm-a3xx.c \ + a3xx/ir3_cp.c \ + a3xx/ir3_depth.c \ + a3xx/ir3_dump.c \ + a3xx/ir3_flatten.c \ + a3xx/ir3_ra.c \ + a3xx/ir3_sched.c \ a3xx/ir3.c diff --git a/src/gallium/drivers/freedreno/a3xx/disasm-a3xx.c b/src/gallium/drivers/freedreno/a3xx/disasm-a3xx.c index 0e45ec54b38..8c3704bf658 100644 --- a/src/gallium/drivers/freedreno/a3xx/disasm-a3xx.c +++ b/src/gallium/drivers/freedreno/a3xx/disasm-a3xx.c @@ -735,6 +735,14 @@ struct opc_info { #define GETINFO(instr) (&(opcs[((instr)->opc_cat << NOPC_BITS) | instr_opc(instr)])) +// XXX hack.. probably should move this table somewhere common: +#include "ir3.h" +const char *ir3_instr_name(struct ir3_instruction *instr) +{ + if (instr->category == -1) return "??meta??"; + return opcs[(instr->category << NOPC_BITS) | instr->opc].name; +} + static void print_instr(uint32_t *dwords, int level, int n) { instr_t *instr = (instr_t *)dwords; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c index 5ab34e557b9..da327c97350 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c @@ -50,36 +50,43 @@ struct fd3_compile_context { const struct tgsi_token *tokens; struct ir3_shader *ir; - struct ir3_block *block; struct fd3_shader_stateobj *so; + struct ir3_block *block; + struct ir3_instruction *current_instr; + + /* we need to defer updates to block->outputs[] until the end + * of an instruction (so we don't see new value until *after* + * the src registers are processed) + */ + struct { + struct ir3_instruction *instr, **instrp; + } output_updates[16]; + unsigned 
num_output_updates; + + /* are we in a sequence of "atomic" instructions? + */ + bool atomic; + + /* For fragment shaders, from the hw perspective the only + * actual input is r0.xy position register passed to bary.f. + * But TGSI doesn't know that, it still declares things as + * IN[] registers. So we do all the input tracking normally + * and fix things up after compile_instructions() + */ + struct ir3_instruction *frag_pos; + struct tgsi_parse_context parser; unsigned type; struct tgsi_shader_info info; - /* last input dst (for setting (ei) flag): */ - struct ir3_register *last_input; - - /* last instruction with relative addressing: */ - struct ir3_instruction *last_rel; - /* for calculating input/output positions/linkages: */ unsigned next_inloc; unsigned num_internal_temps; struct tgsi_src_register internal_temps[6]; - /* track registers which need to synchronize w/ "complex alu" cat3 - * instruction pipeline: - */ - regmask_t needs_ss; - - /* track registers which need to synchronize with texture fetch - * pipeline: - */ - regmask_t needs_sy; - /* inputs start at r0, temporaries start after last input, and * outputs start after last temporary. 
* @@ -93,10 +100,8 @@ struct fd3_compile_context { /* idx/slot for last compiler generated immediate */ unsigned immediate_idx; - /* stack of branch instructions that start (potentially nested) - * branch instructions, so that we can fix up the branch targets - * so that we can fix up the branch target on the corresponding - * END instruction + /* stack of branch instructions that mark (potentially nested) + * branch if/else/loop/etc */ struct ir3_instruction *branch[16]; unsigned int branch_count; @@ -115,6 +120,7 @@ static void vectorize(struct fd3_compile_context *ctx, int nsrcs, ...); static void create_mov(struct fd3_compile_context *ctx, struct tgsi_dst_register *dst, struct tgsi_src_register *src); +static type_t get_ftype(struct fd3_compile_context *ctx); static unsigned compile_init(struct fd3_compile_context *ctx, struct fd3_shader_stateobj *so, @@ -125,27 +131,32 @@ compile_init(struct fd3_compile_context *ctx, struct fd3_shader_stateobj *so, ctx->tokens = tokens; ctx->ir = so->ir; - ctx->block = ir3_block_create(ctx->ir, 0, 0, 0); ctx->so = so; - ctx->last_input = NULL; - ctx->last_rel = NULL; ctx->next_inloc = 8; ctx->num_internal_temps = 0; ctx->branch_count = 0; + ctx->block = NULL; + ctx->current_instr = NULL; + ctx->num_output_updates = 0; + ctx->atomic = false; - regmask_init(&ctx->needs_ss); - regmask_init(&ctx->needs_sy); memset(ctx->base_reg, 0, sizeof(ctx->base_reg)); tgsi_scan_shader(tokens, &ctx->info); +#define FM(x) (1 << TGSI_FILE_##x) + /* optimize can't deal with relative addressing: */ + if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | + FM(OUTPUT) | FM(IMMEDIATE) | FM(CONSTANT))) + return TGSI_PARSE_ERROR; + /* Immediates go after constants: */ ctx->base_reg[TGSI_FILE_CONSTANT] = 0; ctx->base_reg[TGSI_FILE_IMMEDIATE] = info->file_max[TGSI_FILE_CONSTANT] + 1; /* if full precision and fragment shader, don't clobber - * r0.x w/ bary fetch: + * r0.xy w/ bary fetch: */ if ((so->type == SHADER_FRAGMENT) && !so->half_precision) base = 
1; @@ -202,51 +213,269 @@ struct instr_translater { }; static void -handle_last_rel(struct fd3_compile_context *ctx) +instr_finish(struct fd3_compile_context *ctx) { - if (ctx->last_rel) { - ctx->last_rel->flags |= IR3_INSTR_UL; - ctx->last_rel = NULL; - } + unsigned i; + + if (ctx->atomic) + return; + + for (i = 0; i < ctx->num_output_updates; i++) + *(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr; + + ctx->num_output_updates = 0; +} + +/* For "atomic" groups of instructions, for example the four scalar + * instructions to perform a vec4 operation. Basically this just + * blocks out handling of output_updates so the next scalar instruction + * still sees the result from before the start of the atomic group. + * + * NOTE: when used properly, this could probably replace get/put_dst() + * stuff. + */ +static void +instr_atomic_start(struct fd3_compile_context *ctx) +{ + ctx->atomic = true; +} + +static void +instr_atomic_end(struct fd3_compile_context *ctx) +{ + ctx->atomic = false; + instr_finish(ctx); } static struct ir3_instruction * instr_create(struct fd3_compile_context *ctx, int category, opc_t opc) { - return ir3_instr_create(ctx->block, category, opc); + instr_finish(ctx); + return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc)); +} + +static struct ir3_instruction * +instr_clone(struct fd3_compile_context *ctx, struct ir3_instruction *instr) +{ + instr_finish(ctx); + return (ctx->current_instr = ir3_instr_clone(instr)); +} + +static struct ir3_block * +push_block(struct fd3_compile_context *ctx) +{ + struct ir3_block *block; + unsigned ntmp, nin, nout; + +#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1)) + + /* hmm, give ourselves room to create 4 extra temporaries (vec4): + */ + ntmp = SCALAR_REGS(TEMPORARY); + ntmp += 4 * 4; + + /* for outermost block, 'inputs' are the actual shader INPUT + * register file. Reads from INPUT registers always go back to + * top block. 
For nested blocks, 'inputs' is used to track any + * TEMPORARY file register from one of the enclosing blocks that + * is ready in this block. + */ + if (!ctx->block) { + /* NOTE: fragment shaders actually have two inputs (r0.xy, the + * position) + */ + nin = SCALAR_REGS(INPUT); + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) + nin = MAX2(2, nin); + } else { + nin = ntmp; + } + + nout = SCALAR_REGS(OUTPUT); + + block = ir3_block_create(ctx->ir, ntmp, nin, nout); + + block->parent = ctx->block; + ctx->block = block; + + return block; } static void -add_nop(struct fd3_compile_context *ctx, unsigned count) +pop_block(struct fd3_compile_context *ctx) { - while (count-- > 0) - instr_create(ctx, 0, OPC_NOP); + ctx->block = ctx->block->parent; + compile_assert(ctx, ctx->block); } -static unsigned -src_flags(struct fd3_compile_context *ctx, struct ir3_register *reg) +static void +ssa_dst(struct fd3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_dst_register *dst, unsigned chan) { - unsigned flags = 0; + unsigned n = regid(dst->Index, chan); + unsigned idx = ctx->num_output_updates; - if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) - return flags; + compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates)); - if (regmask_get(&ctx->needs_ss, reg)) { - flags |= IR3_INSTR_SS; - regmask_init(&ctx->needs_ss); + /* NOTE: defer update of temporaries[idx] or output[idx] + * until instr_finish(), so that if the current instruction + * reads the same TEMP/OUT[] it gets the old value: + * + * bleh.. this might be a bit easier to just figure out + * in instr_finish(). But at that point we've already + * lost information about OUTPUT vs TEMPORARY register + * file.. 
+ */ + + switch (dst->File) { + case TGSI_FILE_OUTPUT: + compile_assert(ctx, n < ctx->block->noutputs); + ctx->output_updates[idx].instrp = &ctx->block->outputs[n]; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; + case TGSI_FILE_TEMPORARY: + compile_assert(ctx, n < ctx->block->ntemporaries); + ctx->output_updates[idx].instrp = &ctx->block->temporaries[n]; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; } +} + +static struct ir3_instruction * +create_output(struct ir3_block *block, struct ir3_instruction *instr, + unsigned n) +{ + struct ir3_instruction *out; - if (regmask_get(&ctx->needs_sy, reg)) { - flags |= IR3_INSTR_SY; - regmask_init(&ctx->needs_sy); + out = ir3_instr_create(block, -1, OPC_META_OUTPUT); + out->inout.block = block; + ir3_reg_create(out, n, 0); + if (instr) + ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr; + + return out; +} + +static struct ir3_instruction * +create_input(struct ir3_block *block, struct ir3_instruction *instr, + unsigned n) +{ + struct ir3_instruction *in; + + in = ir3_instr_create(block, -1, OPC_META_INPUT); + in->inout.block = block; + ir3_reg_create(in, n, 0); + if (instr) + ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr; + + return in; +} + +static struct ir3_instruction * +block_input(struct ir3_block *block, unsigned n) +{ + /* references to INPUT register file always go back up to + * top level: + */ + if (block->parent) + return block_input(block->parent, n); + return block->inputs[n]; +} + +/* return temporary in scope, creating if needed meta-input node + * to track block inputs + */ +static struct ir3_instruction * +block_temporary(struct ir3_block *block, unsigned n) +{ + /* references to TEMPORARY register file, find the nearest + * enclosing block which has already assigned this temporary, + * creating meta-input instructions along the way to keep + * track of block inputs + */ + if (block->parent && !block->temporaries[n]) { + /* if already 
have input for this block, reuse: */ + if (!block->inputs[n]) + block->inputs[n] = block_temporary(block->parent, n); + + /* and create new input to return: */ + return create_input(block, block->inputs[n], n); } + return block->temporaries[n]; +} - return flags; +static struct ir3_instruction * +create_immed(struct fd3_compile_context *ctx, float val) +{ + /* this can happen when registers (or components of a TGSI + * register) are used as src before they have been assigned + * (undefined contents). To avoid confusing the rest of the + * compiler, and to generally keep things peachy, substitute + * an instruction that sets the src to 0.0. Or to keep + * things undefined, I could plug in a random number? :-P + * + * NOTE: *don't* use instr_create() here! + */ + struct ir3_instruction *instr; + instr = ir3_instr_create(ctx->block, 1, 0); + instr->cat1.src_type = get_ftype(ctx); + instr->cat1.dst_type = get_ftype(ctx); + ir3_reg_create(instr, 0, 0); + ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val; + return instr; +} + +static void +ssa_src(struct fd3_compile_context *ctx, struct ir3_register *reg, + const struct tgsi_src_register *src, unsigned chan) +{ + struct ir3_block *block = ctx->block; + unsigned n = regid(src->Index, chan); + + switch (src->File) { + case TGSI_FILE_INPUT: + reg->flags |= IR3_REG_SSA; + reg->instr = block_input(ctx->block, n); + break; + case TGSI_FILE_OUTPUT: + /* really this should just happen in case of 'MOV_SAT OUT[n], ..', + * for the following clamp instructions: + */ + reg->flags |= IR3_REG_SSA; + reg->instr = block->outputs[n]; + /* we don't have to worry about read from an OUTPUT that was + * assigned outside of the current block, because the _SAT + * clamp instructions will always be in the same block as + * the original instruction which wrote the OUTPUT + */ + compile_assert(ctx, reg->instr); + break; + case TGSI_FILE_TEMPORARY: + reg->flags |= IR3_REG_SSA; + reg->instr = block_temporary(ctx->block, n); + break; + } + + if 
((reg->flags & IR3_REG_SSA) && !reg->instr) { + /* this can happen when registers (or components of a TGSI + * register) are used as src before they have been assigned + * (undefined contents). To avoid confusing the rest of the + * compiler, and to generally keep things peachy, substitute + * an instruction that sets the src to 0.0. Or to keep + * things undefined, I could plug in a random number? :-P + * + * NOTE: *don't* use instr_create() here! + */ + reg->instr = create_immed(ctx, 0.0); + } } static struct ir3_register * -add_dst_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr, - const struct tgsi_dst_register *dst, unsigned chan) +add_dst_reg_wrmask(struct fd3_compile_context *ctx, + struct ir3_instruction *instr, const struct tgsi_dst_register *dst, + unsigned chan, unsigned wrmask) { unsigned flags = 0, num = 0; struct ir3_register *reg; @@ -272,15 +501,55 @@ add_dst_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr, reg = ir3_reg_create(instr, regid(num, chan), flags); - if (dst->Indirect) - ctx->last_rel = instr; + /* NOTE: do not call ssa_dst() if atomic.. vectorize() + * itself will call ssa_dst(). This is to filter out + * the (initially bogus) .x component dst which is + * created (but not necessarily used, ie. 
if the net + * result of the vector operation does not write to + * the .x component) + */ + + reg->wrmask = wrmask; + if (wrmask == 0x1) { + /* normal case */ + if (!ctx->atomic) + ssa_dst(ctx, instr, dst, chan); + } else if ((dst->File == TGSI_FILE_TEMPORARY) || + (dst->File == TGSI_FILE_OUTPUT)) { + unsigned i; + + /* if instruction writes multiple, we need to create + * some place-holder collect the registers: + */ + for (i = 0; i < 4; i++) { + if (wrmask & (1 << i)) { + struct ir3_instruction *collect = + ir3_instr_create(ctx->block, -1, OPC_META_FO); + collect->fo.off = i; + /* unused dst reg: */ + ir3_reg_create(collect, 0, 0); + /* and src reg used to hold original instr */ + ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr; + if (!ctx->atomic) + ssa_dst(ctx, collect, dst, chan+i); + } + } + } return reg; } static struct ir3_register * -add_src_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr, - const struct tgsi_src_register *src, unsigned chan) +add_dst_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_dst_register *dst, unsigned chan) +{ + return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1); +} + +static struct ir3_register * +add_src_reg_wrmask(struct fd3_compile_context *ctx, + struct ir3_instruction *instr, const struct tgsi_src_register *src, + unsigned chan, unsigned wrmask) { unsigned flags = 0, num = 0; struct ir3_register *reg; @@ -325,14 +594,49 @@ add_src_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr, reg = ir3_reg_create(instr, regid(num, chan), flags); - if (src->Indirect) - ctx->last_rel = instr; + reg->wrmask = wrmask; + if (wrmask == 0x1) { + /* normal case */ + ssa_src(ctx, reg, src, chan); + } else if ((src->File == TGSI_FILE_TEMPORARY) || + (src->File == TGSI_FILE_OUTPUT) || + (src->File == TGSI_FILE_INPUT)) { + struct ir3_instruction *collect; + unsigned i; + + /* if instruction reads multiple, we need to create + * some place-holder collect the 
registers: + */ + collect = ir3_instr_create(ctx->block, -1, OPC_META_FI); + ir3_reg_create(collect, 0, 0); /* unused dst reg */ + + for (i = 0; i < 4; i++) { + if (wrmask & (1 << i)) { + /* and src reg used point to the original instr */ + ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), + src, chan + i); + } else if (wrmask & ~((i << i) - 1)) { + /* if any remaining components, then dummy + * placeholder src reg to fill in the blanks: + */ + ir3_reg_create(collect, 0, 0); + } + } - instr->flags |= src_flags(ctx, reg); + reg->flags |= IR3_REG_SSA; + reg->instr = collect; + } return reg; } +static struct ir3_register * +add_src_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr, + const struct tgsi_src_register *src, unsigned chan) +{ + return add_src_reg_wrmask(ctx, instr, src, chan, 0x1); +} + static void src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst) { @@ -542,8 +846,6 @@ create_mov(struct fd3_compile_context *ctx, struct tgsi_dst_register *dst, add_dst_reg(ctx, instr, dst, i); add_src_reg(ctx, instr, src, src_swiz(src, i)); - } else { - add_nop(ctx, 1); } } } @@ -620,7 +922,8 @@ vectorize(struct fd3_compile_context *ctx, struct ir3_instruction *instr, { va_list ap; int i, j, n = 0; - bool indirect = dst->Indirect; + + instr_atomic_start(ctx); add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X); @@ -636,7 +939,6 @@ vectorize(struct fd3_compile_context *ctx, struct ir3_instruction *instr, reg->iim_val = *(int *)&src; } else { reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X); - indirect |= src->Indirect; } reg->flags |= flags & ~IR3_REG_NEGATE; if (flags & IR3_REG_NEGATE) @@ -651,37 +953,32 @@ vectorize(struct fd3_compile_context *ctx, struct ir3_instruction *instr, if (n++ == 0) { cur = instr; } else { - cur = ir3_instr_clone(instr); - cur->flags &= ~(IR3_INSTR_SY | IR3_INSTR_SS | IR3_INSTR_JP); + cur = instr_clone(ctx, instr); } + ssa_dst(ctx, cur, dst, i); + /* fix-up dst register component: */ cur->regs[0]->num = 
regid(cur->regs[0]->num >> 2, i); /* fix-up src register component: */ va_start(ap, nsrcs); for (j = 0; j < nsrcs; j++) { + struct ir3_register *reg = cur->regs[j+1]; struct tgsi_src_register *src = va_arg(ap, struct tgsi_src_register *); unsigned flags = va_arg(ap, unsigned); - if (!(flags & IR3_REG_IMMED)) { - cur->regs[j+1]->num = - regid(cur->regs[j+1]->num >> 2, - src_swiz(src, i)); - cur->flags |= src_flags(ctx, cur->regs[j+1]); + if (reg->flags & IR3_REG_SSA) { + ssa_src(ctx, reg, src, src_swiz(src, i)); + } else if (!(flags & IR3_REG_IMMED)) { + reg->num = regid(reg->num >> 2, src_swiz(src, i)); } } va_end(ap); - - if (indirect) - ctx->last_rel = cur; } } - /* pad w/ nop's.. at least until we are clever enough to - * figure out if we really need to.. - */ - add_nop(ctx, 4 - n); + instr_atomic_end(ctx); } /* @@ -718,8 +1015,6 @@ trans_arl(const struct instr_translater *t, unsigned chan = src->SwizzleX; compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); - handle_last_rel(ctx); - tmp_src = get_internal_temp_hr(ctx, &tmp_dst); /* cov.{f32,f16}s16 Rtmp, Rsrc */ @@ -729,25 +1024,18 @@ trans_arl(const struct instr_translater *t, add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; add_src_reg(ctx, instr, src, chan); - add_nop(ctx, 3); - /* shl.b Rtmp, Rtmp, 2 */ instr = instr_create(ctx, 2, OPC_SHL_B); add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF; add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2; - add_nop(ctx, 3); - /* mova a0, Rtmp */ instr = instr_create(ctx, 1, 0); instr->cat1.src_type = TYPE_S16; instr->cat1.dst_type = TYPE_S16; add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF; add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF; - - /* need to ensure 5 instr slots before a0 is used: */ - add_nop(ctx, 6); } /* texture fetch/sample instructions: */ @@ -756,7 +1044,6 @@ trans_samp(const struct instr_translater *t, struct fd3_compile_context *ctx, 
struct tgsi_full_instruction *inst) { - struct ir3_register *r; struct ir3_instruction *instr; struct tgsi_src_register *coord = &inst->Src[0].Register; struct tgsi_src_register *samp = &inst->Src[1].Register; @@ -790,10 +1077,8 @@ trans_samp(const struct instr_translater *t, break; } - if ((tex == TGSI_TEXTURE_3D) || (tex == TGSI_TEXTURE_CUBE)) { - add_nop(ctx, 3); + if ((tex == TGSI_TEXTURE_3D) || (tex == TGSI_TEXTURE_CUBE)) flags |= IR3_INSTR_3D; - } /* cat5 instruction cannot seem to handle const or relative: */ if (is_rel_or_const(coord)) @@ -829,8 +1114,6 @@ trans_samp(const struct instr_translater *t, } coord = tmp_src; - - add_nop(ctx, 4 - j); } instr = instr_create(ctx, 5, t->opc); @@ -839,13 +1122,10 @@ trans_samp(const struct instr_translater *t, instr->cat5.tex = samp->Index; instr->flags |= flags; - r = add_dst_reg(ctx, instr, &inst->Dst[0].Register, 0); - r->wrmask = inst->Dst[0].Register.WriteMask; + add_dst_reg_wrmask(ctx, instr, &inst->Dst[0].Register, 0, + inst->Dst[0].Register.WriteMask); - add_src_reg(ctx, instr, coord, coord->SwizzleX)->wrmask = src_wrmask; - - /* after add_src_reg() so we don't set (sy) on sam instr itself! 
*/ - regmask_set(&ctx->needs_sy, r); + add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, src_wrmask); } /* @@ -981,45 +1261,18 @@ trans_cmp(const struct instr_translater *t, * Conditional / Flow control */ -static unsigned -find_instruction(struct fd3_compile_context *ctx, struct ir3_instruction *instr) -{ - unsigned i; - for (i = 0; i < ctx->ir->instrs_count; i++) - if (ctx->ir->instrs[i] == instr) - return i; - return ~0; -} - static void push_branch(struct fd3_compile_context *ctx, struct ir3_instruction *instr) { ctx->branch[ctx->branch_count++] = instr; } -static void +static struct ir3_instruction * pop_branch(struct fd3_compile_context *ctx) { - struct ir3_instruction *instr; - - /* if we were clever enough, we'd patch this up after the fact, - * and set (jp) flag on whatever the next instruction was, rather - * than inserting an extra nop.. - */ - instr = instr_create(ctx, 0, OPC_NOP); - instr->flags |= IR3_INSTR_JP; - - /* pop the branch instruction from the stack and fix up branch target: */ - instr = ctx->branch[--ctx->branch_count]; - instr->cat0.immed = ctx->ir->instrs_count - find_instruction(ctx, instr) - 1; + return ctx->branch[--ctx->branch_count]; } -/* We probably don't really want to translate if/else/endif into branches.. - * the blob driver evaluates both legs of the if and then uses the sel - * instruction to pick which sides of the branch to "keep".. but figuring - * that out will take somewhat more compiler smarts. So hopefully branches - * don't kill performance too badly. 
- */ static void trans_if(const struct instr_translater *t, struct fd3_compile_context *ctx, @@ -1027,21 +1280,36 @@ trans_if(const struct instr_translater *t, { struct ir3_instruction *instr; struct tgsi_src_register *src = &inst->Src[0].Register; + struct tgsi_dst_register tmp_dst; + struct tgsi_src_register *tmp_src; struct tgsi_src_register constval; get_immediate(ctx, &constval, fui(0.0)); + tmp_src = get_internal_temp(ctx, &tmp_dst); if (is_const(src)) src = get_unconst(ctx, src); + /* cmps.f.eq tmp0, b, {0.0} */ instr = instr_create(ctx, 2, OPC_CMPS_F); - ir3_reg_create(instr, regid(REG_P0, 0), 0); + add_dst_reg(ctx, instr, &tmp_dst, 0); add_src_reg(ctx, instr, src, src->SwizzleX); add_src_reg(ctx, instr, &constval, constval.SwizzleX); instr->cat2.condition = IR3_COND_EQ; - instr = instr_create(ctx, 0, OPC_BR); + /* add.s tmp0, tmp0, -1 */ + instr = instr_create(ctx, 2, OPC_ADD_S); + add_dst_reg(ctx, instr, &tmp_dst, TGSI_SWIZZLE_X); + add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X); + ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -1; + + /* meta:flow tmp0 */ + instr = instr_create(ctx, -1, OPC_META_FLOW); + ir3_reg_create(instr, 0, 0); /* dummy dst */ + add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X); + push_branch(ctx, instr); + instr->flow.if_block = push_block(ctx); } static void @@ -1051,11 +1319,63 @@ trans_else(const struct instr_translater *t, { struct ir3_instruction *instr; - /* for first half of if/else/endif, generate a jump past the else: */ - instr = instr_create(ctx, 0, OPC_JUMP); + pop_block(ctx); + + instr = pop_branch(ctx); + + compile_assert(ctx, (instr->category == -1) && + (instr->opc == OPC_META_FLOW)); - pop_branch(ctx); push_branch(ctx, instr); + instr->flow.else_block = push_block(ctx); +} + +static struct ir3_instruction * +find_temporary(struct ir3_block *block, unsigned n) +{ + if (block->parent && !block->temporaries[n]) + return find_temporary(block->parent, n); + return block->temporaries[n]; +} + +static struct 
ir3_instruction * +find_output(struct ir3_block *block, unsigned n) +{ + if (block->parent && !block->outputs[n]) + return find_output(block->parent, n); + return block->outputs[n]; +} + +static struct ir3_instruction * +create_phi(struct fd3_compile_context *ctx, struct ir3_instruction *cond, + struct ir3_instruction *a, struct ir3_instruction *b) +{ + struct ir3_instruction *phi; + + compile_assert(ctx, cond); + + /* Either side of the condition could be null.. which + * indicates a variable written on only one side of the + * branch. Normally this should only be variables not + * used outside of that side of the branch. So we could + * just 'return a ? a : b;' in that case. But for better + * defined undefined behavior we just stick in imm{0.0}. + * In the common case of a value only used within the + * one side of the branch, the PHI instruction will not + * get scheduled + */ + if (!a) + a = create_immed(ctx, 0.0); + if (!b) + b = create_immed(ctx, 0.0); + + phi = instr_create(ctx, -1, OPC_META_PHI); + ir3_reg_create(phi, 0, 0); /* dummy dst */ + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond; + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a; + ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b; + + return phi; } static void @@ -1063,7 +1383,120 @@ trans_endif(const struct instr_translater *t, struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst) { - pop_branch(ctx); + struct ir3_instruction *instr; + struct ir3_block *ifb, *elseb; + struct ir3_instruction **ifout, **elseout; + unsigned i, ifnout = 0, elsenout = 0; + + pop_block(ctx); + + instr = pop_branch(ctx); + + compile_assert(ctx, (instr->category == -1) && + (instr->opc == OPC_META_FLOW)); + + ifb = instr->flow.if_block; + elseb = instr->flow.else_block; + /* if there is no else block, the parent block is used for the + * branch-not-taken src of the PHI instructions: + */ + if (!elseb) + elseb = ifb->parent; + + /* count up number of outputs for each block: */ + for (i = 0; i < 
ifb->ntemporaries; i++) { + if (ifb->temporaries[i]) + ifnout++; + if (elseb->temporaries[i]) + elsenout++; + } + for (i = 0; i < ifb->noutputs; i++) { + if (ifb->outputs[i]) + ifnout++; + if (elseb->outputs[i]) + elsenout++; + } + + ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout); + if (elseb != ifb->parent) + elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout); + + ifnout = 0; + elsenout = 0; + + /* generate PHI instructions for any temporaries written: */ + for (i = 0; i < ifb->ntemporaries; i++) { + struct ir3_instruction *a = ifb->temporaries[i]; + struct ir3_instruction *b = elseb->temporaries[i]; + + /* if temporary written in if-block, or if else block + * is present and temporary written in else-block: + */ + if (a || ((elseb != ifb->parent) && b)) { + struct ir3_instruction *phi; + + /* if only written on one side, find the closest + * enclosing update on other side: + */ + if (!a) + a = find_temporary(ifb, i); + if (!b) + b = find_temporary(elseb, i); + + ifout[ifnout] = a; + a = create_output(ifb, a, ifnout++); + + if (elseb != ifb->parent) { + elseout[elsenout] = b; + b = create_output(elseb, b, elsenout++); + } + + phi = create_phi(ctx, instr, a, b); + ctx->block->temporaries[i] = phi; + } + } + + /* .. 
and any outputs written: */ + for (i = 0; i < ifb->noutputs; i++) { + struct ir3_instruction *a = ifb->outputs[i]; + struct ir3_instruction *b = elseb->outputs[i]; + + /* if output written in if-block, or if else block + * is present and output written in else-block: + */ + if (a || ((elseb != ifb->parent) && b)) { + struct ir3_instruction *phi; + + /* if only written on one side, find the closest + * enclosing update on other side: + */ + if (!a) + a = find_output(ifb, i); + if (!b) + b = find_output(elseb, i); + + ifout[ifnout] = a; + a = create_output(ifb, a, ifnout++); + + if (elseb != ifb->parent) { + elseout[elsenout] = b; + b = create_output(elseb, b, elsenout++); + } + + phi = create_phi(ctx, instr, a, b); + ctx->block->outputs[i] = phi; + } + } + + ifb->noutputs = ifnout; + ifb->outputs = ifout; + + if (elseb != ifb->parent) { + elseb->noutputs = elsenout; + elseb->outputs = elseout; + } + + // TODO maybe we want to compact block->inputs? } /* @@ -1161,23 +1594,6 @@ instr_cat2(const struct instr_translater *t, put_dst(ctx, inst, dst); } -static bool is_mad(opc_t opc) -{ - switch (opc) { - case OPC_MAD_U16: - case OPC_MADSH_U16: - case OPC_MAD_S16: - case OPC_MADSH_M16: - case OPC_MAD_U24: - case OPC_MAD_S24: - case OPC_MAD_F16: - case OPC_MAD_F32: - return true; - default: - return false; - } -} - static void instr_cat3(const struct instr_translater *t, struct fd3_compile_context *ctx, @@ -1217,27 +1633,21 @@ instr_cat4(const struct instr_translater *t, struct tgsi_dst_register *dst = get_dst(ctx, inst); struct tgsi_src_register *src = &inst->Src[0].Register; struct ir3_instruction *instr; - unsigned i, n; + unsigned i; /* seems like blob compiler avoids const as src.. 
*/ if (is_const(src)) src = get_unconst(ctx, src); - /* worst case: */ - add_nop(ctx, 6); - /* we need to replicate into each component: */ - for (i = 0, n = 0; i < 4; i++) { + for (i = 0; i < 4; i++) { if (dst->WriteMask & (1 << i)) { - if (n++) - add_nop(ctx, 1); instr = instr_create(ctx, 4, t->opc); add_dst_reg(ctx, instr, dst, i); add_src_reg(ctx, instr, src, src->SwizzleX); } } - regmask_set(&ctx->needs_ss, instr->regs[0]); put_dst(ctx, inst, dst); } @@ -1287,13 +1697,12 @@ decl_semantic(const struct tgsi_declaration_semantic *sem) return fd3_semantic_name(sem->Name, sem->Index); } -static int +static void decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) { struct fd3_shader_stateobj *so = ctx->so; unsigned base = ctx->base_reg[TGSI_FILE_INPUT]; unsigned i, flags = 0; - int nop = 0; /* I don't think we should get frag shader input without * semantic info? Otherwise how do inputs get linked to @@ -1308,12 +1717,12 @@ decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) for (i = decl->Range.First; i <= decl->Range.Last; i++) { unsigned n = so->inputs_count++; unsigned r = regid(i + base, 0); - unsigned ncomp; + unsigned ncomp, j; /* TODO use ctx->info.input_usage_mask[decl->Range.n] to figure out ncomp: */ ncomp = 4; - DBG("decl in -> r%d", i + base); // XXX + DBG("decl in -> r%d", i + base); so->inputs[n].semantic = decl_semantic(&decl->Semantic); so->inputs[n].compmask = (1 << ncomp) - 1; @@ -1323,33 +1732,33 @@ decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) so->total_in += ncomp; - /* for frag shaders, we need to generate the corresponding bary instr: */ - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - unsigned j; + for (j = 0; j < ncomp; j++) { + struct ir3_instruction *instr; - for (j = 0; j < ncomp; j++) { - struct ir3_instruction *instr; - struct ir3_register *dst; + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + struct ir3_register *src; instr = instr_create(ctx, 2, 
OPC_BARY_F); /* dst register: */ - dst = ir3_reg_create(instr, r + j, flags); - ctx->last_input = dst; + ir3_reg_create(instr, r + j, flags); /* input position: */ ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = so->inputs[n].inloc + j - 8; /* input base (always r0.xy): */ - ir3_reg_create(instr, regid(0,0), 0)->wrmask = 0x3; + src = ir3_reg_create(instr, regid(0,0), IR3_REG_SSA); + src->wrmask = 0x3; + src->instr = ctx->frag_pos; + + } else { + instr = create_input(ctx->block, NULL, (i * 4) + j); } - nop = 6; + ctx->block->inputs[(i * 4) + j] = instr; } } - - return nop; } static void @@ -1361,9 +1770,9 @@ decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) unsigned name = decl->Semantic.Name; unsigned i; - compile_assert(ctx, decl->Declaration.Semantic); // TODO is this ever not true? + compile_assert(ctx, decl->Declaration.Semantic); - DBG("decl out[%d] -> r%d", name, decl->Range.First + base); // XXX + DBG("decl out[%d] -> r%d", name, decl->Range.First + base); if (ctx->type == TGSI_PROCESSOR_VERTEX) { switch (name) { @@ -1396,8 +1805,19 @@ decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) for (i = decl->Range.First; i <= decl->Range.Last; i++) { unsigned n = so->outputs_count++; + unsigned ncomp, j; + + ncomp = 4; + so->outputs[n].semantic = decl_semantic(&decl->Semantic); so->outputs[n].regid = regid(i + base, comp); + + /* avoid undefined outputs, stick a dummy mov from imm{0.0}, + * which if the output is actually assigned will be over- + * written + */ + for (j = 0; j < ncomp; j++) + ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0); } } @@ -1410,8 +1830,19 @@ decl_samp(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl) static void compile_instructions(struct fd3_compile_context *ctx) { - struct ir3_shader *ir = ctx->ir; - int nop = 0; + push_block(ctx); + + /* for fragment shader, we have a single input register (r0.xy) + * which is used as the base for bary.f varying fetch 
instrs: + */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + struct ir3_instruction *instr; + instr = ir3_instr_create(ctx->block, -1, OPC_META_FI); + ir3_reg_create(instr, 0, 0); + ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */ + ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */ + ctx->frag_pos = instr; + } while (!tgsi_parse_end_of_tokens(&ctx->parser)) { tgsi_parse_token(&ctx->parser); @@ -1423,7 +1854,7 @@ compile_instructions(struct fd3_compile_context *ctx) if (decl->Declaration.File == TGSI_FILE_OUTPUT) { decl_out(ctx, decl); } else if (decl->Declaration.File == TGSI_FILE_INPUT) { - nop = decl_in(ctx, decl); + decl_in(ctx, decl); } else if (decl->Declaration.File == TGSI_FILE_SAMPLER) { decl_samp(ctx, decl); } @@ -1446,9 +1877,6 @@ compile_instructions(struct fd3_compile_context *ctx) unsigned opc = inst->Instruction.Opcode; const struct instr_translater *t = &translaters[opc]; - add_nop(ctx, nop); - nop = 0; - if (t->fxn) { t->fxn(t, ctx, inst); ctx->num_internal_temps = 0; @@ -1468,6 +1896,8 @@ compile_instructions(struct fd3_compile_context *ctx) break; } + instr_finish(ctx); + break; } default: @@ -1475,13 +1905,38 @@ compile_instructions(struct fd3_compile_context *ctx) } } - if (ir->instrs_count > 0) - ir->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; + /* fixup actual inputs for frag shader: */ + if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + struct ir3_instruction *instr; + + ctx->block->ninputs = 2; - if (ctx->last_input) - ctx->last_input->flags |= IR3_REG_EI; + /* r0.x */ + instr = create_input(ctx->block, NULL, 0); + ctx->block->inputs[0] = instr; + ctx->frag_pos->regs[1]->instr = instr; - handle_last_rel(ctx); + /* r0.y */ + instr = create_input(ctx->block, NULL, 1); + ctx->block->inputs[1] = instr; + ctx->frag_pos->regs[2]->instr = instr; + } +} + +static void +compile_dump(struct fd3_compile_context *ctx) +{ + const char *name = (ctx->so->type == SHADER_VERTEX) ? 
"vert" : "frag"; + static unsigned n = 0; + char fname[16]; + FILE *f; + snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++); + f = fopen(fname, "w"); + if (!f) + return; + ir3_block_depth(ctx->block); + ir3_shader_dump(ctx->ir, name, ctx->block, f); + fclose(f); } int @@ -1489,6 +1944,8 @@ fd3_compile_shader(struct fd3_shader_stateobj *so, const struct tgsi_token *tokens) { struct fd3_compile_context ctx; + unsigned i, actual_in; + int ret = 0; assert(!so->ir); @@ -1496,12 +1953,91 @@ fd3_compile_shader(struct fd3_shader_stateobj *so, assert(so->ir); - if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) - return -1; + if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) { + ret = -1; + goto out; + } compile_instructions(&ctx); + if (fd_mesa_debug & FD_DBG_OPTDUMP) + compile_dump(&ctx); + + ret = ir3_block_flatten(ctx.block); + if (ret < 0) + goto out; + if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP)) + compile_dump(&ctx); + + ir3_block_cp(ctx.block); + + if (fd_mesa_debug & FD_DBG_OPTDUMP) + compile_dump(&ctx); + + ir3_block_depth(ctx.block); + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER DEPTH:\n"); + ir3_dump_instr_list(ctx.block->head); + } + + ir3_block_sched(ctx.block); + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER SCHED:\n"); + ir3_dump_instr_list(ctx.block->head); + } + + ret = ir3_block_ra(ctx.block, so->type); + if (ret) + goto out; + + if (fd_mesa_debug & FD_DBG_OPTMSGS) { + printf("AFTER RA:\n"); + ir3_dump_instr_list(ctx.block->head); + } + + /* fixup input/outputs: */ + for (i = 0; i < so->outputs_count; i++) { + so->outputs[i].regid = ctx.block->outputs[i*4]->regs[0]->num; + /* preserve hack for depth output.. 
tgsi writes depth to .z, + * but what we give the hw is the scalar register: + */ + if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) && + (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION)) + so->outputs[i].regid += 2; + } + /* Note that some or all channels of an input may be unused: */ + actual_in = 0; + for (i = 0; i < so->inputs_count; i++) { + unsigned j, regid = ~0, compmask = 0; + for (j = 0; j < 4; j++) { + struct ir3_instruction *in = ctx.block->inputs[(i*4) + j]; + if (in) { + compmask |= (1 << j); + regid = in->regs[0]->num - j; + actual_in++; + } + } + so->inputs[i].regid = regid; + so->inputs[i].compmask = compmask; + } + + /* fragment shader always gets full vec4's even if it doesn't + * fetch all components, but vertex shader we need to update + * with the actual number of components fetched, otherwise things + * will hang due to mismatch between VFD_DECODE's and + * TOTALATTRTOVS + */ + if (so->type == SHADER_VERTEX) + so->total_in = actual_in; + +out: + if (ret) { + ir3_shader_destroy(so->ir); + so->ir = NULL; + } compile_free(&ctx); - return 0; + return ret; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c index b5715bea934..9ace26ebdbf 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler_old.c @@ -1161,23 +1161,6 @@ instr_cat2(const struct instr_translater *t, put_dst(ctx, inst, dst); } -static bool is_mad(opc_t opc) -{ - switch (opc) { - case OPC_MAD_U16: - case OPC_MADSH_U16: - case OPC_MAD_S16: - case OPC_MADSH_M16: - case OPC_MAD_U24: - case OPC_MAD_S24: - case OPC_MAD_F16: - case OPC_MAD_F32: - return true; - default: - return false; - } -} - static void instr_cat3(const struct instr_translater *t, struct fd3_compile_context *ctx, diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index d1776144cca..793e703b726 100644 ---
a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -80,9 +80,9 @@ fixup_vp_regfootprint(struct fd3_shader_stateobj *so) { unsigned i; for (i = 0; i < so->inputs_count; i++) - so->info.max_reg = MAX2(so->info.max_reg, so->inputs[i].regid >> 2); + so->info.max_reg = MAX2(so->info.max_reg, (so->inputs[i].regid + 3) >> 2); for (i = 0; i < so->outputs_count; i++) - so->info.max_reg = MAX2(so->info.max_reg, so->outputs[i].regid >> 2); + so->info.max_reg = MAX2(so->info.max_reg, (so->outputs[i].regid + 3) >> 2); } static struct fd3_shader_stateobj * diff --git a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h b/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h index b0f78341131..a79998ef56f 100644 --- a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h +++ b/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h @@ -190,6 +190,22 @@ typedef enum { OPC_LDC_4 = 30, OPC_LDLV = 31, + /* meta instructions (category -1): */ + /* placeholder instr to mark inputs/outputs: */ + OPC_META_INPUT = 0, + OPC_META_OUTPUT = 1, + /* The "fan-in" and "fan-out" instructions are used for keeping + * track of instructions that write to multiple dst registers + * (fan-out) like texture sample instructions, or read multiple + * consecutive scalar registers (fan-in) (bary.f, texture samp) + */ + OPC_META_FO = 2, + OPC_META_FI = 3, + /* branches/flow control */ + OPC_META_FLOW = 4, + OPC_META_PHI = 5, + + } opc_t; typedef enum { @@ -643,4 +659,21 @@ static inline uint32_t instr_opc(instr_t *instr) } } +static inline bool is_mad(opc_t opc) +{ + switch (opc) { + case OPC_MAD_U16: + case OPC_MADSH_U16: + case OPC_MAD_S16: + case OPC_MADSH_M16: + case OPC_MAD_U24: + case OPC_MAD_S24: + case OPC_MAD_F16: + case OPC_MAD_F32: + return true; + default: + return false; + } +} + #endif /* INSTR_A3XX_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.c b/src/gallium/drivers/freedreno/a3xx/ir3.c index 2a06d42c7d6..2a68a8e0ba0 100644 --- 
a/src/gallium/drivers/freedreno/a3xx/ir3.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3.c @@ -36,7 +36,7 @@ /* simple allocator to carve allocations out of an up-front allocated heap, * so that we can free everything easily in one shot. */ -static void * ir3_alloc(struct ir3_shader *shader, int sz) +void * ir3_alloc(struct ir3_shader *shader, int sz) { void *ptr = &shader->heap[shader->heap_idx]; shader->heap_idx += align(sz, 4); diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h index 896bec114fa..ccd3b0b54b4 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3.h +++ b/src/gallium/drivers/freedreno/a3xx/ir3.h @@ -65,6 +65,11 @@ struct ir3_register { * that the shader needs no more input: */ IR3_REG_EI = 0x200, + /* meta-flags, for intermediate stages of IR, ie. + * before register assignment is done: + */ + IR3_REG_SSA = 0x1000, /* 'instr' is ptr to assigning instr */ + IR3_REG_IA = 0x2000, /* meta-input dst is "assigned" */ } flags; union { /* normal registers: @@ -77,6 +82,10 @@ struct ir3_register { float fim_val; /* relative: */ int offset; + /* for IR3_REG_SSA, src registers contain ptr back to + * assigning instruction. + */ + struct ir3_instruction *instr; }; /* used for cat5 instructions, but also for internal/IR level @@ -139,6 +148,10 @@ struct ir3_instruction { IR3_INSTR_P = 0x080, IR3_INSTR_S = 0x100, IR3_INSTR_S2EN = 0x200, + /* meta-flags, for intermediate stages of IR, ie. 
+ * before register assignment is done: + */ + IR3_INSTR_MARK = 0x1000, } flags; int repeat; unsigned regs_count; @@ -171,7 +184,33 @@ struct ir3_instruction { int offset; int iim_val; } cat6; + /* for meta-instructions, just used to hold extra data + * before instruction scheduling, etc + */ + struct { + int off; /* component/offset */ + } fo; + struct { + struct ir3_block *if_block, *else_block; + } flow; + struct { + struct ir3_block *block; + } inout; }; + + /* transient values used during various algorithms: */ + union { + /* The instruction depth is the max dependency distance to output. + * + * You can also think of it as the "cost", if we did any sort of + * optimization for register footprint. Ie. a value that is just + * result of moving a const to a reg would have a low cost, so + * it could make sense to duplicate the instruction at various + * points where the result is needed to reduce register footprint. + */ + unsigned depth; + }; + struct ir3_instruction *next; #ifdef DEBUG uint32_t serialno; #endif @@ -201,6 +240,7 @@ struct ir3_shader * ir3_shader_create(void); void ir3_shader_destroy(struct ir3_shader *shader); void * ir3_shader_assemble(struct ir3_shader *shader, struct ir3_shader_info *info); +void * ir3_alloc(struct ir3_shader *shader, int sz); struct ir3_block * ir3_block_create(struct ir3_shader *shader, unsigned ntmp, unsigned nin, unsigned nout); @@ -208,11 +248,44 @@ struct ir3_block * ir3_block_create(struct ir3_shader *shader, struct ir3_instruction * ir3_instr_create(struct ir3_block *block, int category, opc_t opc); struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr); +const char *ir3_instr_name(struct ir3_instruction *instr); struct ir3_register * ir3_reg_create(struct ir3_instruction *instr, int num, int flags); +static inline bool ir3_instr_check_mark(struct ir3_instruction *instr) +{ + if (instr->flags & IR3_INSTR_MARK) + return true; /* already visited */ + instr->flags ^= IR3_INSTR_MARK; + return false;
+} + +static inline void ir3_shader_clear_mark(struct ir3_shader *shader) +{ + /* TODO would be nice to drop the instruction array.. for + * new compiler, _clear_mark() is all we use it for, and + * we could probably manage a linked list instead.. + */ + unsigned i; + for (i = 0; i < shader->instrs_count; i++) { + struct ir3_instruction *instr = shader->instrs[i]; + instr->flags &= ~IR3_INSTR_MARK; + } +} + +static inline int ir3_instr_regno(struct ir3_instruction *instr, + struct ir3_register *reg) +{ + unsigned i; + for (i = 0; i < instr->regs_count; i++) + if (reg == instr->regs[i]) + return i; + return -1; +} + + /* comp: * 0 - x * 1 - y @@ -254,6 +327,15 @@ static inline bool is_input(struct ir3_instruction *instr) return (instr->category == 2) && (instr->opc == OPC_BARY_F); } +static inline bool is_meta(struct ir3_instruction *instr) +{ + /* TODO how should we count PHI (and maybe fan-in/out) which + * might actually contribute some instructions to the final + * result? + */ + return (instr->category == -1); +} + static inline bool is_gpr(struct ir3_register *reg) { return !(reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)); @@ -262,13 +344,39 @@ static inline bool is_gpr(struct ir3_register *reg) /* TODO combine is_gpr()/reg_gpr().. */ static inline bool reg_gpr(struct ir3_register *r) { - if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV)) + if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA)) return false; if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) return false; return true; } +/* dump: */ +#include <stdio.h> +void ir3_shader_dump(struct ir3_shader *shader, const char *name, + struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3_shader? 
*/, + FILE *f); +void ir3_dump_instr_single(struct ir3_instruction *instr); +void ir3_dump_instr_list(struct ir3_instruction *instr); + +/* flatten if/else: */ +int ir3_block_flatten(struct ir3_block *block); + +/* depth calculation: */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n); +void ir3_block_depth(struct ir3_block *block); + +/* copy-propagate: */ +void ir3_block_cp(struct ir3_block *block); + +/* scheduling: */ +void ir3_block_sched(struct ir3_block *block); + +/* register assignment: */ +int ir3_block_ra(struct ir3_block *block, enum shader_t type); + + #ifndef ARRAY_SIZE # define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_cp.c b/src/gallium/drivers/freedreno/a3xx/ir3_cp.c new file mode 100644 index 00000000000..81f6c902816 --- /dev/null +++ b/src/gallium/drivers/freedreno/a3xx/ir3_cp.c @@ -0,0 +1,155 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "ir3.h" + +/* + * Copy Propagate: + * + * TODO probably want some sort of visitor sort of interface to + * avoid duplicating the same graph traversal logic everywhere.. + * + */ + +static void block_cp(struct ir3_block *block); +static struct ir3_instruction * instr_cp(struct ir3_instruction *instr, bool keep); + +static bool is_eligible_mov(struct ir3_instruction *instr) +{ + if ((instr->category == 1) && + (instr->cat1.src_type == instr->cat1.dst_type)) { + struct ir3_register *src = instr->regs[1]; + if ((src->flags & IR3_REG_SSA) && + /* TODO: propagate abs/neg modifiers if possible */ + !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE))) + return true; + } + return false; +} + +static void walk_children(struct ir3_instruction *instr, bool keep) +{ + unsigned i; + + /* walk down the graph from each src: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) + src->instr = instr_cp(src->instr, keep); + } +} + +static struct ir3_instruction * +instr_cp_fanin(struct ir3_instruction *instr) +{ + unsigned i; + + /* we need to handle fanin specially, to detect cases + * when we need to keep a mov + */ + + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) { + struct ir3_instruction *cand = + instr_cp(src->instr, false); + + /* if the candidate is a fanout, then keep + * the move. + * + * This is a bit, um, fragile, but it should + * catch the extra mov's that the front-end + * puts in for us already in these cases. 
+ */ + if (is_meta(cand) && (cand->opc == OPC_META_FO)) + cand = instr_cp(src->instr, true); + + src->instr = cand; + } + } + + walk_children(instr, false); + + return instr; + +} + +static struct ir3_instruction * +instr_cp(struct ir3_instruction *instr, bool keep) +{ + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(instr)) + return instr; + + if (is_meta(instr) && (instr->opc == OPC_META_FI)) + return instr_cp_fanin(instr); + + if (is_eligible_mov(instr) && !keep) { + struct ir3_register *src = instr->regs[1]; + return instr_cp(src->instr, false); + } + + walk_children(instr, false); + + return instr; +} + +static void block_cp(struct ir3_block *block) +{ + unsigned i, j; + + for (i = 0; i < block->noutputs; i++) { + if (block->outputs[i]) { + struct ir3_instruction *out = + instr_cp(block->outputs[i], false); + + /* To deal with things like this: + * + * 43: MOV OUT[2], TEMP[5] + * 44: MOV OUT[0], TEMP[5] + * + * we need to ensure that no two outputs point to + * the same instruction + */ + for (j = 0; j < i; j++) { + if (block->outputs[j] == out) { + out = instr_cp(block->outputs[i], true); + break; + } + } + + block->outputs[i] = out; + } + } +} + +void ir3_block_cp(struct ir3_block *block) +{ + ir3_shader_clear_mark(block->shader); + block_cp(block); +} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c new file mode 100644 index 00000000000..580ae08da2c --- /dev/null +++ b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c @@ -0,0 +1,156 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, 
distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "util/u_math.h" + +#include "ir3.h" + +/* + * Instruction Depth: + * + * Calculates weighted instruction depth, ie. the sum of # of needed + * instructions plus delay slots back to original input (ie INPUT or + * CONST). That is to say, an instruction's depth is: + * + * depth(instr) { + * d = 0; + * // for each src register: + * foreach (src in instr->regs[1..n]) + * d = max(d, delayslots(src->instr, n) + depth(src->instr)); + * return d + 1; + * } + * + * After an instruction's depth is calculated, it is inserted into the + * block's depth sorted list, which is used by the scheduling pass.
+ */ + +/* calculate required # of delay slots between the instruction that + * assigns a value and the one that consumes + */ +int ir3_delayslots(struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned n) +{ + /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal + * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch + * handled with sync bits + */ + + if (is_meta(assigner)) + return 0; + + /* handled via sync flags: */ + if (is_sfu(assigner) || is_tex(assigner)) + return 0; + + /* assigner must be alu: */ + if (is_sfu(consumer) || is_tex(consumer)) { + return 8; + } else if ((consumer->category == 3) && + is_mad(consumer->opc) && (n == 2)) { + /* special case, 3rd src to cat3 not required on first cycle */ + return 2; + } else { + return 5; + } +} + +static void insert_by_depth(struct ir3_instruction *instr) +{ + struct ir3_block *block = instr->block; + struct ir3_instruction *n = block->head; + struct ir3_instruction *p = NULL; + + while (n && (n != instr) && (n->depth > instr->depth)) { + p = n; + n = n->next; + } + + instr->next = n; + if (p) + p->next = instr; + else + block->head = instr; +} + +static void ir3_instr_depth(struct ir3_instruction *instr) +{ + unsigned i; + + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(instr)) + return; + + instr->depth = 0; + + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) { + unsigned sd; + + /* visit child to compute its depth: */ + ir3_instr_depth(src->instr); + + sd = ir3_delayslots(src->instr, instr, i-1) + + src->instr->depth; + + instr->depth = MAX2(instr->depth, sd); + } + } + + /* meta-instructions don't add cycles, other than PHI.. which + * might translate to a real instruction.. + * + * well, not entirely true, fan-in/out, etc might need + * to generate some extra mov's in edge cases, etc..
probably + * we might want to do depth calculation considering the worst + * case for these?? + */ + if (!is_meta(instr)) + instr->depth++; + + insert_by_depth(instr); +} + +void ir3_block_depth(struct ir3_block *block) +{ + unsigned i; + + block->head = NULL; + + ir3_shader_clear_mark(block->shader); + for (i = 0; i < block->noutputs; i++) + if (block->outputs[i]) + ir3_instr_depth(block->outputs[i]); + + /* at this point, any unvisited input is unused: */ + for (i = 0; i < block->ninputs; i++) { + struct ir3_instruction *in = block->inputs[i]; + if (in && !ir3_instr_check_mark(in)) + block->inputs[i] = NULL; + } +} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_dump.c b/src/gallium/drivers/freedreno/a3xx/ir3_dump.c new file mode 100644 index 00000000000..0afd04861a3 --- /dev/null +++ b/src/gallium/drivers/freedreno/a3xx/ir3_dump.c @@ -0,0 +1,416 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> + +#include "ir3.h" + +#define PTRID(x) ((unsigned long)(x)) + +struct ir3_dump_ctx { + FILE *f; + bool verbose; +}; + +static void dump_instr_name(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + /* for debugging: */ + if (ctx->verbose) { +#ifdef DEBUG + fprintf(ctx->f, "%04u:", instr->serialno); +#endif + fprintf(ctx->f, "%03u: ", instr->depth); + } + + if (is_meta(instr)) { + switch(instr->opc) { + case OPC_META_PHI: + fprintf(ctx->f, "Φ"); + break; + default: + /* shouldn't hit here.. just for debugging: */ + switch (instr->opc) { + case OPC_META_INPUT: fprintf(ctx->f, "_meta:in"); break; + case OPC_META_OUTPUT: fprintf(ctx->f, "_meta:out"); break; + case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break; + case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break; + case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break; + case OPC_META_PHI: fprintf(ctx->f, "_meta:phi"); break; + + default: fprintf(ctx->f, "_meta:%d", instr->opc); break; + } + break; + } + } else if (instr->category == 1) { + static const char *type[] = { + [TYPE_F16] = "f16", + [TYPE_F32] = "f32", + [TYPE_U16] = "u16", + [TYPE_U32] = "u32", + [TYPE_S16] = "s16", + [TYPE_S32] = "s32", + [TYPE_U8] = "u8", + [TYPE_S8] = "s8", + }; + if (instr->cat1.src_type == instr->cat1.dst_type) + fprintf(ctx->f, "mov"); + else + fprintf(ctx->f, "cov"); + fprintf(ctx->f, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]); + } else { + fprintf(ctx->f, "%s", ir3_instr_name(instr)); + if (instr->flags & IR3_INSTR_3D) + fprintf(ctx->f, ".3d"); + if (instr->flags & IR3_INSTR_A) + fprintf(ctx->f, ".a"); + if (instr->flags & IR3_INSTR_O) + 
fprintf(ctx->f, ".o"); + if (instr->flags & IR3_INSTR_P) + fprintf(ctx->f, ".p"); + if (instr->flags & IR3_INSTR_S) + fprintf(ctx->f, ".s"); + if (instr->flags & IR3_INSTR_S2EN) + fprintf(ctx->f, ".s2en"); + } +} + +static void dump_reg_name(struct ir3_dump_ctx *ctx, + struct ir3_register *reg) +{ + if ((reg->flags & IR3_REG_ABS) && (reg->flags & IR3_REG_NEGATE)) + fprintf(ctx->f, "(absneg)"); + else if (reg->flags & IR3_REG_NEGATE) + fprintf(ctx->f, "(neg)"); + else if (reg->flags & IR3_REG_ABS) + fprintf(ctx->f, "(abs)"); + + if (reg->flags & IR3_REG_IMMED) { + fprintf(ctx->f, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val); + } else if (reg->flags & IR3_REG_SSA) { + if (ctx->verbose) { + fprintf(ctx->f, "_["); + dump_instr_name(ctx, reg->instr); + fprintf(ctx->f, "]"); + } + } else { + if (reg->flags & IR3_REG_HALF) + fprintf(ctx->f, "h"); + if (reg->flags & IR3_REG_CONST) + fprintf(ctx->f, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + else + fprintf(ctx->f, "r%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]); + } +} + +static void ir3_instr_dump(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr); +static void ir3_block_dump(struct ir3_dump_ctx *ctx, + struct ir3_block *block, const char *name); + +static void dump_instr(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(instr)) + return; + + /* some meta-instructions need to be handled specially: */ + if (is_meta(instr)) { + if ((instr->opc == OPC_META_FO) || + (instr->opc == OPC_META_FI)) { + unsigned i; + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) + dump_instr(ctx, reg->instr); + } + } else if (instr->opc == OPC_META_FLOW) { + struct ir3_register *reg = instr->regs[1]; + ir3_block_dump(ctx, instr->flow.if_block, "if"); + if (instr->flow.else_block) + ir3_block_dump(ctx, instr->flow.else_block, "else"); + if 
(reg->flags & IR3_REG_SSA) + dump_instr(ctx, reg->instr); + } else if (instr->opc == OPC_META_PHI) { + /* treat like a normal instruction: */ + ir3_instr_dump(ctx, instr); + } + } else { + ir3_instr_dump(ctx, instr); + } +} + +/* arrarraggh! if link is to something outside of the current block, we + * need to defer emitting the link until the end of the block, since the + * edge triggers pre-creation of the node it links to inside the cluster, + * even though it is meant to be outside.. + */ +static struct { + char buf[40960]; + unsigned n; +} edge_buf; + +/* helper to print or defer: */ +static void printdef(struct ir3_dump_ctx *ctx, + bool defer, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + if (defer) { + unsigned n = edge_buf.n; + n += vsnprintf(&edge_buf.buf[n], sizeof(edge_buf.buf) - n, + fmt, ap); + edge_buf.n = n; + } else { + vfprintf(ctx->f, fmt, ap); + } + va_end(ap); +} + +static void dump_link2(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr, const char *target, bool defer) +{ + /* some meta-instructions need to be handled specially: */ + if (is_meta(instr)) { + if (instr->opc == OPC_META_INPUT) { + printdef(ctx, defer, "input%lx:<in%u>:w -> %s", + PTRID(instr->inout.block), + instr->regs[0]->num, target); + } else if (instr->opc == OPC_META_FO) { + struct ir3_register *reg = instr->regs[1]; + dump_link2(ctx, reg->instr, target, defer); + printdef(ctx, defer, "[label=\".%c\"]", + "xyzw"[instr->fo.off & 0x3]); + } else if (instr->opc == OPC_META_FI) { + unsigned i; + + /* recursively dump all parents and links */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) { + dump_link2(ctx, reg->instr, target, defer); + printdef(ctx, defer, "[label=\".%c\"]", + "xyzw"[(i - 1) & 0x3]); + } + } + } else if (instr->opc == OPC_META_OUTPUT) { + printdef(ctx, defer, "output%lx:<out%u>:w -> %s", + PTRID(instr->inout.block), + instr->regs[0]->num, target); + } else if 
(instr->opc == OPC_META_PHI) { + /* treat like a normal instruction: */ + printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target); + } + } else { + printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target); + } +} + +static void dump_link(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr, + struct ir3_block *block, const char *target) +{ + bool defer = instr->block != block; + dump_link2(ctx, instr, target, defer); + printdef(ctx, defer, "\n"); +} + +static struct ir3_register *follow_flow(struct ir3_register *reg) +{ + if (reg->flags & IR3_REG_SSA) { + struct ir3_instruction *instr = reg->instr; + /* go with the flow.. */ + if (is_meta(instr) && (instr->opc == OPC_META_FLOW)) + return instr->regs[1]; + } + return reg; +} + +static void ir3_instr_dump(struct ir3_dump_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i; + + fprintf(ctx->f, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{", + PTRID(instr)); + dump_instr_name(ctx, instr); + + /* destination register: */ + fprintf(ctx->f, "|<dst0>"); + + /* source register(s): */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = follow_flow(instr->regs[i]); + + fprintf(ctx->f, "|"); + + if (reg->flags & IR3_REG_SSA) + fprintf(ctx->f, "<src%u> ", (i - 1)); + + dump_reg_name(ctx, reg); + } + + fprintf(ctx->f, "}\"];\n"); + + /* and recursively dump dependent instructions: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + char target[32]; /* link target */ + + if (!(reg->flags & IR3_REG_SSA)) + continue; + + snprintf(target, sizeof(target), "instr%lx:<src%u>", + PTRID(instr), (i - 1)); + + dump_instr(ctx, reg->instr); + dump_link(ctx, follow_flow(reg)->instr, instr->block, target); + } +} + +static void ir3_block_dump(struct ir3_dump_ctx *ctx, + struct ir3_block *block, const char *name) +{ + unsigned i, n; + + n = edge_buf.n; + + fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block)); + 
fprintf(ctx->f, "label=\"%s\";\n", name); + + /* draw inputs: */ + fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block)); + for (i = 0; i < block->ninputs; i++) + if (block->inputs[i]) + fprintf(ctx->f, "|<in%u> i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); + fprintf(ctx->f, "\"];\n"); + + /* draw instruction graph: */ + for (i = 0; i < block->noutputs; i++) + dump_instr(ctx, block->outputs[i]); + + /* draw outputs: */ + fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block)); + for (i = 0; i < block->noutputs; i++) + fprintf(ctx->f, "|<out%u> o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]); + fprintf(ctx->f, "\"];\n"); + + /* and links to outputs: */ + for (i = 0; i < block->noutputs; i++) { + char target[32]; /* link target */ + + /* NOTE: there could be outputs that are never assigned, + * so skip them + */ + if (!block->outputs[i]) + continue; + + snprintf(target, sizeof(target), "output%lx:<out%u>:e", + PTRID(block), i); + + dump_link(ctx, block->outputs[i], block, target); + } + + fprintf(ctx->f, "}\n"); + + /* and links to inputs: */ + if (block->parent) { + for (i = 0; i < block->ninputs; i++) { + char target[32]; /* link target */ + + if (!block->inputs[i]) + continue; + + dump_instr(ctx, block->inputs[i]); + + snprintf(target, sizeof(target), "input%lx:<in%u>:e", + PTRID(block), i); + + dump_link(ctx, block->inputs[i], block, target); + } + } + + /* dump deferred edges: */ + if (edge_buf.n > n) { + fprintf(ctx->f, "%*s", edge_buf.n - n, &edge_buf.buf[n]); + edge_buf.n = n; + } +} + +void ir3_shader_dump(struct ir3_shader *shader, const char *name, + struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3_shader? 
*/, + FILE *f) +{ + struct ir3_dump_ctx ctx = { + .f = f, + }; + ir3_shader_clear_mark(shader); + fprintf(ctx.f, "digraph G {\n"); + fprintf(ctx.f, "rankdir=RL;\n"); + fprintf(ctx.f, "nodesep=0.25;\n"); + fprintf(ctx.f, "ranksep=1.5;\n"); + ir3_block_dump(&ctx, block, name); + fprintf(ctx.f, "}\n"); +} + +/* + * For Debugging: + */ + +void +ir3_dump_instr_single(struct ir3_instruction *instr) +{ + struct ir3_dump_ctx ctx = { + .f = stdout, + .verbose = true, + }; + unsigned i; + + dump_instr_name(&ctx, instr); + for (i = 0; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + printf(i ? ", " : " "); + dump_reg_name(&ctx, reg); + } + printf("\n"); +} + +void +ir3_dump_instr_list(struct ir3_instruction *instr) +{ + unsigned n = 0; + + while (instr) { + ir3_dump_instr_single(instr); + if (!is_meta(instr)) + n++; + instr = instr->next; + } + printf("%u instructions\n", n); +} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_flatten.c b/src/gallium/drivers/freedreno/a3xx/ir3_flatten.c new file mode 100644 index 00000000000..3cf76db5005 --- /dev/null +++ b/src/gallium/drivers/freedreno/a3xx/ir3_flatten.c @@ -0,0 +1,140 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include <stdarg.h> + +#include "ir3.h" + +/* + * Flatten: flatten out legs of if/else, etc + * + * TODO probably should use some heuristic to decide to not flatten + * if one side of the other is too large / deeply nested / whatever? + */ + +struct ir3_flatten_ctx { + struct ir3_block *block; + unsigned cnt; +}; + +static struct ir3_register *unwrap(struct ir3_register *reg) +{ + + if (reg->flags & IR3_REG_SSA) { + struct ir3_instruction *instr = reg->instr; + if (is_meta(instr)) { + switch (instr->opc) { + case OPC_META_OUTPUT: + case OPC_META_FLOW: + return instr->regs[1]; + default: + break; + } + } + } + return reg; +} + +static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i; + + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(instr)) + return; + + instr->block = ctx->block; + + /* TODO: maybe some threshold to decide whether to + * flatten or not?? 
+ */ + if (is_meta(instr)) { + if (instr->opc == OPC_META_PHI) { + struct ir3_register *cond, *t, *f; + + /* convert the PHI instruction to sel.{f16,f32} */ + instr->category = 3; + + /* instruction type based on dst size: */ + if (instr->regs[0]->flags & IR3_REG_HALF) + instr->opc = OPC_SEL_F16; + else + instr->opc = OPC_SEL_F32; + + /* swap around src register order, to match what + * hw expects: + */ + cond = instr->regs[1]; + t = instr->regs[2]; /* true val */ + f = instr->regs[3]; /* false val */ + + instr->regs[1] = unwrap(f); + instr->regs[2] = unwrap(cond); + instr->regs[3] = unwrap(t); + + ctx->cnt++; + } else if ((instr->opc == OPC_META_INPUT) && + (instr->regs_count == 2)) { + type_t ftype; + + if (instr->regs[0]->flags & IR3_REG_HALF) + ftype = TYPE_F16; + else + ftype = TYPE_F32; + + /* convert meta:input to mov: */ + instr->category = 1; + instr->cat1.src_type = ftype; + instr->cat1.dst_type = ftype; + } + } + + /* recursively visit children: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + if (src->flags & IR3_REG_SSA) + ir3_instr_flatten(ctx, src->instr); + } +} + +/* return >= 0 is # of phi's flattened, < 0 is error */ +int ir3_block_flatten(struct ir3_block *block) +{ + struct ir3_flatten_ctx ctx = { + .block = block, + }; + unsigned i; + + ir3_shader_clear_mark(block->shader); + for(i = 0; i < block->noutputs; i++) + if (block->outputs[i]) + ir3_instr_flatten(&ctx, block->outputs[i]); + + return ctx.cnt; +} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c new file mode 100644 index 00000000000..6c868e21791 --- /dev/null +++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c @@ -0,0 +1,580 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated 
documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" + +#include "ir3.h" +#include "ir3_visitor.h" + +/* + * Register Assignment: + * + * NOTE: currently only works on a single basic block.. need to think + * about how multiple basic blocks are going to get scheduled. But + * I think I want to re-arrange how blocks work, ie. get rid of the + * block nesting thing.. + * + * NOTE: we could do register coalescing (eliminate moves) as part of + * the RA step.. OTOH I think we need to do scheduling before register + * assignment. And if we remove a mov that effects scheduling (unless + * we leave a placeholder nop, which seems lame), so I'm not really + * sure how practical this is to do both in a single stage. But OTOH + * I'm not really sure a sane way for the CP stage to realize when it + * cannot remove a mov due to multi-register constraints.. 
+ * + */ + +struct ir3_ra_ctx { + struct ir3_block *block; + enum shader_t type; + int cnt; + bool error; +}; + +struct ir3_ra_assignment { + int8_t off; /* offset of instruction dst within range */ + uint8_t num; /* number of components for the range */ +}; + +static void ra_assign(struct ir3_ra_ctx *ctx, + struct ir3_instruction *assigner, int num); +static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr); + +/* + * Register Allocation: + */ + +#define REG(n, wm) (struct ir3_register){ \ + /*.flags = ((so)->half_precision) ? IR3_REG_HALF : 0,*/ \ + .num = (n), \ + .wrmask = TGSI_WRITEMASK_ ## wm, \ + } + +/* check that the register exists, is a GPR and is not special (a0/p0) */ +static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n) +{ + if ((n < instr->regs_count) && reg_gpr(instr->regs[n])) + return instr->regs[n]; + return NULL; +} + +static int output_base(struct ir3_ra_ctx *ctx) +{ + /* ugg, for fragment shader we need to have input at r0.x + * (or at least if there is a way to configure it, I can't + * see how because the blob driver always uses r0.x (ie. 
+ * all zeros) + */ + if (ctx->type == SHADER_FRAGMENT) + return 2; + return 0; +} + +/* live means read before written */ +static void compute_liveregs(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr, regmask_t *liveregs) +{ + struct ir3_block *block = instr->block; + regmask_t written; + unsigned i, j; + + regmask_init(liveregs); + regmask_init(&written); + + for (instr = instr->next; instr; instr = instr->next) { + struct ir3_register *r; + + if (is_meta(instr)) + continue; + + /* check first src's read: */ + for (j = 1; j < instr->regs_count; j++) { + r = reg_check(instr, j); + if (r) + regmask_set_if_not(liveregs, r, &written); + } + + /* then dst written (if assigned already): */ + if (instr->flags & IR3_INSTR_MARK) { + r = reg_check(instr, 0); + if (r) + regmask_set(&written, r); + } + } + + /* be sure to account for output registers too: */ + for (i = 0; i < block->noutputs; i++) { + struct ir3_register reg = REG(output_base(ctx) + i, X); + regmask_set_if_not(liveregs, ®, &written); + } +} + +/* calculate registers that are clobbered before last use of 'assigner'. + * This needs to be done backwards, although it could possibly be + * combined into compute_liveregs(). (Ie. compute_liveregs() could + * reverse the list, then do this part backwards reversing the list + * again back to original order.) Otoh, probably I should try to + * construct a proper interference graph instead. + * + * XXX this need to follow the same recursion path that is used for + * to rename/assign registers (ie. ra_assign_src()).. this is a bit + * ugly right now, maybe refactor into node iterator sort of things + * that iterates nodes in the correct order? 
+ */ +static bool compute_clobbers(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr, struct ir3_instruction *assigner, + regmask_t *liveregs) +{ + unsigned i; + bool live = false, was_live = false; + + if (instr == NULL) { + struct ir3_block *block = ctx->block; + + /* if at the end, check outputs: */ + for (i = 0; i < block->noutputs; i++) + if (block->outputs[i] == assigner) + return true; + return false; + } + + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if ((reg->flags & IR3_REG_SSA) && (reg->instr == assigner)) { + if (is_meta(instr)) { + switch (instr->opc) { + case OPC_META_INPUT: + // TODO + assert(0); + break; + case OPC_META_FO: + case OPC_META_FI: + was_live |= compute_clobbers(ctx, instr->next, + instr, liveregs); + break; + default: + break; + } + } + live = true; + break; + } + } + + was_live |= compute_clobbers(ctx, instr->next, assigner, liveregs); + + if (was_live && (instr->regs_count > 0) && + (instr->flags & IR3_INSTR_MARK) && + !is_meta(instr)) + regmask_set(liveregs, instr->regs[0]); + + return live || was_live; +} + +static int find_available(regmask_t *liveregs, int size) +{ + unsigned i; + for (i = 0; i < MAX_REG - size; i++) { + if (!regmask_get(liveregs, ®(i, X))) { + unsigned start = i++; + for (; (i < MAX_REG) && ((i - start) < size); i++) + if (regmask_get(liveregs, ®(i, X))) + break; + if ((i - start) >= size) + return start; + } + } + assert(0); + return -1; +} + +static int alloc_block(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr, int size) +{ + if (!instr) { + /* special case, allocating shader outputs. At this + * point, nothing is allocated, just start the shader + * outputs at r0.x and let compute_liveregs() take + * care of the rest from here: + */ + return 0; + } else { + regmask_t liveregs; + compute_liveregs(ctx, instr, &liveregs); + + // XXX XXX XXX XXX XXX XXX XXX XXX XXX + // XXX hack.. 
maybe ra_calc should give us a list of + // instrs to compute_clobbers() on? + if (is_meta(instr) && (instr->opc == OPC_META_INPUT) && + (instr->regs_count == 1)) { + unsigned i, base = instr->regs[0]->num & ~0x3; + for (i = 0; i < 4; i++) { + struct ir3_instruction *in = ctx->block->inputs[base + i]; + if (in) + compute_clobbers(ctx, in->next, in, &liveregs); + } + } else + // XXX XXX XXX XXX XXX XXX XXX XXX XXX + compute_clobbers(ctx, instr->next, instr, &liveregs); + return find_available(&liveregs, size); + } +} + +/* + * Constraint Calculation: + */ + +struct ra_calc_visitor { + struct ir3_visitor base; + struct ir3_ra_assignment a; +}; + +static inline struct ra_calc_visitor *ra_calc_visitor(struct ir3_visitor *v) +{ + return (struct ra_calc_visitor *)v; +} + +/* calculate register assignment for the instruction. If the register + * written by this instruction is required to be part of a range, to + * handle other (input/output/sam/bary.f/etc) contiguous register range + * constraints, that is calculated handled here. 
+ */ +static void ra_calc_dst(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_calc_visitor *c = ra_calc_visitor(v); + if (is_tex(instr)) { + c->a.off = 0; + c->a.num = 4; + } else { + c->a.off = 0; + c->a.num = 1; + } +} + +static void +ra_calc_dst_shader_input(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_calc_visitor *c = ra_calc_visitor(v); + struct ir3_block *block = instr->block; + struct ir3_register *dst = instr->regs[0]; + unsigned base = dst->num & ~0x3; + unsigned i, num = 0; + + assert(!(dst->flags & IR3_REG_IA)); + + /* check what input components we need: */ + for (i = 0; i < 4; i++) { + unsigned idx = base + i; + if ((idx < block->ninputs) && block->inputs[idx]) + num = i + 1; + } + + c->a.off = dst->num - base; + c->a.num = num; +} + +static void ra_calc_src_fanin(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_calc_visitor *c = ra_calc_visitor(v); + unsigned srcn = ir3_instr_regno(instr, reg) - 1; + c->a.off -= srcn; + c->a.num += srcn; + c->a.num = MAX2(c->a.num, instr->regs_count - 1); +} + +static const struct ir3_visitor_funcs calc_visitor_funcs = { + .instr = ir3_visit_instr, + .dst_shader_input = ra_calc_dst_shader_input, + .dst_fanout = ra_calc_dst, + .dst_fanin = ra_calc_dst, + .dst = ra_calc_dst, + .src_fanout = ir3_visit_reg, + .src_fanin = ra_calc_src_fanin, + .src = ir3_visit_reg, +}; + +static struct ir3_ra_assignment ra_calc(struct ir3_instruction *assigner) +{ + struct ra_calc_visitor v = { + .base.funcs = &calc_visitor_funcs, + }; + + ir3_visit_instr(&v.base, assigner); + + return v.a; +} + +/* + * Register Assignment: + */ + +struct ra_assign_visitor { + struct ir3_visitor base; + struct ir3_ra_ctx *ctx; + int num; +}; + +static inline struct ra_assign_visitor *ra_assign_visitor(struct ir3_visitor *v) +{ + return (struct ra_assign_visitor *)v; +} + +static void ra_assign_reg(struct 
ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + reg->flags &= ~IR3_REG_SSA; + reg->num = a->num; +} + +static void ra_assign_dst_shader_input(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + unsigned i, base = reg->num & ~0x3; + int off = base - reg->num; + + ra_assign_reg(v, instr, reg); + reg->flags |= IR3_REG_IA; + + /* trigger assignment of all our companion input components: */ + for (i = 0; i < 4; i++) { + struct ir3_instruction *in = instr->block->inputs[i+base]; + if (in && is_meta(in) && (in->opc == OPC_META_INPUT)) + ra_assign(a->ctx, in, a->num + off + i); + } +} + +static void ra_assign_dst_fanout(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + struct ir3_register *src = instr->regs[1]; + ra_assign_reg(v, instr, reg); + if (src->flags & IR3_REG_SSA) + ra_assign(a->ctx, src->instr, a->num - instr->fo.off); +} + +static void ra_assign_src_fanout(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + ra_assign_reg(v, instr, reg); + ra_assign(a->ctx, instr, a->num + instr->fo.off); +} + + +static void ra_assign_src_fanin(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + struct ra_assign_visitor *a = ra_assign_visitor(v); + unsigned j, srcn = ir3_instr_regno(instr, reg) - 1; + ra_assign_reg(v, instr, reg); + ra_assign(a->ctx, instr, a->num - srcn); + for (j = 1; j < instr->regs_count; j++) { + struct ir3_register *reg = instr->regs[j]; + if (reg->flags & IR3_REG_SSA) /* could be renamed already */ + ra_assign(a->ctx, reg->instr, a->num - srcn + j - 1); + } +} + +static const struct ir3_visitor_funcs assign_visitor_funcs = { + .instr = ir3_visit_instr, + .dst_shader_input 
= ra_assign_dst_shader_input, + .dst_fanout = ra_assign_dst_fanout, + .dst_fanin = ra_assign_reg, + .dst = ra_assign_reg, + .src_fanout = ra_assign_src_fanout, + .src_fanin = ra_assign_src_fanin, + .src = ra_assign_reg, +}; + +static void ra_assign(struct ir3_ra_ctx *ctx, + struct ir3_instruction *assigner, int num) +{ + struct ra_assign_visitor v = { + .base.funcs = &assign_visitor_funcs, + .ctx = ctx, + .num = num, + }; + + /* if we've already visited this instruction, bail now: */ + if (ir3_instr_check_mark(assigner)) { + debug_assert(assigner->regs[0]->num == num); + if (assigner->regs[0]->num != num) { + /* impossible situation, should have been resolved + * at an earlier stage by inserting extra mov's: + */ + ctx->error = true; + } + return; + } + + ir3_visit_instr(&v.base, assigner); +} + +/* + * + */ + +static void ir3_instr_ra(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr) +{ + struct ir3_ra_assignment a; + unsigned num; + + /* skip over nop's */ + if (instr->regs_count == 0) + return; + + /* if we've already visited this instruction, bail now: */ + if (instr->flags & IR3_INSTR_MARK) + return; + + /* allocate register(s): */ + a = ra_calc(instr); + num = alloc_block(ctx, instr, a.num) + a.off; + + ra_assign(ctx, instr, num); +} + +/* flatten into shader: */ +// XXX this should probably be somewhere else: +static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_instruction *n; + struct ir3_shader *shader = block->shader; + struct ir3_instruction *end = + ir3_instr_create(block, 0, OPC_END); + struct ir3_instruction *last_input = NULL; + regmask_t needs_ss; + regmask_t needs_sy; + + regmask_init(&needs_ss); + regmask_init(&needs_sy); + + shader->instrs_count = 0; + + for (n = block->head; n; n = n->next) { + unsigned i; + + if (is_meta(n)) + continue; + + for (i = 1; i < n->regs_count; i++) { + struct ir3_register *reg = n->regs[i]; + + if (is_gpr(reg)) { + + /* TODO: we probably only need (ss) for alu + * instr 
consuming sfu result.. need to make + * some tests for both this and (sy).. + */ + if (regmask_get(&needs_ss, reg)) { + n->flags |= IR3_INSTR_SS; + regmask_init(&needs_ss); + } + + if (regmask_get(&needs_sy, reg)) { + n->flags |= IR3_INSTR_SY; + regmask_init(&needs_sy); + } + } + } + + shader->instrs[shader->instrs_count++] = n; + + if (is_sfu(n)) + regmask_set(&needs_ss, n->regs[0]); + if (is_tex(n)) + regmask_set(&needs_sy, n->regs[0]); + if (is_input(n)) + last_input = n; + } + + if (last_input) + last_input->regs[0]->flags |= IR3_REG_EI; + + shader->instrs[shader->instrs_count++] = end; + + shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; +} + +static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) +{ + struct ir3_instruction *n; + + if (!block->parent) { + unsigned i; + int base, off = output_base(ctx); + + base = alloc_block(ctx, NULL, block->noutputs + off); + + for (i = 0; i < block->noutputs; i++) + if (block->outputs[i]) + ra_assign(ctx, block->outputs[i], base + i + off); + + if (ctx->type == SHADER_FRAGMENT) { + for (i = 0; i < block->ninputs; i++) + if (block->inputs[i]) + ra_assign(ctx, block->inputs[i], base + i); + } else { + for (i = 0; i < block->ninputs; i++) + if (block->inputs[i]) + ir3_instr_ra(ctx, block->inputs[i]); + } + } + + /* then loop over instruction list and assign registers: + */ + n = block->head; + while (n) { + ir3_instr_ra(ctx, n); + if (ctx->error) + return -1; + n = n->next; + } + + legalize(ctx, block); + + return 0; +} + +int ir3_block_ra(struct ir3_block *block, enum shader_t type) +{ + struct ir3_ra_ctx ctx = { + .block = block, + .type = type, + }; + ir3_shader_clear_mark(block->shader); + return block_ra(&ctx, block); +} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_sched.c b/src/gallium/drivers/freedreno/a3xx/ir3_sched.c new file mode 100644 index 00000000000..5ac33abc548 --- /dev/null +++ b/src/gallium/drivers/freedreno/a3xx/ir3_sched.c @@ -0,0 +1,289 @@ +/* -*- mode: C; c-file-style: "k&r"; 
tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + + +#include "util/u_math.h" + +#include "ir3.h" + +/* + * Instruction Scheduling: + * + * Using the depth sorted list from depth pass, attempt to recursively + * schedule deepest unscheduled path. The first instruction that cannot + * be scheduled, returns the required delay slots it needs, at which + * point we return back up to the top and attempt to schedule by next + * highest depth. After a sufficient number of instructions have been + * scheduled, return back to beginning of list and start again. If you + * reach the end of depth sorted list without being able to insert any + * instruction, insert nop's. Repeat until no more unscheduled + * instructions. 
+ */ + +struct ir3_sched_ctx { + struct ir3_instruction *scheduled; + unsigned cnt; +}; + +static struct ir3_instruction * +deepest(struct ir3_instruction **srcs, unsigned nsrcs) +{ + struct ir3_instruction *d = NULL; + unsigned i = 0, id = 0; + + while ((i < nsrcs) && !(d = srcs[id = i])) + i++; + + if (!d) + return NULL; + + for (; i < nsrcs; i++) + if (srcs[i] && (srcs[i]->depth > d->depth)) + d = srcs[id = i]; + + srcs[id] = NULL; + + return d; +} + +static unsigned distance(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr, unsigned maxd) +{ + struct ir3_instruction *n = ctx->scheduled; + unsigned d = 0; + while (n && (n != instr) && (d < maxd)) { + if (!is_meta(n)) + d++; + n = n->next; + } + return d; +} + +/* TODO maybe we want double linked list? */ +static struct ir3_instruction * prev(struct ir3_instruction *instr) +{ + struct ir3_instruction *p = instr->block->head; + while (p && (p->next != instr)) + p = p->next; + return p; +} + +static void schedule(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr, bool remove) +{ + struct ir3_block *block = instr->block; + + /* maybe there is a better way to handle this than just stuffing + * a nop.. ideally we'd know about this constraint in the + * scheduling and depth calculation.. + */ + if (ctx->scheduled && is_sfu(ctx->scheduled) && is_sfu(instr)) + schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false); + + /* remove from depth list: + */ + if (remove) { + struct ir3_instruction *p = prev(instr); + + /* NOTE: this can happen for inputs which are not + * read.. in that case there is no need to schedule + * the input, so just bail: + */ + if (instr != (p ? p->next : block->head)) + return; + + if (p) + p->next = instr->next; + else + block->head = instr->next; + } + + instr->flags |= IR3_INSTR_MARK; + + instr->next = ctx->scheduled; + ctx->scheduled = instr; + + ctx->cnt++; +} + +/* + * Delay-slot calculation. Follows fanin/fanout. 
+ */ + +static unsigned delay_calc2(struct ir3_sched_ctx *ctx, + struct ir3_instruction *assigner, + struct ir3_instruction *consumer, unsigned srcn) +{ + unsigned delay = 0; + + if (is_meta(assigner)) { + unsigned i; + for (i = 1; i < assigner->regs_count; i++) { + struct ir3_register *reg = assigner->regs[i]; + if (reg->flags & IR3_REG_SSA) { + unsigned d = delay_calc2(ctx, reg->instr, + consumer, srcn); + delay = MAX2(delay, d); + } + } + } else { + delay = ir3_delayslots(assigner, consumer, srcn); + delay -= distance(ctx, assigner, delay); + } + + return delay; +} + +static unsigned delay_calc(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i, delay = 0; + + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) { + unsigned d = delay_calc2(ctx, reg->instr, + instr, i - 1); + delay = MAX2(delay, d); + } + } + + return delay; +} + +/* A negative return value signals that an instruction has been newly + * scheduled, return back up to the top of the stack (to block_sched()) + */ +static int trysched(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr) +{ + struct ir3_instruction *srcs[ARRAY_SIZE(instr->regs) - 1]; + struct ir3_instruction *src; + unsigned i, delay, nsrcs = 0; + + /* if already scheduled: */ + if (instr->flags & IR3_INSTR_MARK) + return 0; + + /* figure out our src's: */ + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) + srcs[nsrcs++] = reg->instr; + } + + /* for each src register in sorted order: + */ + delay = 0; + while ((src = deepest(srcs, nsrcs))) { + delay = trysched(ctx, src); + if (delay) + return delay; + } + + /* all our dependents are scheduled, figure out if + * we have enough delay slots to schedule ourself: + */ + delay = delay_calc(ctx, instr); + + if (!delay) { + schedule(ctx, instr, true); + return -1; + } + + return delay; +} + +static struct ir3_instruction * 
reverse(struct ir3_instruction *instr) +{ + struct ir3_instruction *reversed = NULL; + while (instr) { + struct ir3_instruction *next = instr->next; + instr->next = reversed; + reversed = instr; + instr = next; + } + return reversed; +} + +static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block) +{ + struct ir3_instruction *instr; + + /* schedule all the shader input's (meta-instr) first so that + * the RA step sees that the input registers contain a value + * from the start of the shader: + */ + if (!block->parent) { + unsigned i; + for (i = 0; i < block->ninputs; i++) { + struct ir3_instruction *in = block->inputs[i]; + if (in) + schedule(ctx, in, true); + } + } + + while ((instr = block->head)) { + /* NOTE: always grab next *before* trysched(), in case the + * instruction is actually scheduled (and therefore moved + * from depth list into scheduled list) + */ + struct ir3_instruction *next = instr->next; + int cnt = trysched(ctx, instr); + /* -1 is signal to return up stack, but to us means same as 0: */ + cnt = MAX2(0, cnt); + cnt += ctx->cnt; + instr = next; + + /* if deepest remaining instruction cannot be scheduled, try + * the increasingly more shallow instructions until needed + * number of delay slots is filled: + */ + while (instr && (cnt > ctx->cnt)) { + next = instr->next; + trysched(ctx, instr); + instr = next; + } + + /* and if we run out of instructions that can be scheduled, + * then it is time for nop's: + */ + while (cnt > ctx->cnt) + schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false); + } + + /* at this point, scheduled list is in reverse order, so fix that: */ + block->head = reverse(ctx->scheduled); +} + +void ir3_block_sched(struct ir3_block *block) +{ + struct ir3_sched_ctx ctx = {0}; + ir3_shader_clear_mark(block->shader); + block_sched(&ctx, block); +} diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_visitor.h b/src/gallium/drivers/freedreno/a3xx/ir3_visitor.h new file mode 100644 index 00000000000..1c60d1620ca 
--- /dev/null +++ b/src/gallium/drivers/freedreno/a3xx/ir3_visitor.h @@ -0,0 +1,154 @@ +/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ + +/* + * Copyright (C) 2014 Rob Clark <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark <[email protected]> + */ + +#ifndef IR3_VISITOR_H_ +#define IR3_VISITOR_H_ + +/** + * Visitor which follows dst to src relationships between instructions, + * first visiting the dst (writer) instruction, followed by src (reader) + * instruction(s). + * + * TODO maybe we want multiple different visitors to walk the + * graph in different ways? 
+ */ + +struct ir3_visitor; + +/* Callback type taking the visitor and an instruction. */ +typedef void (*ir3_visit_instr_func)(struct ir3_visitor *v, + struct ir3_instruction *instr); + +/* Callback type taking the visitor, an instruction and a register. */ +typedef void (*ir3_visit_reg_func)(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg); + +/* + * Callback table used by visit_instr_dst() and visit_instr_src(). Those + * helpers call the selected dst or src entry without a NULL check. + */ +struct ir3_visitor_funcs { + /* not called by any helper in this header: */ + ir3_visit_instr_func instr; // TODO do we need?? + + /* selected by visit_instr_dst(): */ + ir3_visit_reg_func dst_shader_input; + ir3_visit_reg_func dst_block_input; + ir3_visit_reg_func dst_fanout; + ir3_visit_reg_func dst_fanin; + ir3_visit_reg_func dst; + + /* selected by visit_instr_src(): */ + ir3_visit_reg_func src_block_input; + ir3_visit_reg_func src_fanout; + ir3_visit_reg_func src_fanin; + ir3_visit_reg_func src; +}; + +/* + * Visitor state: the callback table plus an error flag. ir3_visit_instr() + * stops walking the following instructions once error is set. + */ +struct ir3_visitor { + const struct ir3_visitor_funcs *funcs; + bool error; +}; + +#include "util/u_debug.h" + +/* + * Invoke the dst callback for instr->regs[0]. For meta instructions the + * callback is chosen by opcode: OPC_META_INPUT uses dst_shader_input when + * regs_count is 1 and dst_block_input otherwise, OPC_META_FO uses + * dst_fanout and OPC_META_FI uses dst_fanin. Everything else uses dst. + * regs[0] is read unconditionally; ir3_visit_instr() only calls this + * helper when regs_count > 0. + */ +static void visit_instr_dst(struct ir3_visitor *v, + struct ir3_instruction *instr) +{ + struct ir3_register *reg = instr->regs[0]; + + if (is_meta(instr)) { + switch (instr->opc) { + case OPC_META_INPUT: + if (instr->regs_count == 1) + v->funcs->dst_shader_input(v, instr, reg); + else + v->funcs->dst_block_input(v, instr, reg); + return; + case OPC_META_FO: + v->funcs->dst_fanout(v, instr, reg); + return; + case OPC_META_FI: + v->funcs->dst_fanin(v, instr, reg); + return; + default: + break; + + } + } + + v->funcs->dst(v, instr, reg); +} + +/* + * Invoke the src callback for reg on instr. For meta instructions the + * callback is chosen by opcode: OPC_META_INPUT uses src_block_input (and + * debug-asserts regs_count == 2), OPC_META_FO uses src_fanout and + * OPC_META_FI uses src_fanin. Everything else uses src. + */ +static void visit_instr_src(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + if (is_meta(instr)) { + switch (instr->opc) { + case OPC_META_INPUT: + /* shader-input does not have a src, only block input: */ + debug_assert(instr->regs_count == 2); + v->funcs->src_block_input(v, instr, reg); + return; + case OPC_META_FO: + v->funcs->src_fanout(v, instr, reg); + return; + case OPC_META_FI: + v->funcs->src_fanin(v, instr, reg); + return; + default: + break; + + } + } + + v->funcs->src(v, instr, reg); +} + +/* + * Visit instr through the dst callbacks (when regs_count > 0), then walk + * the instructions reachable via instr->next and visit every register at + * index >= 1 that has IR3_REG_SSA set and whose ->instr is instr. The + * walk stops at the end of the list or once v->error is set. + */ +static void ir3_visit_instr(struct ir3_visitor *v, + struct ir3_instruction *instr) +{ + struct ir3_instruction *n; + + /* visit instruction that assigns 
value: */ + if (instr->regs_count > 0) + visit_instr_dst(v, instr); + + /* and then any following instructions which read that value: */ + n = instr->next; + while (n && !v->error) { + unsigned i; + + /* regs[0] is handled as the dst by visit_instr_dst(); only + * entries from index 1 on are checked here: + */ + for (i = 1; i < n->regs_count; i++) { + struct ir3_register *reg = n->regs[i]; + if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr)) + visit_instr_src(v, n, reg); + } + + n = n->next; + } +} + +/* + * Callback matching ir3_visit_reg_func that does nothing; presumably a + * default for ir3_visitor_funcs entries a visitor does not need (TODO + * confirm against callers). + */ +static void ir3_visit_reg(struct ir3_visitor *v, + struct ir3_instruction *instr, struct ir3_register *reg) +{ + /* no-op */ +} + +#endif /* IR3_VISITOR_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index e32f97039b7..f6affd6c417 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -68,6 +68,8 @@ static const struct debug_named_value debug_options[] = { {"binning", FD_DBG_BINNING, "Enable hw binning"}, {"dbinning", FD_DBG_DBINNING, "Disable hw binning"}, {"optimize", FD_DBG_OPTIMIZE, "Enable optimization passes in compiler"}, + {"optmsgs", FD_DBG_OPTMSGS, "Enable optimizater debug messages"}, /* NOTE(review): "optimizater" looks like a typo for "optimizer" */ + {"optdump", FD_DBG_OPTDUMP, "Dump shader DAG to .dot files"}, DEBUG_NAMED_VALUE_END }; diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index c1262087f75..43026c55754 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -64,6 +64,8 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); #define FD_DBG_BINNING 0x0100 #define FD_DBG_DBINNING 0x0200 #define FD_DBG_OPTIMIZE 0x0400 +#define FD_DBG_OPTMSGS 0x0800 +#define FD_DBG_OPTDUMP 0x1000 extern int fd_mesa_debug; extern bool fd_binning_enabled; |