diff options
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3.h | 22 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_compiler.c | 99 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_cp.c | 2 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_dump.c | 16 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/ir3/ir3_ra.c | 147 |
5 files changed, 244 insertions, 42 deletions
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 30932854884..430bcf22d6f 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -205,6 +205,9 @@ struct ir3_instruction { int off; /* component/offset */ } fo; struct { + int aid; + } fi; + struct { struct ir3_block *if_block, *else_block; } flow; struct { @@ -264,6 +267,19 @@ struct ir3_instruction { */ struct ir3_instruction *address; + /* in case of a instruction with relative dst instruction, we need to + * capture the dependency on the fanin for the previous values of + * the array elements. Since we don't know at compile time actually + * which array elements are written, this serves to preserve the + * unconditional write to array elements prior to the conditional + * write. + * + * TODO only cat1 can do indirect write.. we could maybe move this + * into instr->cat1.fanin (but would require the frontend to insert + * the extra mov) + */ + struct ir3_instruction *fanin; + struct ir3_instruction *next; #ifdef DEBUG uint32_t serialno; @@ -373,6 +389,8 @@ static inline int ir3_instr_regno(struct ir3_instruction *instr, } +#define MAX_ARRAYS 16 + /* comp: * 0 - x * 1 - y @@ -498,6 +516,8 @@ static inline bool reg_gpr(struct ir3_register *r) static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) { + if (instr->fanin) + return instr->regs_count + 2; if (instr->address) return instr->regs_count + 1; return instr->regs_count; @@ -505,6 +525,8 @@ static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr) static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n) { + if (n == (instr->regs_count + 1)) + return instr->fanin; if (n == (instr->regs_count + 0)) return instr->address; return ssa(instr->regs[n]); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c index d755babf31b..df428ebed7e 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c @@ -64,7 +64,7 @@ struct ir3_compile_context { */ struct { struct ir3_instruction *instr, **instrp; - } output_updates[16]; + } output_updates[64]; unsigned num_output_updates; /* are we in a sequence of "atomic" instructions? @@ -97,7 +97,7 @@ struct ir3_compile_context { struct { unsigned first, last; struct ir3_instruction *fanin; - } array[16]; + } array[MAX_ARRAYS]; uint32_t array_dirty; /* offset into array[], per file, of first array info */ uint8_t array_offsets[TGSI_FILE_COUNT]; @@ -247,10 +247,6 @@ compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so, memset(ctx->array_offsets, 0, sizeof(ctx->array_offsets)); #define FM(x) (1 << TGSI_FILE_##x) - /* optimize can't deal with relative addressing: */ - if (info->indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT))) - return TGSI_PARSE_ERROR; - /* NOTE: if relative addressing is used, we set constlen in * the compiler (to worst-case value) since we don't know in * the assembler what the max addr reg value can be: @@ -595,6 +591,16 @@ ssa_instr_get(struct ir3_compile_context *ctx, unsigned file, unsigned n) return instr; } +static int dst_array_id(struct ir3_compile_context *ctx, + const struct tgsi_dst_register *dst) +{ + // XXX complete hack to recover tgsi_full_dst_register... + // nothing that isn't wrapped in a tgsi_full_dst_register + // should be indirect + const struct tgsi_full_dst_register *fdst = (const void *)dst; + return fdst->Indirect.ArrayID + ctx->array_offsets[dst->File]; +} + static int src_array_id(struct ir3_compile_context *ctx, const struct tgsi_src_register *src) { @@ -639,7 +645,56 @@ static void ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr, const struct tgsi_dst_register *dst, unsigned chan) { - ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr); + if (dst->Indirect) { + struct ir3_register *reg = instr->regs[0]; + unsigned i, aid = dst_array_id(ctx, dst); + unsigned first = ctx->array[aid].first; + unsigned last = ctx->array[aid].last; + unsigned off = dst->Index - first; /* vec4 offset */ + + reg->size = 4 * (1 + last - first); + reg->offset = regid(off, chan); + + instr->fanin = array_fanin(ctx, aid, dst->File); + + /* annotate with the array-id, to help out the register- + * assignment stage. At least for the case of indirect + * writes, we should capture enough dependencies to + * preserve the order of reads/writes of the array, so + * the multiple "names" for the array should end up all + * assigned to the same registers. + */ + instr->fanin->fi.aid = aid; + + /* Since we are scalarizing vec4 tgsi instructions/regs, we + * run into a slight complication here. To do the naive thing + * and setup a fanout for each scalar array element would end + * up with the result that the instructions generated for each + * component of the vec4 would end up clobbering each other. + * So we take advantage here of knowing that the array index + * (after the shl.b) will be a multiple of four, and only set + * every fourth scalar component in the array. See also + * fixup_ssa_dst_array() + */ + for (i = first; i <= last; i++) { + struct ir3_instruction *split; + unsigned n = regid(i, chan); + int off = (4 * (i - first)) + chan; + + if (is_meta(instr) && (instr->opc == OPC_META_FO)) + off -= instr->fo.off; + + split = ir3_instr_create(ctx->block, -1, OPC_META_FO); + split->fo.off = off; + ir3_reg_create(split, 0, 0); + ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr; + + ssa_instr_set(ctx, dst->File, n, split); + } + } else { + /* normal case (not relative addressed GPR) */ + ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr); + } } static void @@ -705,12 +760,22 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx, break; } - if (dst->Indirect) + if (dst->Indirect) { flags |= IR3_REG_RELATIV; - reg = ir3_reg_create(instr, regid(num, chan), flags); + /* shouldn't happen, and we can't cope with it below: */ + compile_assert(ctx, wrmask == 0x1); + + compile_assert(ctx, ctx->block->address); + if (instr->address) + compile_assert(ctx, ctx->block->address == instr->address); + + instr->address = ctx->block->address; + } + reg = ir3_reg_create(instr, regid(num, chan), flags); reg->wrmask = wrmask; + if (wrmask == 0x1) { /* normal case */ ssa_dst(ctx, instr, dst, chan); @@ -720,6 +785,8 @@ add_dst_reg_wrmask(struct ir3_compile_context *ctx, struct ir3_instruction *prev = NULL; unsigned i; + compile_assert(ctx, !dst->Indirect); + /* if instruction writes multiple, we need to create * some place-holder collect the registers: */ @@ -2539,10 +2606,16 @@ instr_cat1(const struct instr_translater *t, struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst) { - struct tgsi_dst_register *dst = get_dst(ctx, inst); + struct tgsi_dst_register *dst = &inst->Dst[0].Register; struct tgsi_src_register *src = &inst->Src[0].Register; + + /* NOTE: atomic start/end, rather than in create_mov() since + * create_mov() is used already w/in atomic sequences (and + * we aren't clever enough to deal with the nesting) + */ + instr_atomic_start(ctx); create_mov(ctx, dst, src); - put_dst(ctx, inst, dst); + instr_atomic_end(ctx); } static void @@ -3322,6 +3395,10 @@ ir3_compile_shader(struct ir3_shader_variant *so, goto out; } + /* for now, until the edge cases are worked out: */ + if (ctx.info.indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT))) + cp = false; + compile_instructions(&ctx); block = ctx.block; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index 898ed70abeb..8ad1894a324 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -49,7 +49,7 @@ static bool is_eligible_mov(struct ir3_instruction *instr) struct ir3_register *dst = instr->regs[0]; struct ir3_register *src = instr->regs[1]; struct ir3_instruction *src_instr = ssa(src); - if (dst->flags & IR3_REG_ADDR) + if (dst->flags & (IR3_REG_ADDR | IR3_REG_RELATIV)) return false; /* TODO: propagate abs/neg modifiers if possible */ if (src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV)) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c index a846777b879..dc251653fa5 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_dump.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_dump.c @@ -414,8 +414,20 @@ ir3_dump_instr_single(struct ir3_instruction *instr) fprintf(ctx.f, "]"); } - if (is_meta(instr) && (instr->opc == OPC_META_FO)) - printf(", off=%d", instr->fo.off); + if (instr->fanin) { + fprintf(ctx.f, ", fanin=_"); + fprintf(ctx.f, "["); + dump_instr_name(&ctx, instr->fanin); + fprintf(ctx.f, "]"); + } + + if (is_meta(instr)) { + if (instr->opc == OPC_META_FO) { + printf(", off=%d", instr->fo.off); + } else if ((instr->opc == OPC_META_FI) && instr->fi.aid) { + printf(", aid=%d", instr->fi.aid); + } + } printf("\n"); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c index 0f6d40f5a7c..a4235a77a15 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c @@ -47,6 +47,12 @@ * I'm not really sure a sane way for the CP stage to realize when it * cannot remove a mov due to multi-register constraints.. * + * NOTE: http://scopesconf.org/scopes-01/paper/session1_2.ps.gz has + * some ideas to handle array allocation with a more conventional + * graph coloring algorithm for register assignment, which might be + * a good alternative to the current algo. However afaict it cannot + * handle overlapping arrays, which is a scenario that we have to + * deal with */ struct ir3_ra_ctx { @@ -56,6 +62,10 @@ struct ir3_ra_ctx { bool frag_face; int cnt; bool error; + struct { + unsigned base; + unsigned size; + } arrays[MAX_ARRAYS]; }; #ifdef DEBUG @@ -141,6 +151,26 @@ static bool instr_is_output(struct ir3_instruction *instr) return false; } +static void mark_sources(struct ir3_instruction *instr, + struct ir3_instruction *n, regmask_t *liveregs, regmask_t *written) +{ + unsigned i; + + for (i = 1; i < n->regs_count; i++) { + struct ir3_register *r = reg_check(n, i); + if (r) + regmask_set_if_not(liveregs, r, written); + + /* if any src points back to the instruction(s) in + * the block of neighbors that we are assigning then + * mark any written (clobbered) registers as live: + */ + if (instr_used_by(instr, n->regs[i])) + regmask_or(liveregs, liveregs, written); + } + +} + /* live means read before written */ static void compute_liveregs(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, regmask_t *liveregs) @@ -159,18 +189,13 @@ static void compute_liveregs(struct ir3_ra_ctx *ctx, continue; /* check first src's read: */ - for (i = 1; i < n->regs_count; i++) { - r = reg_check(n, i); - if (r) - regmask_set_if_not(liveregs, r, &written); - - /* if any src points back to the instruction(s) in - * the block of neighbors that we are assigning then - * mark any written (clobbered) registers as live: - */ - if (instr_used_by(instr, n->regs[i])) - regmask_or(liveregs, liveregs, &written); - } + mark_sources(instr, n, liveregs, &written); + + /* for instructions that write to an array, we need to + * capture the dependency on the array elements: + */ + if (n->fanin) + mark_sources(instr, n->fanin, liveregs, &written); /* meta-instructions don't actually get scheduled, * so don't let it's write confuse us.. what we @@ -383,14 +408,32 @@ static void instr_assign_src(struct ir3_ra_ctx *ctx, } } -static void instr_assign(struct ir3_ra_ctx *ctx, +static void instr_assign_srcs(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, unsigned name) { struct ir3_instruction *n, *src; + + for (n = instr->next; n && !ctx->error; n = n->next) { + foreach_ssa_src_n(src, i, n) { + unsigned r = i + 1; + + /* skip address / etc (non real sources): */ + if (r >= n->regs_count) + continue; + + if (src == instr) + instr_assign_src(ctx, n, r, name); + } + } +} + +static void instr_assign(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr, unsigned name) +{ struct ir3_register *reg = instr->regs[0]; - if ((reg->flags & IR3_REG_RELATIV)) - name += reg->offset; + if (reg->flags & IR3_REG_RELATIV) + return; /* check if already assigned: */ if (!(reg->flags & IR3_REG_SSA)) { @@ -403,18 +446,7 @@ static void instr_assign(struct ir3_ra_ctx *ctx, reg_assign(instr, 0, name); /* and rename any subsequent use of result of this instr: */ - for (n = instr->next; n && !ctx->error; n = n->next) { - foreach_ssa_src_n(src, i, n) { - unsigned r = i + 1; - - /* skip address / etc (non real sources): */ - if (r >= n->regs_count) - continue; - - if (src == instr) - instr_assign_src(ctx, n, r, name); - } - } + instr_assign_srcs(ctx, instr, name); /* To simplify the neighbor logic, and to "avoid" dealing with * instructions which write more than one output, we actually @@ -423,6 +455,8 @@ static void instr_assign(struct ir3_ra_ctx *ctx, * to the actual instruction: */ if (is_meta(instr) && (instr->opc == OPC_META_FO)) { + struct ir3_instruction *src; + debug_assert(name >= instr->fo.off); foreach_ssa_src(src, instr) @@ -444,7 +478,8 @@ static int check_partial_assignment(struct ir3_ra_ctx *ctx, for (n = instr; n; n = n->cp.right) { struct ir3_register *dst = n->regs[0]; - if (!(dst->flags & IR3_REG_SSA)) { + if ((n->depth != DEPTH_UNUSED) && + !(dst->flags & IR3_REG_SSA)) { int name = dst->num - off; debug_assert(name >= 0); return name; @@ -472,6 +507,23 @@ static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx, dst = instr->regs[0]; + /* For indirect dst, take the register assignment from the + * fanin and propagate it forward. + */ + if (dst->flags & IR3_REG_RELATIV) { + /* NOTE can be grouped, if for example outputs: + * for now disable cp if indirect writes + */ + instr_alloc_and_assign(ctx, instr->fanin); + + dst->num += instr->fanin->regs[0]->num; + dst->flags &= ~IR3_REG_SSA; + + instr_assign_srcs(ctx, instr, instr->fanin->regs[0]->num); + + return; + } + /* for instructions w/ fanouts, do the actual register assignment * on the group of fanout neighbor nodes and propagate the reg * name back up to the texture instruction. @@ -510,6 +562,33 @@ static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx, } } +static void instr_assign_array(struct ir3_ra_ctx *ctx, + struct ir3_instruction *instr) +{ + struct ir3_instruction *src; + int name, aid = instr->fi.aid; + + if (ctx->arrays[aid].base == ~0) { + int size = instr->regs_count - 1; + ctx->arrays[aid].base = alloc_block(ctx, instr, size); + ctx->arrays[aid].size = size; + } + + name = ctx->arrays[aid].base; + + foreach_ssa_src_n(src, i, instr) { + unsigned r = i + 1; + + /* skip address / etc (non real sources): */ + if (r >= instr->regs_count) + break; + + instr_assign(ctx, src, name); + name++; + } + +} + static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) { struct ir3_instruction *n; @@ -531,6 +610,16 @@ static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block) ra_dump_list("-------\n", block->head); + /* first pass, assign arrays: */ + for (n = block->head; n && !ctx->error; n = n->next) { + if (is_meta(n) && (n->opc == OPC_META_FI) && n->fi.aid) { + debug_assert(!n->cp.left); /* don't think this should happen */ + ra_dump_instr("ASSIGN ARRAY: ", n); + instr_assign_array(ctx, n); + ra_dump_list("-------\n", block->head); + } + } + for (n = block->head; n && !ctx->error; n = n->next) { ra_dump_instr("ASSIGN: ", n); instr_alloc_and_assign(ctx, ir3_neighbor_first(n)); @@ -552,6 +641,8 @@ int ir3_block_ra(struct ir3_block *block, enum shader_t type, }; int ret; + memset(&ctx.arrays, ~0, sizeof(ctx.arrays)); + /* mark dst registers w/ SSA flag so we can see which * have been assigned so far: * NOTE: we really should set SSA flag consistently on |