diff options
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_compiler.c | 86 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/instr-a3xx.h | 12 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/ir3.h | 19 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/ir3_cp.c | 5 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/ir3_depth.c | 3 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/ir3_dump.c | 10 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/ir3_ra.c | 55 | ||||
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/ir3_sched.c | 81 |
8 files changed, 203 insertions, 68 deletions
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c index cee446a9fa8..1138ec9be34 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c @@ -192,8 +192,7 @@ compile_init(struct fd3_compile_context *ctx, struct fd3_shader_variant *so, #define FM(x) (1 << TGSI_FILE_##x) /* optimize can't deal with relative addressing: */ - if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | - FM(OUTPUT) | FM(IMMEDIATE) | FM(CONSTANT))) + if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT))) return TGSI_PARSE_ERROR; /* Immediates go after constants: */ @@ -414,14 +413,7 @@ block_temporary(struct ir3_block *block, unsigned n) static struct ir3_instruction * create_immed(struct fd3_compile_context *ctx, float val) { - /* this can happen when registers (or components of a TGSI - * register) are used as src before they have been assigned - * (undefined contents). To avoid confusing the rest of the - * compiler, and to generally keep things peachy, substitute - * an instruction that sets the src to 0.0. Or to keep - * things undefined, I could plug in a random number? :-P - * - * NOTE: *don't* use instr_create() here! + /* NOTE: *don't* use instr_create() here! 
*/ struct ir3_instruction *instr; instr = ir3_instr_create(ctx->block, 1, 0); @@ -464,6 +456,12 @@ ssa_dst(struct fd3_compile_context *ctx, struct ir3_instruction *instr, ctx->output_updates[idx].instr = instr; ctx->num_output_updates++; break; + case TGSI_FILE_ADDRESS: + compile_assert(ctx, n < 1); + ctx->output_updates[idx].instrp = &ctx->block->address; + ctx->output_updates[idx].instr = instr; + ctx->num_output_updates++; + break; } } @@ -526,7 +524,8 @@ add_dst_reg_wrmask(struct fd3_compile_context *ctx, /* uses SSA */ break; case TGSI_FILE_ADDRESS: - num = REG_A0; + flags |= IR3_REG_ADDR; + /* uses SSA */ break; default: compile_error(ctx, "unsupported dst register file: %s\n", @@ -553,7 +552,8 @@ add_dst_reg_wrmask(struct fd3_compile_context *ctx, if (!ctx->atomic) ssa_dst(ctx, instr, dst, chan); } else if ((dst->File == TGSI_FILE_TEMPORARY) || - (dst->File == TGSI_FILE_OUTPUT)) { + (dst->File == TGSI_FILE_OUTPUT) || + (dst->File == TGSI_FILE_ADDRESS)) { unsigned i; /* if instruction writes multiple, we need to create @@ -591,6 +591,7 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx, { unsigned flags = 0, num = 0; struct ir3_register *reg; + struct ir3_instruction *orig = NULL; /* TODO we need to use a mov to temp for const >= 64.. or maybe * we could use relative addressing.. 
@@ -628,9 +629,21 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx, flags |= IR3_REG_ABS; if (src->Negate) flags |= IR3_REG_NEGATE; - if (src->Indirect) + + if (src->Indirect) { flags |= IR3_REG_RELATIV; + /* shouldn't happen, and we can't cope with it below: */ + compile_assert(ctx, wrmask == 0x1); + + /* wrap in a meta-deref to track both the src and address: */ + orig = instr; + + instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF); + ir3_reg_create(instr, 0, 0); + ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address; + } + reg = ir3_reg_create(instr, regid(num, chan), flags); reg->wrmask = wrmask; @@ -643,6 +656,8 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx, struct ir3_instruction *collect; unsigned i; + compile_assert(ctx, !src->Indirect); + /* if instruction reads multiple, we need to create * some place-holder collect the registers: */ @@ -666,6 +681,10 @@ add_src_reg_wrmask(struct fd3_compile_context *ctx, reg->instr = collect; } + if (src->Indirect) { + reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA); + reg->instr = instr; + } return reg; } @@ -718,36 +737,6 @@ get_internal_temp(struct fd3_compile_context *ctx, return tmp_src; } -/* Get internal half-precision temp src/dst to use for a sequence of - * instructions generated by a single TGSI op. 
- */ -static struct tgsi_src_register * -get_internal_temp_hr(struct fd3_compile_context *ctx, - struct tgsi_dst_register *tmp_dst) -{ - struct tgsi_src_register *tmp_src; - int n; - - tmp_dst->File = TGSI_FILE_TEMPORARY; - tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW; - tmp_dst->Indirect = 0; - tmp_dst->Dimension = 0; - - /* assign next temporary: */ - n = ctx->num_internal_temps++; - compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps)); - tmp_src = &ctx->internal_temps[n]; - - /* just use hr0 because no one else should be using half- - * precision regs: - */ - tmp_dst->Index = 0; - - src_from_dst(tmp_src, tmp_dst); - - return tmp_src; -} - static inline bool is_const(struct tgsi_src_register *src) { @@ -1049,11 +1038,18 @@ trans_arl(const struct instr_translater *t, struct tgsi_dst_register *dst = &inst->Dst[0].Register; struct tgsi_src_register *src = &inst->Src[0].Register; unsigned chan = src->SwizzleX; + compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS); - tmp_src = get_internal_temp_hr(ctx, &tmp_dst); + /* NOTE: we allocate a temporary from a flat register + * namespace (ignoring half vs full). It turns out + * not to really matter since registers get reassigned + * later in ir3_ra which (hopefully!) can deal a bit + * better with mixed half and full precision. 
+ */ + tmp_src = get_internal_temp(ctx, &tmp_dst); - /* cov.{f32,f16}s16 Rtmp, Rsrc */ + /* cov.f{32,16}s16 Rtmp, Rsrc */ instr = instr_create(ctx, 1, 0); instr->cat1.src_type = get_ftype(ctx); instr->cat1.dst_type = TYPE_S16; diff --git a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h b/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h index a79998ef56f..c67f1037ced 100644 --- a/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h +++ b/src/gallium/drivers/freedreno/a3xx/instr-a3xx.h @@ -204,6 +204,8 @@ typedef enum { /* branches/flow control */ OPC_META_FLOW = 4, OPC_META_PHI = 5, + /* relative addressing */ + OPC_META_DEREF = 6, } opc_t; @@ -244,6 +246,16 @@ static inline int type_float(type_t type) return (type == TYPE_F32) || (type == TYPE_F16); } +static inline int type_uint(type_t type) +{ + return (type == TYPE_U32) || (type == TYPE_U16) || (type == TYPE_U8); +} + +static inline int type_sint(type_t type) +{ + return (type == TYPE_S32) || (type == TYPE_S16) || (type == TYPE_S8); +} + typedef union PACKED { /* normal gpr or const src register: */ struct PACKED { diff --git a/src/gallium/drivers/freedreno/a3xx/ir3.h b/src/gallium/drivers/freedreno/a3xx/ir3.h index 872f47883bb..9ec05da6ae4 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3.h +++ b/src/gallium/drivers/freedreno/a3xx/ir3.h @@ -70,6 +70,7 @@ struct ir3_register { */ IR3_REG_SSA = 0x1000, /* 'instr' is ptr to assigning instr */ IR3_REG_IA = 0x2000, /* meta-input dst is "assigned" */ + IR3_REG_ADDR = 0x4000, /* register is a0.x */ } flags; union { /* normal registers: @@ -232,6 +233,8 @@ struct ir3_block { struct ir3_instruction **temporaries; struct ir3_instruction **inputs; struct ir3_instruction **outputs; + /* only a single address register: */ + struct ir3_instruction *address; struct ir3_block *parent; struct ir3_instruction *head; }; @@ -351,10 +354,24 @@ static inline bool is_meta(struct ir3_instruction *instr) return (instr->category == -1); } +static inline bool is_deref(struct ir3_instruction 
*instr) +{ + return is_meta(instr) && (instr->opc == OPC_META_DEREF); +} + +static inline bool writes_addr(struct ir3_instruction *instr) +{ + if (instr->regs_count > 0) { + struct ir3_register *dst = instr->regs[0]; + return !!(dst->flags & IR3_REG_ADDR); + } + return false; +} + /* TODO combine is_gpr()/reg_gpr().. */ static inline bool reg_gpr(struct ir3_register *r) { - if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA)) + if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_SSA | IR3_REG_ADDR)) return false; if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) return false; diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_cp.c b/src/gallium/drivers/freedreno/a3xx/ir3_cp.c index 81f6c902816..0faed89c25e 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_cp.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_cp.c @@ -43,10 +43,13 @@ static bool is_eligible_mov(struct ir3_instruction *instr) { if ((instr->category == 1) && (instr->cat1.src_type == instr->cat1.dst_type)) { + struct ir3_register *dst = instr->regs[0]; struct ir3_register *src = instr->regs[1]; + if (dst->flags & IR3_REG_ADDR) + return false; if ((src->flags & IR3_REG_SSA) && /* TODO: propagate abs/neg modifiers if possible */ - !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE))) + !(src->flags & (IR3_REG_ABS | IR3_REG_NEGATE | IR3_REG_RELATIV))) return true; } return false; diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c index 1715f1917f0..b84629b2e07 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_depth.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_depth.c @@ -63,6 +63,9 @@ int ir3_delayslots(struct ir3_instruction *assigner, if (is_meta(assigner)) return 0; + if (writes_addr(assigner)) + return 6; + /* handled via sync flags: */ if (is_sfu(assigner) || is_tex(assigner)) return 0; diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_dump.c b/src/gallium/drivers/freedreno/a3xx/ir3_dump.c 
index 3984cd60e6e..a186d62a819 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_dump.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_dump.c @@ -58,6 +58,9 @@ static void dump_instr_name(struct ir3_dump_ctx *ctx, case OPC_META_PHI: fprintf(ctx->f, "Φ"); break; + case OPC_META_DEREF: + fprintf(ctx->f, "(*)"); + break; default: /* shouldn't hit here.. just for debugging: */ switch (instr->opc) { @@ -66,7 +69,6 @@ static void dump_instr_name(struct ir3_dump_ctx *ctx, case OPC_META_FO: fprintf(ctx->f, "_meta:fo"); break; case OPC_META_FI: fprintf(ctx->f, "_meta:fi"); break; case OPC_META_FLOW: fprintf(ctx->f, "_meta:flow"); break; - case OPC_META_PHI: fprintf(ctx->f, "_meta:phi"); break; default: fprintf(ctx->f, "_meta:%d", instr->opc); break; } @@ -162,7 +164,8 @@ static void dump_instr(struct ir3_dump_ctx *ctx, ir3_block_dump(ctx, instr->flow.else_block, "else"); if (reg->flags & IR3_REG_SSA) dump_instr(ctx, reg->instr); - } else if (instr->opc == OPC_META_PHI) { + } else if ((instr->opc == OPC_META_PHI) || + (instr->opc == OPC_META_DEREF)) { /* treat like a normal instruction: */ ir3_instr_dump(ctx, instr); } @@ -228,7 +231,8 @@ static void dump_link2(struct ir3_dump_ctx *ctx, printdef(ctx, defer, "output%lx:<out%u>:w -> %s", PTRID(instr->inout.block), instr->regs[0]->num, target); - } else if (instr->opc == OPC_META_PHI) { + } else if ((instr->opc == OPC_META_PHI) || + (instr->opc == OPC_META_DEREF)) { /* treat like a normal instruction: */ printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target); } diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c index 57c68c729c5..a9a510f3bc2 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_ra.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_ra.c @@ -82,8 +82,8 @@ static struct ir3_ra_assignment ra_calc(struct ir3_instruction *instr); * Register Allocation: */ -#define REG(n, wm) (struct ir3_register){ \ - /*.flags = ((so)->half_precision) ? 
IR3_REG_HALF : 0,*/ \ +#define REG(n, wm, f) (struct ir3_register){ \ + .flags = (f), \ .num = (n), \ .wrmask = TGSI_WRITEMASK_ ## wm, \ } @@ -145,7 +145,7 @@ static void compute_liveregs(struct ir3_ra_ctx *ctx, /* be sure to account for output registers too: */ for (i = 0; i < block->noutputs; i++) { - struct ir3_register reg = REG(output_base(ctx) + i, X); + struct ir3_register reg = REG(output_base(ctx) + i, X, 0); regmask_set_if_not(liveregs, &reg, &written); } } @@ -212,14 +212,15 @@ static bool compute_clobbers(struct ir3_ra_ctx *ctx, return live || was_live; } -static int find_available(regmask_t *liveregs, int size) +static int find_available(regmask_t *liveregs, int size, bool half) { unsigned i; + unsigned f = half ? IR3_REG_HALF : 0; for (i = 0; i < MAX_REG - size; i++) { - if (!regmask_get(liveregs, &REG(i, X))) { + if (!regmask_get(liveregs, &REG(i, X, f))) { unsigned start = i++; for (; (i < MAX_REG) && ((i - start) < size); i++) - if (regmask_get(liveregs, &REG(i, X))) + if (regmask_get(liveregs, &REG(i, X, f))) break; if ((i - start) >= size) return start; @@ -240,7 +241,9 @@ static int alloc_block(struct ir3_ra_ctx *ctx, */ return 0; } else { + struct ir3_register *dst = instr->regs[0]; regmask_t liveregs; + compute_liveregs(ctx, instr, &liveregs); // XXX XXX XXX XXX XXX XXX XXX XXX XXX @@ -257,7 +260,9 @@ static int alloc_block(struct ir3_ra_ctx *ctx, } else // XXX XXX XXX XXX XXX XXX XXX XXX XXX compute_clobbers(ctx, instr->next, instr, &liveregs); - return find_available(&liveregs, size); + + return find_available(&liveregs, size, + !!(dst->flags & IR3_REG_HALF)); } } @@ -547,24 +552,32 @@ static void ra_assign(struct ir3_ra_ctx *ctx, static void ir3_instr_ra(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) { - struct ir3_ra_assignment a; + struct ir3_register *dst; unsigned num; /* skip over nop's */ if (instr->regs_count == 0) return; - /* skip writes to a0, p0, etc */ - if (!reg_gpr(instr->regs[0])) - return; + dst = instr->regs[0]; /* if we've 
already visited this instruction, bail now: */ if (instr->flags & IR3_INSTR_MARK) return; /* allocate register(s): */ - a = ra_calc(instr); - num = alloc_block(ctx, instr, a.num) + a.off; + if (is_deref(instr)) { + num = instr->regs[2]->num; + } else if (reg_gpr(dst)) { + struct ir3_ra_assignment a; + a = ra_calc(instr); + num = alloc_block(ctx, instr, a.num) + a.off; + } else if (dst->flags & IR3_REG_ADDR) { + dst->flags &= ~IR3_REG_ADDR; + num = regid(REG_A0, 0) | REG_HALF; + } else { + assert(0); + } ra_assign(ctx, instr, num); } @@ -578,6 +591,7 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) struct ir3_instruction *end = ir3_instr_create(block, 0, OPC_END); struct ir3_instruction *last_input = NULL; + struct ir3_instruction *last_rel = NULL; regmask_t needs_ss_war; /* write after read */ regmask_t needs_ss; regmask_t needs_sy; @@ -614,6 +628,13 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) regmask_init(&needs_sy); } } + + /* TODO: is it valid to have address reg loaded from a + * relative src (ie. mova a0, c<a0.x+4>)? If so, the + * last_rel check below should be moved ahead of this: + */ + if (reg->flags & IR3_REG_RELATIV) + last_rel = n; } if (n->regs_count > 0) { @@ -622,6 +643,11 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) n->flags |= IR3_INSTR_SS; regmask_init(&needs_ss_war); // ??? I assume? 
} + + if (last_rel && (reg->num == regid(REG_A0, 0))) { + last_rel->flags |= IR3_INSTR_UL; + last_rel = NULL; + } } /* cat5+ does not have an (ss) bit, if needed we need to @@ -685,6 +711,9 @@ static void legalize(struct ir3_ra_ctx *ctx, struct ir3_block *block) if (last_input) last_input->regs[0]->flags |= IR3_REG_EI; + if (last_rel) + last_rel->flags |= IR3_INSTR_UL; + shader->instrs[shader->instrs_count++] = end; shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY; diff --git a/src/gallium/drivers/freedreno/a3xx/ir3_sched.c b/src/gallium/drivers/freedreno/a3xx/ir3_sched.c index 5e585271f92..4fd3da58b46 100644 --- a/src/gallium/drivers/freedreno/a3xx/ir3_sched.c +++ b/src/gallium/drivers/freedreno/a3xx/ir3_sched.c @@ -31,6 +31,11 @@ #include "ir3.h" +enum { + SCHEDULED = -1, + DELAYED = -2, +}; + /* * Instruction Scheduling: * @@ -46,7 +51,8 @@ */ struct ir3_sched_ctx { - struct ir3_instruction *scheduled; + struct ir3_instruction *scheduled; /* last scheduled instr */ + struct ir3_instruction *deref; /* current deref, if any */ unsigned cnt; }; @@ -123,6 +129,11 @@ static void schedule(struct ir3_sched_ctx *ctx, block->head = instr->next; } + if (writes_addr(instr)) { + assert(ctx->deref == NULL); + ctx->deref = instr; + } + instr->flags |= IR3_INSTR_MARK; instr->next = ctx->scheduled; @@ -210,13 +221,19 @@ static int trysched(struct ir3_sched_ctx *ctx, * we have enough delay slots to schedule ourself: */ delay = delay_calc(ctx, instr); + if (delay) + return delay; - if (!delay) { - schedule(ctx, instr, true); - return -1; + /* if this is a write to address register, and addr register + * is currently in use, we need to defer until it is free: + */ + if (writes_addr(instr) && ctx->deref) { + assert(ctx->deref != instr); + return DELAYED; } - return delay; + schedule(ctx, instr, true); + return SCHEDULED; } static struct ir3_instruction * reverse(struct ir3_instruction *instr) @@ -231,6 +248,56 @@ static struct ir3_instruction * reverse(struct ir3_instruction 
*instr) return reversed; } +static bool uses_current_deref(struct ir3_sched_ctx *ctx, + struct ir3_instruction *instr) +{ + unsigned i; + for (i = 1; i < instr->regs_count; i++) { + struct ir3_register *reg = instr->regs[i]; + if (reg->flags & IR3_REG_SSA) { + if (is_deref(reg->instr)) { + struct ir3_instruction *deref; + deref = reg->instr->regs[1]->instr; /* the mova */ + if (ctx->deref == deref) + return true; + } + } + } + return false; +} + +/* when we encounter an instruction that writes to the address register + * when it is in use, we delay that instruction and try to schedule all + * other instructions using the current address register: + */ +static int block_sched_undelayed(struct ir3_sched_ctx *ctx, + struct ir3_block *block) +{ + struct ir3_instruction *instr = block->head; + bool in_use = false; + unsigned cnt = ~0; + + while (instr) { + struct ir3_instruction *next = instr->next; + + if (uses_current_deref(ctx, instr)) { + int ret = trysched(ctx, instr); + if (ret == SCHEDULED) + cnt = 0; + else if (ret > 0) + cnt = MIN2(cnt, ret); + in_use = true; + } + + instr = next; + } + + if (!in_use) + ctx->deref = NULL; + + return cnt; +} + static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block) { struct ir3_instruction *instr; @@ -255,6 +322,10 @@ static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block) */ struct ir3_instruction *next = instr->next; int cnt = trysched(ctx, instr); + + if (cnt == DELAYED) + cnt = block_sched_undelayed(ctx, block); + /* -1 is signal to return up stack, but to us means same as 0: */ cnt = MAX2(0, cnt); cnt += ctx->cnt; |