summaryrefslogtreecommitdiffstats
path: root/src/freedreno
diff options
context:
space:
mode:
authorRob Clark <[email protected]>2020-02-28 12:48:16 -0800
committerMarge Bot <[email protected]>2020-03-10 16:01:39 +0000
commitcc82521de4e8e85022a5facb1b5f52d5139d3022 (patch)
tree91240bc3a7e3e264372a74f86319c1710060573d /src/freedreno
parentb2b349096f03803b974d1d942cfff37f77325bee (diff)
freedreno/ir3: round-robin RA
In the second (scalar) pass, use the information about # of registers used in the first pass as the target max, and round-robin within that range. This generally gives the post-RA sched pass more opportunities to re-order instructions to remove nop's. Also, we can be a bit clever when assigning dest registers for SFU instructions, by picking the register used for its src (if available and already assigned). This avoids some (ss) syncs caused by write-after-read hazards. (I.e. the SFU instruction will read its own src before writing dest.) Signed-off-by: Rob Clark <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4071>
Diffstat (limited to 'src/freedreno')
-rw-r--r--src/freedreno/ir3/ir3_ra.c167
1 files changed, 163 insertions, 4 deletions
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
index 39507184105..05bcdcc60b1 100644
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -346,6 +346,9 @@ struct ir3_ra_ctx {
unsigned *def, *use; /* def/use table */
struct ir3_ra_instr_data *instrd;
+ /* Mapping vreg name back to instruction, used by the select_reg callback: */
+ struct hash_table *name_to_instr;
+
/* Tracking for max half/full register assigned. We don't need to
* track high registers.
*
@@ -354,8 +357,14 @@ struct ir3_ra_ctx {
*/
unsigned max_assigned;
unsigned max_half_assigned;
+
+ /* Tracking for select_reg callback */
+ unsigned start_search_reg;
+ unsigned max_target;
};
+static int scalar_name(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, unsigned n);
+
/* does it conflict? */
static inline bool
intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
@@ -640,6 +649,101 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
}
}
+static int
+pick_in_range(BITSET_WORD *regs, unsigned min, unsigned max)
+{
+ for (unsigned i = min; i < max; i++) {
+ if (BITSET_TEST(regs, i)) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+/* register selector for the a6xx+ merged register file: */
+static unsigned int
+ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data)
+{
+ struct ir3_ra_ctx *ctx = data;
+ unsigned int class = ra_get_node_class(ctx->g, n);
+
+ /* dimensions within the register class: */
+ unsigned max_target, start;
+
+ /* the regs bitset will include *all* of the virtual regs, but we lay
+ * out the different classes consecutively in the virtual register
+ * space. So we just need to think about the base offset of a given
+ * class within the virtual register space, and offset the register
+ * space we search within by that base offset.
+ */
+ unsigned base;
+
+ /* NOTE: this is only used in scalar pass, so the register
+ * class will be one of the scalar classes (ie. idx==0):
+ */
+ if (class == ctx->set->high_classes[0]) {
+ max_target = HIGH_CLASS_REGS(0);
+ start = 0;
+ base = ctx->set->gpr_to_ra_reg[HIGH_OFFSET][0];
+ } else if (class == ctx->set->half_classes[0]) {
+ max_target = ctx->max_target;
+ start = ctx->start_search_reg;
+ base = ctx->set->gpr_to_ra_reg[HALF_OFFSET][0];
+ } else if (class == ctx->set->classes[0]) {
+ max_target = ctx->max_target / 2;
+ start = ctx->start_search_reg;
+ base = ctx->set->gpr_to_ra_reg[0][0];
+ } else {
+ unreachable("unexpected register class!");
+ }
+
+ /* For cat4 instructions, if the src reg is already assigned, and
+ * avail to pick, use it. Because this doesn't introduce unnecessary
+ * dependencies, and it potentially avoids needing (ss) syncs
+ * for write-after-read hazards:
+ */
+ struct hash_entry *entry = _mesa_hash_table_search(ctx->name_to_instr, &n);
+ if (entry) {
+ struct ir3_instruction *instr = entry->data;
+
+ if (is_sfu(instr) && instr->regs[1]->instr) {
+ struct ir3_instruction *src = instr->regs[1]->instr;
+ unsigned src_n = scalar_name(ctx, src, 0);
+
+ unsigned reg = ra_get_node_reg(ctx->g, src_n);
+
+ /* Check if the src register has been assigned yet: */
+ if (reg != NO_REG) {
+ if (BITSET_TEST(regs, reg)) {
+ return reg;
+ }
+ }
+ }
+ }
+
+ int r = pick_in_range(regs, base + start, base + max_target);
+ if (r < 0) {
+ /* wrap-around: */
+ r = pick_in_range(regs, base, base + start);
+ }
+
+ if (r < 0) {
+ /* overflow, we need to increase max_target: */
+ ctx->max_target++;
+ return ra_select_reg_merged(n, regs, data);
+ }
+
+ if (class == ctx->set->half_classes[0]) {
+ int n = r - base;
+ ctx->start_search_reg = (n + 1) % ctx->max_target;
+ } else if (class == ctx->set->classes[0]) {
+ int n = (r - base) * 2;
+ ctx->start_search_reg = (n + 1) % ctx->max_target;
+ }
+
+ return r;
+}
+
static void
ra_init(struct ir3_ra_ctx *ctx)
{
@@ -680,6 +784,14 @@ ra_init(struct ir3_ra_ctx *ctx)
ralloc_steal(ctx->g, ctx->instrd);
ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+
+ /* TODO add selector callback for split (pre-a6xx) register file: */
+ if (ctx->scalar_pass && (ctx->ir->compiler->gpu_id >= 600)) {
+ ra_set_select_reg_callback(ctx->g, ra_select_reg_merged, ctx);
+
+ ctx->name_to_instr = _mesa_hash_table_create(ctx->g,
+ _mesa_hash_int, _mesa_key_int_equal);
+ }
}
static unsigned
@@ -837,6 +949,16 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
def(name, instr);
+ if (ctx->name_to_instr && is_sfu(instr)) {
+ /* this is slightly annoying, we can't just use an
+ * integer on the stack
+ */
+ unsigned *key = ralloc(ctx->name_to_instr, unsigned);
+ *key = name;
+ debug_assert(!_mesa_hash_table_search(ctx->name_to_instr, key));
+ _mesa_hash_table_insert(ctx->name_to_instr, key, instr);
+ }
+
if ((instr->opc == OPC_META_INPUT) && first_non_input)
use(name, first_non_input);
@@ -1536,9 +1658,32 @@ ra_sanity_check(struct ir3 *ir)
}
}
+/* Target is calculated in terms of half-regs (with a full reg
+ * consisting of two half-regs).
+ */
+static void
+ra_calc_merged_register_target(struct ir3_ra_ctx *ctx)
+{
+ const unsigned vec4 = 2 * 4; // 8 half-regs
+ unsigned t = MAX2(2 * ctx->max_assigned, ctx->max_half_assigned);
+
+ /* second RA pass may have saved some regs, let's try to reclaim
+ * the benefit by adjusting the target downwards slightly:
+ */
+ if (ir3_has_latency_to_hide(ctx->ir)) {
+ if (t > 8 * vec4) {
+ t -= 2 * vec4;
+ } else if (t > 6 * vec4) {
+ t -= vec4;
+ }
+ }
+
+ ctx->max_target = t;
+}
+
static int
ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
- unsigned nprecolor, bool scalar_pass)
+ unsigned nprecolor, bool scalar_pass, unsigned *target)
{
struct ir3_ra_ctx ctx = {
.v = v,
@@ -1548,6 +1693,10 @@ ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
};
int ret;
+ if (scalar_pass) {
+ ctx.max_target = *target;
+ }
+
ra_init(&ctx);
ra_add_interference(&ctx);
ra_precolor(&ctx, precolor, nprecolor);
@@ -1556,7 +1705,16 @@ ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
ret = ra_alloc(&ctx);
ra_destroy(&ctx);
- printf("#### max_assigned=%u, max_half_assigned=%u\n", ctx.max_assigned, ctx.max_half_assigned);
+ /* In the first pass, calculate the target register usage used in the
+ * second (scalar) pass:
+ */
+ if (!scalar_pass) {
+ /* TODO: round-robin support for pre-a6xx: */
+ if (ctx.ir->compiler->gpu_id >= 600) {
+ ra_calc_merged_register_target(&ctx);
+ }
+ *target = ctx.max_target;
+ }
return ret;
}
@@ -1565,10 +1723,11 @@ int
ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
unsigned nprecolor)
{
+ unsigned target = 0;
int ret;
/* First pass, assign the vecN (non-scalar) registers: */
- ret = ir3_ra_pass(v, precolor, nprecolor, false);
+ ret = ir3_ra_pass(v, precolor, nprecolor, false, &target);
if (ret)
return ret;
@@ -1578,7 +1737,7 @@ ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor,
}
/* Second pass, assign the scalar registers: */
- ret = ir3_ra_pass(v, precolor, nprecolor, true);
+ ret = ir3_ra_pass(v, precolor, nprecolor, true, &target);
if (ret)
return ret;