diff options
author | Eric Anholt <[email protected]> | 2014-12-08 17:43:29 -0800 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2014-12-09 00:55:14 -0800 |
commit | ab1b1fa6fbd72b05c48f83c9df5036c2bfe893a3 (patch) | |
tree | 05d14561a1c6accf8c2450811434a6e545c6c289 | |
parent | 0d4272cd8e7c45157140dc8e283707714a8238d5 (diff) |
vc4: Prioritize allocating accumulators to short-lived values.
The register allocator walks from the end of the nodes array looking for
trivially-allocatable things to put on the stack, meaning (assuming
everything is trivially colorable and gets put on the stack in a single
pass) the low node numbers get allocated first. The things allocated
first happen to get the lower-numbered registers, which is to say the fast
accumulators that can be paired more easily.
When we previously made the nodes match the temporary register numbers,
we'd end up putting the shader inputs (VS or FS) in the accumulators,
which are often long-lived values. By prioritizing the shortest-lived
values for allocation, we can get a lot more instructions that involve
accumulators, and thus fewer conflicts for raddr and WS.
total instructions in shared programs: 52870 -> 46428 (-12.18%)
instructions in affected programs: 52260 -> 45818 (-12.33%)
-rw-r--r-- | src/gallium/drivers/vc4/vc4_register_allocate.c | 73 |
1 file changed, 59 insertions, 14 deletions
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index b62669feb30..3001900c074 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -139,6 +139,20 @@ vc4_alloc_reg_set(struct vc4_context *vc4) ra_set_finalize(vc4->regs, NULL); } +struct node_to_temp_map { + uint32_t temp; + uint32_t priority; +}; + +static int +node_to_temp_priority(const void *in_a, const void *in_b) +{ + const struct node_to_temp_map *a = in_a; + const struct node_to_temp_map *b = in_b; + + return a->priority - b->priority; +} + /** * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. * @@ -148,6 +162,8 @@ struct qpu_reg * vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) { struct simple_node *node; + struct node_to_temp_map map[c->num_temps]; + uint32_t temp_to_node[c->num_temps]; uint32_t def[c->num_temps]; uint32_t use[c->num_temps]; struct qpu_reg *temp_registers = calloc(c->num_temps, @@ -166,11 +182,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, c->num_temps); - for (uint32_t i = 0; i < c->num_temps; i++) + for (uint32_t i = 0; i < c->num_temps; i++) { ra_set_node_class(g, i, vc4->reg_class_any); + } - /* Compute the live ranges so we can figure out interference, and - * figure out our register classes and preallocated registers. + /* Compute the live ranges so we can figure out interference. */ uint32_t ip = 0; foreach(node, &c->instructions) { @@ -188,27 +204,54 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) switch (inst->op) { case QOP_FRAG_Z: + case QOP_FRAG_W: + /* The payload registers have values implicitly loaded + * at the start of the program. 
+ */ def[inst->dst.index] = 0; - ra_set_node_reg(g, inst->dst.index, + break; + default: + break; + } + + ip++; + } + + for (uint32_t i = 0; i < c->num_temps; i++) { + map[i].temp = i; + map[i].priority = use[i] - def[i]; + } + qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); + for (uint32_t i = 0; i < c->num_temps; i++) { + temp_to_node[map[i].temp] = i; + } + + /* Figure out our register classes and preallocated registers*/ + foreach(node, &c->instructions) { + struct qinst *inst = (struct qinst *)node; + + switch (inst->op) { + case QOP_FRAG_Z: + ra_set_node_reg(g, temp_to_node[inst->dst.index], AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1); break; case QOP_FRAG_W: - def[inst->dst.index] = 0; - ra_set_node_reg(g, inst->dst.index, + ra_set_node_reg(g, temp_to_node[inst->dst.index], AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2); break; case QOP_TEX_RESULT: case QOP_TLB_COLOR_READ: assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4); - ra_set_node_reg(g, inst->dst.index, + ra_set_node_reg(g, temp_to_node[inst->dst.index], ACC_INDEX + 4); break; case QOP_PACK_SCALED: /* The pack flags require an A-file dst register. */ - ra_set_node_class(g, inst->dst.index, vc4->reg_class_a); + ra_set_node_class(g, temp_to_node[inst->dst.index], + vc4->reg_class_a); break; case QOP_UNPACK_8A: @@ -216,20 +259,22 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) case QOP_UNPACK_8C: case QOP_UNPACK_8D: /* The unpack flags require an A-file src register. 
*/ - ra_set_node_class(g, inst->src[0].index, vc4->reg_class_a); + ra_set_node_class(g, temp_to_node[inst->src[0].index], + vc4->reg_class_a); break; default: break; } - - ip++; } for (uint32_t i = 0; i < c->num_temps; i++) { for (uint32_t j = i + 1; j < c->num_temps; j++) { - if (!(def[i] >= use[j] || def[j] >= use[i])) - ra_add_node_interference(g, i, j); + if (!(def[i] >= use[j] || def[j] >= use[i])) { + ra_add_node_interference(g, + temp_to_node[i], + temp_to_node[j]); + } } } @@ -237,7 +282,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) assert(ok); for (uint32_t i = 0; i < c->num_temps; i++) { - temp_registers[i] = vc4_regs[ra_get_node_reg(g, i)]; + temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])]; /* If the value's never used, just write to the NOP register * for clarity in debug output. |