summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJason Ekstrand <[email protected]>2014-10-03 18:13:05 -0700
committerJason Ekstrand <[email protected]>2014-10-24 16:25:31 -0700
commit5d1046291a13a1be4bfaeed0fb9d758f877e927e (patch)
tree77df332f764c0c329959830c24c13880dd68270d
parent2ec161b2396b08341264965a5825152784b54549 (diff)
i965/fs: Compute q-values for register allocation manually
Previously, we were allowing the register allocation code to do the computation for us in ra_set_finalize. However, the runtime for this computation is O(c^4 * g) where c is the number of classes and g is the number of GRF registers. However, these q-values are directly computable based on the way we lay out our register classes so there is no need for the aweful runtime algorithm. We were doing ok until commit 7210583eb where we bumped the number of register classes from 11 to 16. While startup times don't normally matter, this caused piglit to take 4 times as long to run on Bay Trail. This patch should make generating the ra_set much faster and melt the piglit run times. v2: Fixed a couple of bugs. I have now verified that the same q-values are generated both ways. Signed-off-by: Jason Ekstrand <[email protected]> Reviewed-by: Matt Turner <[email protected]>
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp58
1 files changed, 56 insertions, 2 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 0c4888fbd57..7ae6c75a952 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -152,6 +152,13 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
int *classes = ralloc_array(screen, int, class_count);
int aligned_pairs_class = -1;
+ /* Allocate space for q values. We allocate class_count + 1 because we
+ * want to leave room for the aligned pairs class if we have it. */
+ unsigned int **q_values = ralloc_array(screen, unsigned int *,
+ class_count + 1);
+ for (int i = 0; i < class_count + 1; ++i)
+ q_values[i] = ralloc_array(q_values, unsigned int, class_count + 1);
+
/* Now, add the registers to their classes, and add the conflicts
* between them and the base GRF registers (and also each other).
*/
@@ -162,8 +169,41 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
int class_reg_count;
if (devinfo->gen <= 5 && reg_width == 2) {
class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
+
+ /* See comment below. The only difference here is that we are
+ * dealing with pairs of registers instead of single registers.
+ * Registers of odd sizes simply get rounded up. */
+ for (int j = 0; j < class_count; j++)
+ q_values[i][j] = (class_sizes[i] + 1) / 2 +
+ (class_sizes[j] + 1) / 2 - 1;
} else {
class_reg_count = base_reg_count - (class_sizes[i] - 1);
+
+ /* From register_allocate.c:
+ *
+ * q(B,C) (indexed by C, B is this register class) in
+ * Runeson/Nyström paper. This is "how many registers of B could
+ * the worst choice register from C conflict with".
+ *
+ * If we just let the register allocation algorithm compute these
+ * values, is extremely expensive. However, since all of our
+ * registers are laid out, we can very easily compute them
+ * ourselves. View the register from C as fixed starting at GRF n
+ * somwhere in the middle, and the register from B as sliding back
+ * and forth. Then the first register to conflict from B is the
+ * one starting at n - class_size[B] + 1 and the last register to
+ * conflict will start at n + class_size[B] - 1. Therefore, the
+ * number of conflicts from B is class_size[B] + class_size[C] - 1.
+ *
+ * +-+-+-+-+-+-+ +-+-+-+-+-+-+
+ * B | | | | | |n| --> | | | | | | |
+ * +-+-+-+-+-+-+ +-+-+-+-+-+-+
+ * +-+-+-+-+-+
+ * C |n| | | | |
+ * +-+-+-+-+-+
+ */
+ for (int j = 0; j < class_count; j++)
+ q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
}
classes[i] = ra_alloc_reg_class(regs);
@@ -208,7 +248,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
/* Add a special class for aligned pairs, which we'll put delta_x/y
* in on gen5 so that we can do PLN.
*/
- if (devinfo->has_pln && devinfo->gen < 6) {
+ if (devinfo->has_pln && reg_width == 1 && devinfo->gen < 6) {
aligned_pairs_class = ra_alloc_reg_class(regs);
for (int i = 0; i < pairs_reg_count; i++) {
@@ -216,9 +256,23 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i);
}
}
+
+ for (int i = 0; i < class_count; i++) {
+ /* These are a little counter-intuitive because the pair registers
+ * are required to be aligned while the register they are
+ * potentially interferring with are not. In the case where the
+ * size is even, the worst-case is that the register is
+ * odd-aligned. In the odd-size case, it doesn't matter.
+ */
+ q_values[class_count][i] = class_sizes[i] / 2 + 1;
+ q_values[i][class_count] = class_sizes[i] + 1;
+ }
+ q_values[class_count][class_count] = 1;
}
- ra_set_finalize(regs, NULL);
+ ra_set_finalize(regs, q_values);
+
+ ralloc_free(q_values);
screen->wm_reg_sets[index].regs = regs;
for (unsigned i = 0; i < ARRAY_SIZE(screen->wm_reg_sets[index].classes); i++)