summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorJason Ekstrand <[email protected]>2019-05-09 10:01:06 -0500
committerJason Ekstrand <[email protected]>2019-05-14 12:30:22 -0500
commit41b310e2196a89a2fdd05509f8160b207d0e4d9b (patch)
tree064d99bfb58f79b8b7f9109fbf0dfa35e6a3f11d /src
parente1511f1d4c3949b107fda284ea9117632b9ef61d (diff)
util/ra: Improve the performance of ra_simplify
The most expensive part of register allocation is the ra_simplify step which is a fixed-point algorithm with a worst-case complexity of O(n^2) which adds the registers to a stack which we then use later to do the actual allocation. This commit uses bit sets and changes the core loop of ra_simplify to first walk 32-node chunks and then walk each chunk. This lets us skip whole 32-node chunks in one go based on bit operations and compute the minimum q value potentially 32x as fast. Of course, the algorithm still has the same fundamental O(n^2) run-time but the constant is now much lower. In the nasty Aztec Ruins compute shader, this shaves a full four seconds off the 30s compile time for a release build of mesa. In a debug build (needed for accurate stack traces), perf says that ra_select takes 20% of runtime before this patch and only 5-6% of runtime after this patch. It also makes shader-db runs faster. Shader-db results on Kaby Lake: total instructions in shared programs: 15311100 -> 15311100 (0.00%) instructions in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 355468050 -> 355468050 (0.00%) cycles in affected programs: 0 -> 0 helped: 0 HURT: 0 Total CPU time (seconds): 2602.37 -> 2524.31 (-3.00%) Reviewed-by: Eric Anholt <[email protected]>
Diffstat (limited to 'src')
-rw-r--r--src/util/register_allocate.c149
1 files changed, 119 insertions, 30 deletions
diff --git a/src/util/register_allocate.c b/src/util/register_allocate.c
index 0c62ad3865c..4ab0a50566f 100644
--- a/src/util/register_allocate.c
+++ b/src/util/register_allocate.c
@@ -163,6 +163,21 @@ struct ra_graph {
/** Bit-set indicating, for each register, if it's in the stack */
BITSET_WORD *in_stack;
+ /** Bit-set indicating, for each register, if it's pre-assigned */
+ BITSET_WORD *reg_assigned;
+
+ /** Bit-set indicating, for each register, the value of the pq test */
+ BITSET_WORD *pq_test;
+
+ /** For each BITSET_WORD, the minimum q value or ~0 if unknown */
+ unsigned int *min_q_total;
+
+ /** For each BITSET_WORD, the node with the minimum q_total if
+ * min_q_total[i] != ~0.
+ */
+ unsigned int *min_q_node;
+
/**
* Tracks the start of the set of optimistically-colored registers in the
* stack.
@@ -424,6 +439,11 @@ ra_alloc_interference_graph(struct ra_regs *regs, unsigned int count)
int bitset_count = BITSET_WORDS(count);
g->in_stack = rzalloc_array(g, BITSET_WORD, bitset_count);
+ g->reg_assigned = rzalloc_array(g, BITSET_WORD, bitset_count);
+ g->pq_test = rzalloc_array(g, BITSET_WORD, bitset_count);
+ g->min_q_total = rzalloc_array(g, unsigned int, bitset_count);
+ g->min_q_node = rzalloc_array(g, unsigned int, bitset_count);
+
for (i = 0; i < count; i++) {
g->nodes[i].adjacency = rzalloc_array(g, BITSET_WORD, bitset_count);
@@ -466,29 +486,54 @@ ra_add_node_interference(struct ra_graph *g,
}
}
-static bool
-pq_test(struct ra_graph *g, unsigned int n)
+static void
+update_pq_info(struct ra_graph *g, unsigned int n)
{
+ int i = n / BITSET_WORDBITS;
int n_class = g->nodes[n].class;
-
- return g->nodes[n].q_total < g->regs->classes[n_class]->p;
+ if (g->nodes[n].q_total < g->regs->classes[n_class]->p) {
+ BITSET_SET(g->pq_test, n);
+ } else if (g->min_q_total[i] != UINT_MAX) {
+ /* Only update min_q_total and min_q_node if min_q_total != UINT_MAX so
+ * that we don't update while we have stale data and accidentally mark
+ * it as non-stale. Also, in order to remain consistent with the old
+ * naive implementation of the algorithm, we do a lexicographical sort
+ * to ensure that we always choose the node with the highest node index.
+ */
+ if (g->nodes[n].q_total < g->min_q_total[i] ||
+ (g->nodes[n].q_total == g->min_q_total[i] &&
+ n > g->min_q_node[i])) {
+ g->min_q_total[i] = g->nodes[n].q_total;
+ g->min_q_node[i] = n;
+ }
+ }
}
static void
-decrement_q(struct ra_graph *g, unsigned int n)
+add_node_to_stack(struct ra_graph *g, unsigned int n)
{
unsigned int i;
int n_class = g->nodes[n].class;
+ assert(!BITSET_TEST(g->in_stack, n));
+
for (i = 0; i < g->nodes[n].adjacency_count; i++) {
unsigned int n2 = g->nodes[n].adjacency_list[i];
unsigned int n2_class = g->nodes[n2].class;
- if (!BITSET_TEST(g->in_stack, n2) && g->nodes[n2].reg == NO_REG) {
+ if (!BITSET_TEST(g->in_stack, n2) && !BITSET_TEST(g->reg_assigned, n2)) {
assert(g->nodes[n2].q_total >= g->regs->classes[n2_class]->q[n_class]);
g->nodes[n2].q_total -= g->regs->classes[n2_class]->q[n_class];
+ update_pq_info(g, n2);
}
}
+
+ g->stack[g->stack_count] = n;
+ g->stack_count++;
+ BITSET_SET(g->in_stack, n);
+
+ /* Flag the min_q_total for n's block as dirty so it gets recalculated */
+ g->min_q_total[n / BITSET_WORDBITS] = UINT_MAX;
}
/**
@@ -506,45 +551,89 @@ ra_simplify(struct ra_graph *g)
{
bool progress = true;
unsigned int stack_optimistic_start = UINT_MAX;
- int i;
+
+ /* Figure out the high bit and bit mask for the first iteration of a loop
+ * over BITSET_WORDs.
+ */
+ const unsigned int top_word_high_bit = (g->count - 1) % BITSET_WORDBITS;
+
+ /* Do a quick pre-pass to set things up */
+ for (int i = BITSET_WORDS(g->count) - 1, high_bit = top_word_high_bit;
+ i >= 0; i--, high_bit = BITSET_WORDBITS - 1) {
+ g->min_q_total[i] = UINT_MAX;
+ g->min_q_node[i] = UINT_MAX;
+ for (int j = high_bit; j >= 0; j--) {
+ unsigned int n = i * BITSET_WORDBITS + j;
+ if (g->nodes[n].reg != NO_REG)
+ g->reg_assigned[i] |= BITSET_BIT(j);
+ update_pq_info(g, n);
+ }
+ }
while (progress) {
- unsigned int best_optimistic_node = ~0;
- unsigned int lowest_q_total = ~0;
+ unsigned int min_q_total = UINT_MAX;
+ unsigned int min_q_node = UINT_MAX;
progress = false;
- for (i = g->count - 1; i >= 0; i--) {
- if (BITSET_TEST(g->in_stack, i) || g->nodes[i].reg != NO_REG)
+ for (int i = BITSET_WORDS(g->count) - 1, high_bit = top_word_high_bit;
+ i >= 0; i--, high_bit = BITSET_WORDBITS - 1) {
+ BITSET_WORD mask = ~(BITSET_WORD)0 >> (31 - high_bit);
+
+ BITSET_WORD skip = g->in_stack[i] | g->reg_assigned[i];
+ if (skip == mask)
continue;
- if (pq_test(g, i)) {
- decrement_q(g, i);
- g->stack[g->stack_count] = i;
- g->stack_count++;
- BITSET_SET(g->in_stack, i);
- progress = true;
- } else if (!progress) {
- /* We only need to do this if we haven't made progress. If we
- * have made progress, we'll throw the data away and loop again
- * anyway.
+ BITSET_WORD pq = g->pq_test[i] & ~skip;
+ if (pq) {
+ /* In this case, we have stuff we can immediately push onto the
+ * stack. This also means that we're guaranteed to make progress
+ * and we don't need to bother updating min_q_total because we
+ * know we're going to loop again before attempting to do anything
+ * optimistic.
*/
- unsigned int new_q_total = g->nodes[i].q_total;
- if (new_q_total < lowest_q_total) {
- best_optimistic_node = i;
- lowest_q_total = new_q_total;
+ for (int j = high_bit; j >= 0; j--) {
+ if (pq & BITSET_BIT(j)) {
+ unsigned int n = i * BITSET_WORDBITS + j;
+ assert(n < g->count);
+ add_node_to_stack(g, n);
+ /* add_node_to_stack() may update pq_test for this word so
+ * we need to update our local copy.
+ */
+ pq = g->pq_test[i] & ~skip;
+ progress = true;
+ }
+ }
+ } else if (!progress) {
+ if (g->min_q_total[i] == UINT_MAX) {
+ /* The min_q_total and min_q_node are dirty because we added
+ * one of these nodes to the stack. It needs to be
+ * recalculated.
+ */
+ for (int j = high_bit; j >= 0; j--) {
+ if (skip & BITSET_BIT(j))
+ continue;
+
+ unsigned int n = i * BITSET_WORDBITS + j;
+ assert(n < g->count);
+ if (g->nodes[n].q_total < g->min_q_total[i]) {
+ g->min_q_total[i] = g->nodes[n].q_total;
+ g->min_q_node[i] = n;
+ }
+ }
+ }
+ if (g->min_q_total[i] < min_q_total) {
+ min_q_node = g->min_q_node[i];
+ min_q_total = g->min_q_total[i];
}
}
}
- if (!progress && best_optimistic_node != ~0U) {
+ if (!progress && min_q_total != UINT_MAX) {
if (stack_optimistic_start == UINT_MAX)
stack_optimistic_start = g->stack_count;
- decrement_q(g, best_optimistic_node);
- g->stack[g->stack_count] = best_optimistic_node;
- g->stack_count++;
- BITSET_SET(g->in_stack, best_optimistic_node);
+ add_node_to_stack(g, min_q_node);
progress = true;
}
}