aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/lima/ir/gp/instr.c
diff options
context:
space:
mode:
authorConnor Abbott <[email protected]>2018-01-11 18:35:58 -0500
committerConnor Abbott <[email protected]>2019-07-18 14:33:23 +0200
commit54434fe67068d9abdf73bfc3482337cc4315d2d3 (patch)
tree70ae5fa227540a1f2aa450190cd1a41378c7d843 /src/gallium/drivers/lima/ir/gp/instr.c
parent12645e8714d92f40b0da9d2e70022f7444860303 (diff)
lima/gpir: Rework the scheduler
Now, we do scheduling at the same time as value register allocation. The ready list now acts similarly to the array of registers in value_regalloc, keeping us from running out of slots. Before this, the value register allocator wasn't aware of the scheduling constraints of the actual machine, which meant that it sometimes chose the wrong false dependencies to insert. Now, we assign value registers at the same time as we actually schedule instructions, making its choices reflect reality much better. It was also conservative in some cases where the new scheme doesn't have to be. For example, in something like: 1 = ld_att 2 = ld_uni 3 = add 1, 2 It's possible that one of 1 and 2 can't be scheduled in the same instruction as 3, meaning that a move needs to be inserted, so the value register allocator needs to assume that this sequence requires two registers. But when actually scheduling, we could discover that 1, 2, and 3 can all be scheduled together, so that they only require one register. The new scheduler speculatively inserts the instruction under consideration, as well as all of its child load instructions, and then counts the number of live value registers after all is said and done. This lets us be more aggressive with scheduling when we're close to the limit. With the new scheduler, the kmscube vertex shader is now scheduled in 40 instructions, versus 66 before. Acked-by: Qiang Yu <[email protected]>
Diffstat (limited to 'src/gallium/drivers/lima/ir/gp/instr.c')
-rw-r--r--src/gallium/drivers/lima/ir/gp/instr.c80
1 files changed, 59 insertions, 21 deletions
diff --git a/src/gallium/drivers/lima/ir/gp/instr.c b/src/gallium/drivers/lima/ir/gp/instr.c
index d3190d39d4c..228d1459280 100644
--- a/src/gallium/drivers/lima/ir/gp/instr.c
+++ b/src/gallium/drivers/lima/ir/gp/instr.c
@@ -36,6 +36,7 @@ gpir_instr *gpir_instr_create(gpir_block *block)
instr->index = block->sched.instr_index++;
instr->alu_num_slot_free = 6;
+ instr->alu_non_cplx_slot_free = 5;
list_add(&instr->list, &block->instr_list);
return instr;
@@ -85,6 +86,11 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
return false;
int consume_slot = gpir_instr_get_consume_slot(instr, node);
+ int non_cplx_consume_slot =
+ node->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 0 : consume_slot;
+ int store_reduce_slot = 0;
+ int max_reduce_slot = node->sched.max_node ? 1 : 0;
+ int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0;
/* check if this node is child of one store node.
* complex1 won't be any of this instr's store node's child,
@@ -93,25 +99,40 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
for (int i = GPIR_INSTR_SLOT_STORE0; i <= GPIR_INSTR_SLOT_STORE3; i++) {
gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
if (s && s->child == node) {
- /* acc node may consume 2 slots, so even it's the child of a
- * store node, it may not be inserted successfully, in which
- * case we need a move node for it */
- if (instr->alu_num_slot_free - consume_slot <
- instr->alu_num_slot_needed_by_store - 1)
- return false;
-
- instr->alu_num_slot_needed_by_store--;
- instr->alu_num_slot_free -= consume_slot;
- return true;
+ store_reduce_slot = 1;
+ break;
}
}
- /* not a child of any store node, so must reserve alu slot for store node */
- if (instr->alu_num_slot_free - consume_slot <
- instr->alu_num_slot_needed_by_store)
+ /* Check that the invariants will be maintained after we adjust everything
+ */
+
+ int slot_difference =
+ instr->alu_num_slot_needed_by_store - store_reduce_slot +
+ instr->alu_num_slot_needed_by_max - max_reduce_slot +
+ MAX2(instr->alu_num_slot_needed_by_next_max - next_max_reduce_slot, 0) -
+ (instr->alu_num_slot_free - consume_slot);
+ if (slot_difference > 0) {
+ gpir_debug("failed %d because of alu slot\n", node->index);
+ instr->slot_difference = slot_difference;
+ }
+
+ int non_cplx_slot_difference =
+ instr->alu_num_slot_needed_by_max - max_reduce_slot -
+ (instr->alu_non_cplx_slot_free - non_cplx_consume_slot);
+ if (non_cplx_slot_difference > 0) {
+ gpir_debug("failed %d because of alu slot\n", node->index);
+ instr->non_cplx_slot_difference = non_cplx_slot_difference;
+ }
+
+ if (slot_difference > 0 || non_cplx_slot_difference > 0)
return false;
instr->alu_num_slot_free -= consume_slot;
+ instr->alu_non_cplx_slot_free -= non_cplx_consume_slot;
+ instr->alu_num_slot_needed_by_store -= store_reduce_slot;
+ instr->alu_num_slot_needed_by_max -= max_reduce_slot;
+ instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot;
return true;
}
@@ -123,12 +144,17 @@ static void gpir_instr_remove_alu(gpir_instr *instr, gpir_node *node)
gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
if (s && s->child == node) {
instr->alu_num_slot_needed_by_store++;
- instr->alu_num_slot_free += consume_slot;
- return;
+ break;
}
}
instr->alu_num_slot_free += consume_slot;
+ if (node->sched.pos != GPIR_INSTR_SLOT_COMPLEX)
+ instr->alu_non_cplx_slot_free += consume_slot;
+ if (node->sched.max_node)
+ instr->alu_num_slot_needed_by_max++;
+ if (node->sched.next_max_node)
+ instr->alu_num_slot_needed_by_next_max++;
}
static bool gpir_instr_insert_reg0_check(gpir_instr *instr, gpir_node *node)
@@ -269,12 +295,18 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
goto out;
}
- /* no store node has the same child as this node, and child is not
- * already in this instr's alu slot, so instr must have some free
- * alu slot to insert this node's child
+ /* Check the invariants documented in gpir.h, similar to the ALU case.
+ * Since the only thing that changes is alu_num_slot_needed_by_store, we
+ * can get away with just checking the first one.
*/
- if (gpir_instr_alu_slot_is_full(instr))
+ int slot_difference = instr->alu_num_slot_needed_by_store + 1
+ + instr->alu_num_slot_needed_by_max +
+ MAX2(instr->alu_num_slot_needed_by_next_max, 0) -
+ instr->alu_num_slot_free;
+ if (slot_difference > 0) {
+ instr->slot_difference = slot_difference;
return false;
+ }
instr->alu_num_slot_needed_by_store++;
@@ -299,6 +331,9 @@ static void gpir_instr_remove_store(gpir_instr *instr, gpir_node *node)
int other_slot = GPIR_INSTR_SLOT_STORE0 + (component ^ 1);
for (int j = GPIR_INSTR_SLOT_STORE0; j <= GPIR_INSTR_SLOT_STORE3; j++) {
+ if (j == node->sched.pos)
+ continue;
+
gpir_store_node *s = gpir_node_to_store(instr->slots[j]);
if (s && s->child == store->child)
goto out;
@@ -369,6 +404,9 @@ static bool gpir_instr_slot_free(gpir_instr *instr, gpir_node *node)
bool gpir_instr_try_insert_node(gpir_instr *instr, gpir_node *node)
{
+ instr->slot_difference = 0;
+ instr->non_cplx_slot_difference = 0;
+
if (!gpir_instr_slot_free(instr, node))
return false;
@@ -413,7 +451,7 @@ void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node)
/* This can happen if we merge duplicate loads in the scheduler. */
if (instr->slots[node->sched.pos] != node) {
node->sched.pos = -1;
- node->sched.instr = -1;
+ node->sched.instr = NULL;
return;
}
@@ -439,7 +477,7 @@ void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node)
instr->slots[GPIR_INSTR_SLOT_MUL1] = NULL;
node->sched.pos = -1;
- node->sched.instr = -1;
+ node->sched.instr = NULL;
}
void gpir_instr_print_prog(gpir_compiler *comp)