diff options
author | Connor Abbott <[email protected]> | 2018-01-11 18:35:58 -0500 |
---|---|---|
committer | Connor Abbott <[email protected]> | 2019-07-18 14:33:23 +0200 |
commit | 54434fe67068d9abdf73bfc3482337cc4315d2d3 (patch) | |
tree | 70ae5fa227540a1f2aa450190cd1a41378c7d843 /src/gallium/drivers/lima/ir/gp/instr.c | |
parent | 12645e8714d92f40b0da9d2e70022f7444860303 (diff) |
lima/gpir: Rework the scheduler
Now, we do scheduling at the same time as value register allocation. The
ready list now acts similarly to the array of registers in
value_regalloc, keeping us from running out of slots. Before this, the
value register allocator wasn't aware of the scheduling constraints of
the actual machine, which meant that it sometimes chose the wrong false
dependencies to insert. Now, we assign value registers at the same time
as we actually schedule instructions, making its choices reflect reality
much better. It was also conservative in some cases where the new scheme
doesn't have to be. For example, in something like:
1 = ld_att
2 = ld_uni
3 = add 1, 2
It's possible that one of 1 and 2 can't be scheduled in the same
instruction as 3, meaning that a move needs to be inserted, so the value
register allocator needs to assume that this sequence requires two
registers. But when actually scheduling, we could discover that 1, 2,
and 3 can all be scheduled together, so that they only require one
register. The new scheduler speculatively inserts the instruction under
consideration, as well as all of its child load instructions, and then
counts the number of live value registers after all is said and done.
This lets us be more aggressive with scheduling when we're close to the
limit.
With the new scheduler, the kmscube vertex shader is now scheduled in 40
instructions, versus 66 before.
Acked-by: Qiang Yu <[email protected]>
Diffstat (limited to 'src/gallium/drivers/lima/ir/gp/instr.c')
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/instr.c | 80 |
1 files changed, 59 insertions, 21 deletions
diff --git a/src/gallium/drivers/lima/ir/gp/instr.c b/src/gallium/drivers/lima/ir/gp/instr.c index d3190d39d4c..228d1459280 100644 --- a/src/gallium/drivers/lima/ir/gp/instr.c +++ b/src/gallium/drivers/lima/ir/gp/instr.c @@ -36,6 +36,7 @@ gpir_instr *gpir_instr_create(gpir_block *block) instr->index = block->sched.instr_index++; instr->alu_num_slot_free = 6; + instr->alu_non_cplx_slot_free = 5; list_add(&instr->list, &block->instr_list); return instr; @@ -85,6 +86,11 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node) return false; int consume_slot = gpir_instr_get_consume_slot(instr, node); + int non_cplx_consume_slot = + node->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 0 : consume_slot; + int store_reduce_slot = 0; + int max_reduce_slot = node->sched.max_node ? 1 : 0; + int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0; /* check if this node is child of one store node. * complex1 won't be any of this instr's store node's child, @@ -93,25 +99,40 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node) for (int i = GPIR_INSTR_SLOT_STORE0; i <= GPIR_INSTR_SLOT_STORE3; i++) { gpir_store_node *s = gpir_node_to_store(instr->slots[i]); if (s && s->child == node) { - /* acc node may consume 2 slots, so even it's the child of a - * store node, it may not be inserted successfully, in which - * case we need a move node for it */ - if (instr->alu_num_slot_free - consume_slot < - instr->alu_num_slot_needed_by_store - 1) - return false; - - instr->alu_num_slot_needed_by_store--; - instr->alu_num_slot_free -= consume_slot; - return true; + store_reduce_slot = 1; + break; } } - /* not a child of any store node, so must reserve alu slot for store node */ - if (instr->alu_num_slot_free - consume_slot < - instr->alu_num_slot_needed_by_store) + /* Check that the invariants will be maintained after we adjust everything + */ + + int slot_difference = + instr->alu_num_slot_needed_by_store - store_reduce_slot + + instr->alu_num_slot_needed_by_max - max_reduce_slot + + MAX2(instr->alu_num_slot_needed_by_next_max - next_max_reduce_slot, 0) - + (instr->alu_num_slot_free - consume_slot); + if (slot_difference > 0) { + gpir_debug("failed %d because of alu slot\n", node->index); + instr->slot_difference = slot_difference; + } + + int non_cplx_slot_difference = + instr->alu_num_slot_needed_by_max - max_reduce_slot - + (instr->alu_non_cplx_slot_free - non_cplx_consume_slot); + if (non_cplx_slot_difference > 0) { + gpir_debug("failed %d because of alu slot\n", node->index); + instr->non_cplx_slot_difference = non_cplx_slot_difference; + } + + if (slot_difference > 0 || non_cplx_slot_difference > 0) return false; instr->alu_num_slot_free -= consume_slot; + instr->alu_non_cplx_slot_free -= non_cplx_consume_slot; + instr->alu_num_slot_needed_by_store -= store_reduce_slot; + instr->alu_num_slot_needed_by_max -= max_reduce_slot; + instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot; return true; } @@ -123,12 +144,17 @@ static void gpir_instr_remove_alu(gpir_instr *instr, gpir_node *node) gpir_store_node *s = gpir_node_to_store(instr->slots[i]); if (s && s->child == node) { instr->alu_num_slot_needed_by_store++; - instr->alu_num_slot_free += consume_slot; - return; + break; } } instr->alu_num_slot_free += consume_slot; + if (node->sched.pos != GPIR_INSTR_SLOT_COMPLEX) + instr->alu_non_cplx_slot_free += consume_slot; + if (node->sched.max_node) + instr->alu_num_slot_needed_by_max++; + if (node->sched.next_max_node) + instr->alu_num_slot_needed_by_next_max++; } static bool gpir_instr_insert_reg0_check(gpir_instr *instr, gpir_node *node) @@ -269,12 +295,18 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node) goto out; } - /* no store node has the same child as this node, and child is not - * already in this instr's alu slot, so instr must have some free - * alu slot to insert this node's child + /* Check the invariants documented in gpir.h, similar to the ALU case. + * Since the only thing that changes is alu_num_slot_needed_by_store, we + * can get away with just checking the first one. */ - if (gpir_instr_alu_slot_is_full(instr)) + int slot_difference = instr->alu_num_slot_needed_by_store + 1 + + instr->alu_num_slot_needed_by_max + + MAX2(instr->alu_num_slot_needed_by_next_max, 0) - + instr->alu_num_slot_free; + if (slot_difference > 0) { + instr->slot_difference = slot_difference; return false; + } instr->alu_num_slot_needed_by_store++; @@ -299,6 +331,9 @@ static void gpir_instr_remove_store(gpir_instr *instr, gpir_node *node) int other_slot = GPIR_INSTR_SLOT_STORE0 + (component ^ 1); for (int j = GPIR_INSTR_SLOT_STORE0; j <= GPIR_INSTR_SLOT_STORE3; j++) { + if (j == node->sched.pos) + continue; + gpir_store_node *s = gpir_node_to_store(instr->slots[j]); if (s && s->child == store->child) goto out; @@ -369,6 +404,9 @@ static bool gpir_instr_slot_free(gpir_instr *instr, gpir_node *node) bool gpir_instr_try_insert_node(gpir_instr *instr, gpir_node *node) { + instr->slot_difference = 0; + instr->non_cplx_slot_difference = 0; + if (!gpir_instr_slot_free(instr, node)) return false; @@ -413,7 +451,7 @@ void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node) /* This can happen if we merge duplicate loads in the scheduler. */ if (instr->slots[node->sched.pos] != node) { node->sched.pos = -1; - node->sched.instr = -1; + node->sched.instr = NULL; return; } @@ -439,7 +477,7 @@ void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node) instr->slots[GPIR_INSTR_SLOT_MUL1] = NULL; node->sched.pos = -1; - node->sched.instr = -1; + node->sched.instr = NULL; } void gpir_instr_print_prog(gpir_compiler *comp) |