Diffstat (limited to 'src/gallium/drivers/lima')
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/codegen.c | 8
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/gpir.h | 50
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/instr.c | 80
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/nir.c | 5
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/node.c | 2
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/physical_regalloc.c | 135
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/regalloc.c (renamed from src/gallium/drivers/lima/ir/gp/value_regalloc.c) | 85
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/scheduler.c | 1379
-rw-r--r-- | src/gallium/drivers/lima/meson.build | 3
9 files changed, 1187 insertions, 560 deletions
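The core of this change is the pair of slot-reservation invariants documented in the new gpir.h comment and enforced in instr.c below. As a reading aid, here is a minimal standalone C sketch of that accounting; the struct and MAX2 macro are restated here purely for illustration, and only the field names and the two invariants come from the patch itself:

#include <stdbool.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* Restatement of the new gpir_instr counters (illustrative only). */
struct slot_state {
   int alu_num_slot_free;              /* starts at 6 per instruction */
   int alu_non_cplx_slot_free;         /* starts at 5 (complex slot excluded) */
   int alu_num_slot_needed_by_store;
   int alu_num_slot_needed_by_max;
   int alu_num_slot_needed_by_next_max;
};

/* Returns true if consuming `consume` ALU slots (`non_cplx_consume` of them
 * outside the complex slot) still leaves room for the moves that stores,
 * max nodes, and next-max nodes may later require. */
static bool invariants_hold(const struct slot_state *s,
                            int consume, int non_cplx_consume)
{
   /* Invariant (1): the free slots must cover all outstanding reservations. */
   int needed = s->alu_num_slot_needed_by_store +
                s->alu_num_slot_needed_by_max +
                MAX2(s->alu_num_slot_needed_by_next_max, 0);
   if (s->alu_num_slot_free - consume < needed)
      return false;

   /* Invariant (2): a max node's move must reach a use two cycles away,
    * which the complex slot's one-deep FIFO cannot do, so max nodes are
    * only counted against the non-complex slots. */
   return s->alu_non_cplx_slot_free - non_cplx_consume >=
          s->alu_num_slot_needed_by_max;
}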
diff --git a/src/gallium/drivers/lima/ir/gp/codegen.c b/src/gallium/drivers/lima/ir/gp/codegen.c index 1b7c8903c97..9bc279e0119 100644 --- a/src/gallium/drivers/lima/ir/gp/codegen.c +++ b/src/gallium/drivers/lima/ir/gp/codegen.c @@ -76,9 +76,13 @@ static gpir_codegen_src gpir_get_alu_input(gpir_node *parent, gpir_node *child) gpir_codegen_src_load_w, gpir_codegen_src_unused, gpir_codegen_src_unused }, }; - assert(child->sched.instr - parent->sched.instr < 3); + int diff = child->sched.instr->index - parent->sched.instr->index; + assert(diff < 3); + assert(diff >= 0); - return slot_to_src[child->sched.pos][child->sched.instr - parent->sched.instr]; + int src = slot_to_src[child->sched.pos][diff]; + assert(src != gpir_codegen_src_unused); + return src; } static void gpir_codegen_mul0_slot(gpir_codegen_instr *code, gpir_instr *instr) diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h index e7f199033cd..6a1d533977b 100644 --- a/src/gallium/drivers/lima/ir/gp/gpir.h +++ b/src/gallium/drivers/lima/ir/gp/gpir.h @@ -131,8 +131,6 @@ typedef struct { GPIR_DEP_OFFSET, /* def is the offset of use (i.e. temp store) */ GPIR_DEP_READ_AFTER_WRITE, GPIR_DEP_WRITE_AFTER_READ, - GPIR_DEP_VREG_READ_AFTER_WRITE, - GPIR_DEP_VREG_WRITE_AFTER_READ, } type; /* node execute before succ */ @@ -146,6 +144,9 @@ typedef struct { struct list_head succ_link; } gpir_dep; +struct gpir_instr; +struct gpir_store_node; + typedef struct gpir_node { struct list_head list; gpir_op op; @@ -165,12 +166,14 @@ typedef struct gpir_node { int value_reg; union { struct { - int instr; + struct gpir_instr *instr; + struct gpir_store_node *physreg_store; int pos; int dist; int index; bool ready; bool inserted; + bool max_node, next_max_node; } sched; struct { int parent_index; @@ -223,7 +226,7 @@ typedef struct { struct list_head reg_link; } gpir_load_node; -typedef struct { +typedef struct gpir_store_node { gpir_node node; unsigned index; @@ -266,14 +269,43 @@ enum gpir_instr_slot { GPIR_INSTR_SLOT_DIST_TWO_END = GPIR_INSTR_SLOT_PASS, }; -typedef struct { +typedef struct gpir_instr { int index; struct list_head list; gpir_node *slots[GPIR_INSTR_SLOT_NUM]; + /* The number of ALU slots free for moves. */ int alu_num_slot_free; + + /* The number of ALU slots free for moves, except for the complex slot. */ + int alu_non_cplx_slot_free; + + /* We need to make sure that we can insert moves in the following cases: + * (1) There was a use of a value two cycles ago. + * (2) There were more than 5 uses of a value 1 cycle ago (or else we can't + * possibly satisfy (1) for the next cycle). + * (3) There is a store instruction scheduled, but not its child. + * + * The complex slot cannot be used for a move in case (1), since it only + * has a FIFO depth of 1, but it can be used for (2) and (3). In order to + * ensure that we have enough space for all three, we maintain the + * following invariants: + * + * (1) alu_num_slot_free >= alu_num_slot_needed_by_store + + * alu_num_slot_needed_by_max + + * alu_num_slot_needed_by_next_max + * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + */ int alu_num_slot_needed_by_store; + int alu_num_slot_needed_by_max; + int alu_num_slot_needed_by_next_max; + + /* Used to communicate to the scheduler how many slots need to be cleared + * up in order to satisfy the invariants. 
+ */ + int slot_difference; + int non_cplx_slot_difference; int reg0_use_count; bool reg0_is_attr; @@ -387,18 +419,12 @@ bool gpir_instr_try_insert_node(gpir_instr *instr, gpir_node *node); void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node); void gpir_instr_print_prog(gpir_compiler *comp); -static inline bool gpir_instr_alu_slot_is_full(gpir_instr *instr) -{ - return instr->alu_num_slot_free <= instr->alu_num_slot_needed_by_store; -} - bool gpir_codegen_acc_same_op(gpir_op op1, gpir_op op2); bool gpir_pre_rsched_lower_prog(gpir_compiler *comp); bool gpir_post_rsched_lower_prog(gpir_compiler *comp); bool gpir_reduce_reg_pressure_schedule_prog(gpir_compiler *comp); -bool gpir_value_regalloc_prog(gpir_compiler *comp); -bool gpir_physical_regalloc_prog(gpir_compiler *comp); +bool gpir_regalloc_prog(gpir_compiler *comp); bool gpir_schedule_prog(gpir_compiler *comp); bool gpir_codegen_prog(gpir_compiler *comp); diff --git a/src/gallium/drivers/lima/ir/gp/instr.c b/src/gallium/drivers/lima/ir/gp/instr.c index d3190d39d4c..228d1459280 100644 --- a/src/gallium/drivers/lima/ir/gp/instr.c +++ b/src/gallium/drivers/lima/ir/gp/instr.c @@ -36,6 +36,7 @@ gpir_instr *gpir_instr_create(gpir_block *block) instr->index = block->sched.instr_index++; instr->alu_num_slot_free = 6; + instr->alu_non_cplx_slot_free = 5; list_add(&instr->list, &block->instr_list); return instr; @@ -85,6 +86,11 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node) return false; int consume_slot = gpir_instr_get_consume_slot(instr, node); + int non_cplx_consume_slot = + node->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 0 : consume_slot; + int store_reduce_slot = 0; + int max_reduce_slot = node->sched.max_node ? 1 : 0; + int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0; /* check if this node is child of one store node. 
* complex1 won't be any of this instr's store node's child, * @@ -93,25 +99,40 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node) for (int i = GPIR_INSTR_SLOT_STORE0; i <= GPIR_INSTR_SLOT_STORE3; i++) { gpir_store_node *s = gpir_node_to_store(instr->slots[i]); if (s && s->child == node) { - /* acc node may consume 2 slots, so even it's the child of a - * store node, it may not be inserted successfully, in which - * case we need a move node for it */ - if (instr->alu_num_slot_free - consume_slot < - instr->alu_num_slot_needed_by_store - 1) - return false; - - instr->alu_num_slot_needed_by_store--; - instr->alu_num_slot_free -= consume_slot; - return true; + store_reduce_slot = 1; + break; } } - /* not a child of any store node, so must reserve alu slot for store node */ - if (instr->alu_num_slot_free - consume_slot < - instr->alu_num_slot_needed_by_store) + /* Check that the invariants will be maintained after we adjust everything + */ + + int slot_difference = + instr->alu_num_slot_needed_by_store - store_reduce_slot + + instr->alu_num_slot_needed_by_max - max_reduce_slot + + MAX2(instr->alu_num_slot_needed_by_next_max - next_max_reduce_slot, 0) - + (instr->alu_num_slot_free - consume_slot); + if (slot_difference > 0) { + gpir_debug("failed %d because of alu slot\n", node->index); + instr->slot_difference = slot_difference; + } + + int non_cplx_slot_difference = + instr->alu_num_slot_needed_by_max - max_reduce_slot - + (instr->alu_non_cplx_slot_free - non_cplx_consume_slot); + if (non_cplx_slot_difference > 0) { + gpir_debug("failed %d because of alu non-complex slot\n", node->index); + instr->non_cplx_slot_difference = non_cplx_slot_difference; + } + + if (slot_difference > 0 || non_cplx_slot_difference > 0) return false; instr->alu_num_slot_free -= consume_slot; + instr->alu_non_cplx_slot_free -= non_cplx_consume_slot; + instr->alu_num_slot_needed_by_store -= store_reduce_slot; + instr->alu_num_slot_needed_by_max -= max_reduce_slot; + instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot; return true; } @@ -123,12 +144,17 @@ static void gpir_instr_remove_alu(gpir_instr *instr, gpir_node *node) gpir_store_node *s = gpir_node_to_store(instr->slots[i]); if (s && s->child == node) { instr->alu_num_slot_needed_by_store++; - instr->alu_num_slot_free += consume_slot; - return; + break; } } instr->alu_num_slot_free += consume_slot; + if (node->sched.pos != GPIR_INSTR_SLOT_COMPLEX) + instr->alu_non_cplx_slot_free += consume_slot; + if (node->sched.max_node) + instr->alu_num_slot_needed_by_max++; + if (node->sched.next_max_node) + instr->alu_num_slot_needed_by_next_max++; } static bool gpir_instr_insert_reg0_check(gpir_instr *instr, gpir_node *node) @@ -269,12 +295,18 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node) goto out; } - /* no store node has the same child as this node, and child is not - * already in this instr's alu slot, so instr must have some free - * alu slot to insert this node's child + /* Check the invariants documented in gpir.h, similar to the ALU case. + * Since the only thing that changes is alu_num_slot_needed_by_store, we + * can get away with just checking the first one. 
*/ - if (gpir_instr_alu_slot_is_full(instr)) + int slot_difference = instr->alu_num_slot_needed_by_store + 1 + + instr->alu_num_slot_needed_by_max + + MAX2(instr->alu_num_slot_needed_by_next_max, 0) - + instr->alu_num_slot_free; + if (slot_difference > 0) { + instr->slot_difference = slot_difference; return false; + } instr->alu_num_slot_needed_by_store++; @@ -299,6 +331,9 @@ static void gpir_instr_remove_store(gpir_instr *instr, gpir_node *node) int other_slot = GPIR_INSTR_SLOT_STORE0 + (component ^ 1); for (int j = GPIR_INSTR_SLOT_STORE0; j <= GPIR_INSTR_SLOT_STORE3; j++) { + if (j == node->sched.pos) + continue; + gpir_store_node *s = gpir_node_to_store(instr->slots[j]); if (s && s->child == store->child) goto out; @@ -369,6 +404,9 @@ static bool gpir_instr_slot_free(gpir_instr *instr, gpir_node *node) bool gpir_instr_try_insert_node(gpir_instr *instr, gpir_node *node) { + instr->slot_difference = 0; + instr->non_cplx_slot_difference = 0; + if (!gpir_instr_slot_free(instr, node)) return false; @@ -413,7 +451,7 @@ void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node) /* This can happen if we merge duplicate loads in the scheduler. */ if (instr->slots[node->sched.pos] != node) { node->sched.pos = -1; - node->sched.instr = -1; + node->sched.instr = NULL; return; } @@ -439,7 +477,7 @@ void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node) instr->slots[GPIR_INSTR_SLOT_MUL1] = NULL; node->sched.pos = -1; - node->sched.instr = -1; + node->sched.instr = NULL; } void gpir_instr_print_prog(gpir_compiler *comp) diff --git a/src/gallium/drivers/lima/ir/gp/nir.c b/src/gallium/drivers/lima/ir/gp/nir.c index 902d27a3149..d1da7ed3754 100644 --- a/src/gallium/drivers/lima/ir/gp/nir.c +++ b/src/gallium/drivers/lima/ir/gp/nir.c @@ -422,10 +422,7 @@ bool gpir_compile_nir(struct lima_vs_shader_state *prog, struct nir_shader *nir) if (!gpir_post_rsched_lower_prog(comp)) goto err_out0; - if (!gpir_value_regalloc_prog(comp)) - goto err_out0; - - if (!gpir_physical_regalloc_prog(comp)) + if (!gpir_regalloc_prog(comp)) goto err_out0; if (!gpir_schedule_prog(comp)) diff --git a/src/gallium/drivers/lima/ir/gp/node.c b/src/gallium/drivers/lima/ir/gp/node.c index eda6ae7ed6f..decda5f1246 100644 --- a/src/gallium/drivers/lima/ir/gp/node.c +++ b/src/gallium/drivers/lima/ir/gp/node.c @@ -436,8 +436,6 @@ static void gpir_node_print_node(gpir_node *node, int type, int space) [GPIR_DEP_OFFSET] = "offset", [GPIR_DEP_READ_AFTER_WRITE] = "RaW", [GPIR_DEP_WRITE_AFTER_READ] = "WaR", - [GPIR_DEP_VREG_READ_AFTER_WRITE] = "vRaW", - [GPIR_DEP_VREG_WRITE_AFTER_READ] = "vWaR", }; for (int i = 0; i < space; i++) diff --git a/src/gallium/drivers/lima/ir/gp/physical_regalloc.c b/src/gallium/drivers/lima/ir/gp/physical_regalloc.c deleted file mode 100644 index 87d88a8f9b7..00000000000 --- a/src/gallium/drivers/lima/ir/gp/physical_regalloc.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2017 Lima Project - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sub license, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * 
of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - */ - -#include <limits.h> - -#include "gpir.h" - -/* Linear scan register alloc for physical reg alloc of each - * load/store node - */ - -static void regalloc_print_result(gpir_compiler *comp) -{ - if (!(lima_debug & LIMA_DEBUG_GP)) - return; - - int index = 0; - printf("======== physical regalloc ========\n"); - list_for_each_entry(gpir_block, block, &comp->block_list, list) { - list_for_each_entry(gpir_node, node, &block->node_list, list) { - if (node->op == gpir_op_load_reg) { - gpir_load_node *load = gpir_node_to_load(node); - printf("%03d: load %d use reg %d\n", index, node->index, load->reg->index); - } - else if (node->op == gpir_op_store_reg) { - gpir_store_node *store = gpir_node_to_store(node); - printf("%03d: store %d use reg %d\n", index, node->index, store->reg->index); - } - index++; - } - printf("----------------------------\n"); - } -} - -bool gpir_physical_regalloc_prog(gpir_compiler *comp) -{ - int index = 0; - list_for_each_entry(gpir_block, block, &comp->block_list, list) { - list_for_each_entry(gpir_node, node, &block->node_list, list) { - node->preg.index = index++; - } - } - - /* calculate each reg liveness interval */ - list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) { - reg->start = INT_MAX; - list_for_each_entry(gpir_store_node, store, ®->defs_list, reg_link) { - if (store->node.preg.index < reg->start) - reg->start = store->node.preg.index; - } - - reg->end = 0; - list_for_each_entry(gpir_load_node, load, ®->uses_list, reg_link) { - if (load->node.preg.index > reg->end) - reg->end = load->node.preg.index; - } - } - - /* sort reg list by start value */ - struct list_head reg_list; - list_replace(&comp->reg_list, ®_list); - list_inithead(&comp->reg_list); - list_for_each_entry_safe(gpir_reg, reg, ®_list, list) { - struct list_head *insert_pos = &comp->reg_list; - list_for_each_entry(gpir_reg, creg, &comp->reg_list, list) { - if (creg->start > reg->start) { - insert_pos = &creg->list; - break; - } - } - list_del(®->list); - list_addtail(®->list, insert_pos); - } - - /* do linear scan reg alloc */ - gpir_reg *active[GPIR_PHYSICAL_REG_NUM] = {0}; - list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) { - int i; - - /* if some reg is expired */ - for (i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) { - if (active[i] && active[i]->end <= reg->start) - active[i] = NULL; - } - - /* find a free reg value for this reg */ - for (i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) { - if (!active[i]) { - active[i] = reg; - reg->index = i; - break; - } - } - - /* TODO: support spill to temp memory */ - assert(i < GPIR_PHYSICAL_REG_NUM); - } - - /* update load/store node info for the real reg */ - list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) { - list_for_each_entry(gpir_store_node, store, ®->defs_list, reg_link) { - store->index = reg->index >> 2; - store->component = reg->index % 4; - } - - list_for_each_entry(gpir_load_node, load, ®->uses_list, reg_link) { - load->index = reg->index >> 2; - load->index = reg->index % 4; - } - } - - regalloc_print_result(comp); - 
return true; -} diff --git a/src/gallium/drivers/lima/ir/gp/value_regalloc.c b/src/gallium/drivers/lima/ir/gp/regalloc.c index f633b949932..c145bfbee81 100644 --- a/src/gallium/drivers/lima/ir/gp/value_regalloc.c +++ b/src/gallium/drivers/lima/ir/gp/regalloc.c @@ -24,60 +24,17 @@ #include "gpir.h" -/* Linear scan register alloc for value reg alloc of each node */ - -static int regalloc_spill_active_node(gpir_node *active[]) -{ - gpir_node *spill = NULL; - for (int i = 0; i < GPIR_VALUE_REG_NUM; i++) { - if (gpir_op_infos[active[i]->op].spillless) - continue; - - /* spill farest node */ - if (!spill || - spill->vreg.last->vreg.index < active[i]->vreg.last->vreg.index) { - spill = active[i]; - } - } - - assert(spill); - gpir_debug("value regalloc spill node %d for value reg %d\n", - spill->index, spill->value_reg); - - /* create store node for spilled node */ - gpir_store_node *store = gpir_node_create(spill->block, gpir_op_store_reg); - store->child = spill; - /* no need to calculate other vreg values because store & spill won't - * be used in the following schedule again */ - store->node.value_reg = spill->value_reg; - list_addtail(&store->node.list, &spill->list); - - gpir_reg *reg = gpir_create_reg(spill->block->comp); - store->reg = reg; - list_addtail(&store->reg_link, ®->defs_list); - - gpir_node_foreach_succ_safe(spill, dep) { - gpir_node *succ = dep->succ; - gpir_load_node *load = gpir_node_create(succ->block, gpir_op_load_reg); - gpir_node_replace_pred(dep, &load->node); - gpir_node_replace_child(succ, spill, &load->node); - list_addtail(&load->node.list, &succ->list); - - /* only valid for succ already scheduled, succ not scheduled will - * re-write this value */ - load->node.value_reg = spill->value_reg; - load->node.vreg.index = - (list_first_entry(&load->node.list, gpir_node, list)->vreg.index + - list_last_entry(&load->node.list, gpir_node, list)->vreg.index) / 2.0f; - load->node.vreg.last = succ; - - load->reg = reg; - list_addtail(&load->reg_link, ®->uses_list); - } - - gpir_node_add_dep(&store->node, spill, GPIR_DEP_INPUT); - return spill->value_reg; -} +/* Register allocation + * + * TODO: This needs to be rewritten when we support multiple basic blocks. We + * need to do proper liveness analysis, combined with either linear scan, + * graph coloring, or SSA-based allocation. We should also support spilling to + * temporaries. + * + * For now, this only assigns fake registers to values, used to build the fake + * dependencies that the scheduler relies on. In the future we should also be + * assigning actual physreg numbers to load_reg/store_reg nodes. 
+ */ static void regalloc_block(gpir_block *block) { @@ -99,7 +56,7 @@ static void regalloc_block(gpir_block *block) /* do linear scan regalloc */ int reg_search_start = 0; - gpir_node *active[GPIR_VALUE_REG_NUM] = {0}; + gpir_node *active[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0}; list_for_each_entry(gpir_node, node, &block->node_list, list) { /* if some reg is expired */ gpir_node_foreach_pred(node, dep) { @@ -116,9 +73,9 @@ static void regalloc_block(gpir_block *block) /* find a free reg for this node */ int i; - for (i = 0; i < GPIR_VALUE_REG_NUM; i++) { + for (i = 0; i < GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; i++) { /* round robin reg select to reduce false dep when schedule */ - int reg = (reg_search_start + i) % GPIR_VALUE_REG_NUM; + int reg = (reg_search_start + i) % (GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM); if (!active[reg]) { active[reg] = node; node->value_reg = reg; @@ -127,14 +84,8 @@ static void regalloc_block(gpir_block *block) } } - /* need spill */ - if (i == GPIR_VALUE_REG_NUM) { - int spilled_reg = regalloc_spill_active_node(active); - active[spilled_reg] = node; - node->value_reg = spilled_reg; - gpir_debug("value regalloc node %d reuse reg %d\n", - node->index, spilled_reg); - } + /* TODO: spill */ + assert(i != GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM); } } @@ -144,7 +95,7 @@ static void regalloc_print_result(gpir_compiler *comp) return; int index = 0; - printf("======== value regalloc ========\n"); + printf("======== regalloc ========\n"); list_for_each_entry(gpir_block, block, &comp->block_list, list) { list_for_each_entry(gpir_node, node, &block->node_list, list) { printf("%03d: %d/%d %s ", index++, node->index, node->value_reg, @@ -159,7 +110,7 @@ static void regalloc_print_result(gpir_compiler *comp) } } -bool gpir_value_regalloc_prog(gpir_compiler *comp) +bool gpir_regalloc_prog(gpir_compiler *comp) { list_for_each_entry(gpir_block, block, &comp->block_list, list) { regalloc_block(block); diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c index 3d7e640ada2..60076daa3c4 100644 --- a/src/gallium/drivers/lima/ir/gp/scheduler.c +++ b/src/gallium/drivers/lima/ir/gp/scheduler.c @@ -27,58 +27,196 @@ #include "gpir.h" /* - * GP schedule algorithm (by Connor Abbott <[email protected]>) + * GP scheduling algorithm (by Connor Abbott <[email protected]>) + * + * The GP pipeline has three main stages: * - * Pre schedule phase: - * 1. order all nodes in a sequence - * 2. convert the real reg read/write to GP load/store node, now all - * variable is SSA - * 3. do reg alloc for all SSA with 11 reg (value reg) and spill with - * load/store to real reg if needed - * 4. add fake dependency like this: - * after step 3, node sequence is - * 01: r1=r2+r3 - * 02: r4=r1+r2 - * 03: r1=r5+r6 - * we should add a fake dependency of node 3 to node 2 like a - * write-after-read dep. But this is not really write-after-read - * dep because there's no r1 really, because it's a value register. - * We need this fake dep in the schedule phase to make sure in any - * schedule point, there're only <=11 input needed by the past - * scheduled nodes. - * 5. 
build DAG according to all the real and fake dep + * -------------------------------------------------------- + * | | + * | Register/Attr/Temp Fetch | + * | | + * -------------------------------------------------------- + * | | | | | | | + * | Mul0 | Mul1 | Add0 | Add1 | Cplx | Pass | + * | | | | | | | + * -------------------------------------------------------- + * | | | | + * | Complex1 | Temp/Register/Varying | Pass | + * | Stage 2 | Store | Stage 2 | + * | | | | + * -------------------------------------------------------- * - * Schedule phase: - * 1. Compute the nodes ready to schedule, if no nodes, exit - * 2. Create a new GP instruction, and call it as current instr - * 3. For any nodes with a use 2 cycles ago with a definition ready to - * schedule, schedule that definition immediately if possible, or else - * schedule a move. - * 4. For any nodes with a use 2 cycles ago but the definition not - * scheduled and not ready to schedule, schedule a move immediately - * to prevent the value from falling off the queue. - * 5. Calculate the number of remaining nodes with a use 1 cycle ago but - * the definition not yet scheduled, and if there are more than 5, - * schedule moves or definitions for the rest now. - * 6. Schedule the rest of the available nodes using your favorite heuristic - * to current instr. - * 7. go to step 1 + * Because of this setup, storing a register has a latency of three cycles. + * Also, the register file is organized into 4-component vectors, and the + * load stage can only load two vectors at a time. Aside from these highly + * constrained register load/store units, there is an explicit bypass + * network, where each unit (mul0/mul1/etc.) can access the results of + * any unit from the previous two cycles directly, except for the complex + * unit whose result can only be accessed for one cycle (since it's expected + * to be used directly by the complex2 instruction in the following cycle). * - * Step 5 for the current instruction guarantees that steps 3 and 4 for - * the next instruction will always succeed, so it's only step 5 that can - * possibly fail. Now, note that the nodes whose definitions have not yet - * been scheduled but one or more use has been scheduled, are exactly the - * nodes that are live in the final schedule. Therefore there will never - * be more than 11 of them (guarenteed by the 11 value reg alloc and the - * fake dep added before schedule). The worst case for step 5 is that all of - * these nodes had a use 1 cycle ago, which means that none of them hit - * case 3 or 4 already, so there are 6 slots still available so step 5 - * will always succeed. In general, even if there are exactly 11 values - * live, if n are scheduled in steps 3 and 4, there are 11-n left in step - * 4 so at most 11-n-5 = 6-n are scheduled in step 5 and therefore 6 are - * scheduled total, below the limit. So the algorithm will always succeed. + * Because of the very restricted register file, and because only rarely are + * all the units in use at the same time, it can be very beneficial to use + * the unused units to "thread" a value from source to destination by using + * moves in the otherwise-unused units, without involving the register file + * at all. It's very difficult to fully exploit this with a traditional + * scheduler, so we need to do something a little un-traditional. The 512 + * instruction limit means that for more complex shaders, we need to do as + * well as possible or else the app won't even work. 
+ * + * The scheduler works by considering the bypass network as a kind of + * register file. It's a quite unusual register file, since registers have to + * be assigned "on the fly" as we schedule operations, but with some care, we + * can use something conceptually similar to a linear-scan allocator to + * successfully schedule nodes to instructions without running into + * conflicts. + * + * Values in the IR are separated into normal values, or "value registers", + * which is what normal nodes like add, mul, etc. produce, and which only + * live inside one basic block, and registers, which can span multiple basic + * blocks but have to be accessed via special load_reg/store_reg nodes. RA + * assigns physical registers to both value registers and normal registers, + * treating load_reg/store_reg as a move instruction, but these are only used + * directly for normal registers -- the physreg assigned to a value register + * is "fake," and is only used inside the scheduler. Before scheduling we + * insert write-after-read dependencies, even for value registers, as if + * we're going to use those, but then we throw them away. For example, if we + * had something like: + * + * (*)r2 = add (*)r1, (*)r2 + * (*)r1 = load_reg r0 + * + * we'd insert a write-after-read dependency between the add and load_reg, + * even though the starred registers aren't actually used by the scheduler + * after this step. This step is crucial since it guarantees that during any + * point in the schedule, the number of live registers + live value registers + * will never exceed the capacity of the register file and the bypass network + * combined. This is because each live register/value register will have a + * different fake number, thanks to the fake dependencies inserted before + * scheduling. This allows us to not have to worry about spilling to + * temporaries, which is only done ahead of time. + * + * The scheduler is a bottom-up scheduler. It keeps track of each live value + * register, and decides on-the-fly which value registers to keep in the + * bypass network and which to "spill" to registers. Of particular importance + * is the "ready list," which consists of "input nodes" (nodes that produce a + * value that can be consumed via the bypass network), both "partially ready" + * (only some of the uses have been scheduled) and "fully ready" (all uses + * have been scheduled), as well as other non-input nodes like register + * stores. Each input node on the ready list represents a live value register + * before the current instruction. There must be at most 11 such input nodes + * at all times, since there are only 11 slots in the next two instructions + * which can reach the current instruction. + * + * An input node is a "max node" if it has a use two cycles ago, which must be + * connected to a definition this cycle. Otherwise it may be a "next max node" + * if it will be a max node on the next instruction (i.e. it has a use at most + * one cycle ago), or it may be neither if all of its uses are this cycle. As + * we keep adding instructions to the front, input nodes graduate from + * neither, to next max, to max, unless we decide to insert a move to keep it + * alive longer, at which point any uses after the current instruction are + * rewritten to be uses of the move so that the original node returns to + * neither. 
The scheduler decides which nodes to try freely, but we have to + * reserve slots for two different reasons: (1) out of the 5 non-complex + * slots, we reserve a slot for each max node, so that we can connect a + * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a + * slot for every next-max node above 5, so that for the next instruction + * there are no more than 5 max nodes. When a max or next-max node gets + * scheduled, the corresponding reservation is reduced by one. At the end, we + * insert moves for every slot that was reserved. The reservation is actually + * managed by gpir_instr, and all we have to do is tell it how many to reserve + * at the beginning and then tell it which nodes are max/next-max nodes. When + * we start scheduling an instruction, there will be at most 5 max nodes + * thanks to the previous instruction's next-max reservation/move insertion. + * Since there are at most 11 total input nodes, if there are N max nodes, + * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 = + * 6 - N slots need to be reserved for next-max nodes, and so at most + * 6 - N + N = 6 slots need to be reserved in total, exactly the total number + * of slots. So, thanks to the total input node restriction, we will never + * need to reserve too many slots. + * + * It sometimes happens that scheduling a given node will violate this total + * input node restriction, or that a reservation will mean that we can't + * schedule it. We first schedule a node "speculatively" to see if this is a + * problem. If some of the node's sources are loads, then we can schedule + * the node and its dependent loads in one swoop to avoid going over the + * pressure limit. If that fails, we can try to spill a ready or + * partially-ready input node to a register by rewriting all of its uses to + * refer to a register load. This removes it from the list of ready and + * partially ready input nodes as all of its uses are now unscheduled. If + * successful, we can then proceed with scheduling the original node. All of + * this happens "speculatively," meaning that afterwards the node is removed + * and the entire state of the scheduler is reverted to before it was tried, to + * ensure that we never get into an invalid state and run out of spots for + * moves. In try_node(), we try to schedule each node speculatively on the + * ready list, keeping only the nodes that could be successfully scheduled, so + * that when we finally decide which node to actually schedule, we know it + * will succeed. This is how we decide on the fly which values go in + * registers and which go in the bypass network. Note that "unspilling" a + * value is simply a matter of scheduling the store_reg instruction created + * when we spill. + * + * The careful accounting of live value registers, reservations for moves, and + * speculative scheduling guarantee that we never run into a failure case + * while scheduling. However, we need to make sure that this scheduler will + * not get stuck in an infinite loop, i.e. that we'll always make forward + * progress by eventually scheduling a non-move node. If we run out of value + * registers, then we may have to spill a node to a register. If we + * were to schedule one of the fully-ready nodes, then we'd have 11 + N live + * value registers before the current instruction. 
But since there are at most + * 64+11 live registers and register values total thanks to the fake + * dependencies we inserted before scheduling, there are at most 64 - N live + * physical registers, and therefore there are at least N registers available + * for spilling. Not all these registers will be available immediately, since + * in order to spill a node to a given register we have to ensure that there + * are slots available to rewrite every use to a load instruction, and that + * may not be the case. There may also be intervening writes which prevent + * some registers from being used. However, these are all temporary problems, + * since as we create each instruction, we create additional register load + * slots that can be freely used for spilling, and we create more move nodes + * which means that the uses of the nodes we're trying to spill keep moving + * forward. This means that eventually, these problems will go away, at which + * point we'll be able to spill a node successfully, so eventually we'll be + * able to schedule the first node on the ready list. */ +typedef struct { + /* This is the list of ready and partially-ready nodes. A partially-ready + * node must have at least one input dependency already scheduled. + */ + struct list_head ready_list; + + /* The number of ready or partially-ready nodes with at least one input + * dependency already scheduled. In other words, the number of live value + * registers. This must be at most 11. + */ + int ready_list_slots; + + /* The physical registers live into the current instruction. */ + uint64_t live_physregs; + + /* The current instruction. */ + gpir_instr *instr; + + /* The current basic block. */ + gpir_block *block; + + /* True if at least one node failed to schedule due to lack of available + * value registers. + */ + bool try_spill_all; + + /* The number of max nodes needed to spill to successfully schedule the + * instruction. + */ + int max_node_spill_needed; + + /* The number of max and next-max nodes needed to spill to successfully + * schedule the instruction. + */ + int total_spill_needed; +} sched_ctx; + static int gpir_min_dist_alu(gpir_dep *dep) { switch (dep->pred->op) { @@ -104,8 +242,13 @@ static int gpir_get_min_dist(gpir_dep *dep) case gpir_op_store_temp: case gpir_op_store_reg: case gpir_op_store_varying: - /* store must use alu node as input */ - if (dep->pred->type == gpir_node_type_load) + /* Stores must use an alu node as input. Also, complex1 takes two + * cycles, which means that its result cannot be stored to a register + * as part of the normal path, and therefore it must also have a move + * inserted. 
+ */ + if (dep->pred->type == gpir_node_type_load || + dep->pred->op == gpir_op_complex1) return INT_MAX >> 2; else return 0; @@ -119,44 +262,24 @@ static int gpir_get_min_dist(gpir_dep *dep) return gpir_min_dist_alu(dep); case GPIR_DEP_READ_AFTER_WRITE: - switch (dep->succ->op) { - case gpir_op_load_temp: - assert(dep->pred->op == gpir_op_store_temp); + if (dep->succ->op == gpir_op_load_temp && + dep->pred->op == gpir_op_store_temp) { return 4; - case gpir_op_load_reg: - assert(dep->pred->op == gpir_op_store_reg); + } else if (dep->succ->op == gpir_op_load_reg && + dep->pred->op == gpir_op_store_reg) { return 3; - case gpir_op_load_uniform: - assert(dep->pred->op == gpir_op_store_temp_load_off0 || - dep->pred->op == gpir_op_store_temp_load_off1 || - dep->pred->op == gpir_op_store_temp_load_off2); + } else if ((dep->pred->op == gpir_op_store_temp_load_off0 || + dep->pred->op == gpir_op_store_temp_load_off1 || + dep->pred->op == gpir_op_store_temp_load_off2) && + dep->succ->op == gpir_op_load_uniform) { return 4; - default: - assert(0); + } else { + /* Fake dependency */ + return 0; } case GPIR_DEP_WRITE_AFTER_READ: - switch (dep->pred->op) { - case gpir_op_load_temp: - assert(dep->succ->op == gpir_op_store_temp); - return -3; - case gpir_op_load_reg: - assert(dep->succ->op == gpir_op_store_reg); - return -2; - case gpir_op_load_uniform: - assert(dep->succ->op == gpir_op_store_temp_load_off0 || - dep->succ->op == gpir_op_store_temp_load_off1 || - dep->succ->op == gpir_op_store_temp_load_off2); - return -3; - default: - assert(0); - } - - case GPIR_DEP_VREG_WRITE_AFTER_READ: return 0; - - case GPIR_DEP_VREG_READ_AFTER_WRITE: - assert(0); /* not possible, this is GPIR_DEP_INPUT */ } return 0; @@ -230,13 +353,64 @@ static void schedule_update_distance(gpir_node *node) if (pred->sched.dist < 0) schedule_update_distance(pred); - int dist = pred->sched.dist + 1; + int dist = pred->sched.dist + gpir_min_dist_alu(dep); if (node->sched.dist < dist) node->sched.dist = dist; } } -static void schedule_insert_ready_list(struct list_head *ready_list, +static bool gpir_is_input_node(gpir_node *node) +{ + gpir_node_foreach_succ(node, dep) { + if (dep->type == GPIR_DEP_INPUT) + return true; + } + return false; +} + + +/* Get the number of slots required for a node on the ready list. + */ +static int gpir_get_slots_required(gpir_node *node) +{ + if (!gpir_is_input_node(node)) + return 0; + + /* Note that we assume every node only consumes one slot, even dual-slot + * instructions. While dual-slot instructions may consume more than one + * slot, we can always safely insert a move if it turns out that there + * isn't enough space for them. There's the risk that we get stuck in an + * infinite loop if all the fully ready nodes are dual-slot nodes, but we + * rely on spilling to registers to save us here. + */ + return 1; +} + +static void verify_ready_list(sched_ctx *ctx) +{ + list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { + if (!gpir_is_input_node(node)) { + assert(node->sched.ready); + } + + if (node->sched.ready) { + /* Every successor must have been scheduled */ + gpir_node_foreach_succ(node, dep) { + assert(dep->succ->sched.instr); + } + } else { + /* There must be at least one successor that's not scheduled. 
*/ + bool unscheduled = false; + gpir_node_foreach_succ(node, dep) { + unscheduled |= !(dep->succ->sched.instr); + } + + assert(unscheduled); + } + } +} + +static void schedule_insert_ready_list(sched_ctx *ctx, gpir_node *insert_node) { /* if this node is fully ready or partially ready @@ -250,7 +424,7 @@ static void schedule_insert_ready_list(struct list_head *ready_list, bool ready = true, insert = false; gpir_node_foreach_succ(insert_node, dep) { gpir_node *succ = dep->succ; - if (succ->sched.instr >= 0) { + if (succ->sched.instr) { if (dep->type == GPIR_DEP_INPUT) insert = true; } @@ -265,8 +439,8 @@ if (!insert || insert_node->sched.inserted) return; - struct list_head *insert_pos = ready_list; - list_for_each_entry(gpir_node, node, ready_list, list) { + struct list_head *insert_pos = &ctx->ready_list; + list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { if (insert_node->sched.dist > node->sched.dist) { insert_pos = &node->list; break; @@ -275,6 +449,7 @@ list_addtail(&insert_node->list, insert_pos); insert_node->sched.inserted = true; + ctx->ready_list_slots += gpir_get_slots_required(insert_node); } static int gpir_get_max_start(gpir_node *node) @@ -284,10 +459,10 @@ /* find the max start instr constrainted by all successors */ gpir_node_foreach_succ(node, dep) { gpir_node *succ = dep->succ; - if (succ->sched.instr < 0) + if (!succ->sched.instr) continue; - int start = succ->sched.instr + gpir_get_min_dist(dep); + int start = succ->sched.instr->index + gpir_get_min_dist(dep); if (start > max_start) max_start = start; } @@ -302,10 +477,10 @@ static int gpir_get_min_end(gpir_node *node) /* find the min end instr constrainted by all successors */ gpir_node_foreach_succ(node, dep) { gpir_node *succ = dep->succ; - if (succ->sched.instr < 0) + if (!succ->sched.instr) continue; - int end = succ->sched.instr + gpir_get_max_dist(dep); + int end = succ->sched.instr->index + gpir_get_max_dist(dep); if (end < min_end) min_end = end; } @@ -330,11 +505,24 @@ static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node) return NULL; } -static bool schedule_try_place_node(gpir_instr *instr, gpir_node *node) +/* Simply place the node into the given instruction without trying to deal + * with liveness or the ready list. This will only fail if the node + * cannot be placed due to a lack of available slots. In addition to normal + * node placement, this is also used for placing loads when spilling to + * registers. + */ +static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node) { if (node->type == gpir_node_type_load) { gpir_node *load = gpir_sched_instr_has_load(instr, node); if (load) { + /* This node may have a store as a successor, in which case we have to + * fail it exactly like below in order to later create a move node in + * between. 
+ */ + if (instr->index < gpir_get_max_start(node)) + return false; + gpir_debug("same load %d in instr %d for node %d\n", load->index, instr->index, node->index); @@ -345,23 +533,100 @@ static bool schedule_try_place_node(gpir_instr *instr, gpir_node *node) } } - node->sched.instr = instr->index; + node->sched.instr = instr; + int max_node_spill_needed = INT_MAX; + int total_spill_needed = INT_MAX; int *slots = gpir_op_infos[node->op].slots; for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) { node->sched.pos = slots[i]; - if (node->sched.instr >= gpir_get_max_start(node) && - node->sched.instr <= gpir_get_min_end(node) && + if (instr->index >= gpir_get_max_start(node) && + instr->index <= gpir_get_min_end(node) && gpir_instr_try_insert_node(instr, node)) return true; + if (ctx->instr->non_cplx_slot_difference || + ctx->instr->slot_difference) { + /* If one of these fields is non-zero, then we could insert the node + * here after spilling. To get an accurate count of how many nodes we + * need to spill, we need to choose one of the positions where there + * were nonzero slot differences, preferably one with the smallest + * difference (so we don't have to spill as much). + */ + if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed || + ctx->instr->slot_difference < total_spill_needed) { + max_node_spill_needed = ctx->instr->non_cplx_slot_difference; + total_spill_needed = ctx->instr->slot_difference; + } + } } - node->sched.instr = -1; + if (max_node_spill_needed != INT_MAX) { + /* Indicate how many spill nodes are needed. */ + ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed, + max_node_spill_needed); + ctx->total_spill_needed = MAX2(ctx->total_spill_needed, + total_spill_needed); + } + node->sched.instr = NULL; node->sched.pos = -1; return false; } -static gpir_node *schedule_create_move_node(gpir_node *node) +/* Try to place just the node given, updating the ready list. If "speculative" + * is true, then this is part of the pre-commit phase. If false, then we have + * committed to placing this node, so update liveness and ready list + * information. + */ + +static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node, + bool speculative) +{ + if (!_try_place_node(ctx, ctx->instr, node)) { + if (!speculative) + gpir_debug("failed to place %d\n", node->index); + return false; + } + + ctx->ready_list_slots -= gpir_get_slots_required(node); + + if (!speculative) { + gpir_debug("placed node %d\n", node->index); + + /* We assume here that writes are placed before reads. If this changes, + * then this needs to be updated. 
+ */ + if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + ctx->live_physregs &= + ~(1ull << (4 * store->index + store->component)); + if (store->child->sched.physreg_store == store) + store->child->sched.physreg_store = NULL; + } + + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + ctx->live_physregs |= + (1ull << (4 * load->index + load->component)); + } + + list_del(&node->list); + list_add(&node->list, &ctx->block->node_list); + gpir_node_foreach_pred(node, dep) { + gpir_node *pred = dep->pred; + schedule_insert_ready_list(ctx, pred); + } + } else { + gpir_node_foreach_pred(node, dep) { + gpir_node *pred = dep->pred; + if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT) + ctx->ready_list_slots += gpir_get_slots_required(pred); + } + } + + return true; +} + +static gpir_node *create_move(sched_ctx *ctx, gpir_node *node) { gpir_alu_node *move = gpir_node_create(node->block, gpir_op_mov); if (unlikely(!move)) @@ -370,187 +635,712 @@ static gpir_node *schedule_create_move_node(gpir_node *node) move->children[0] = node; move->num_child = 1; - move->node.sched.instr = -1; + move->node.sched.instr = NULL; move->node.sched.pos = -1; move->node.sched.dist = node->sched.dist; + move->node.sched.max_node = node->sched.max_node; + move->node.sched.next_max_node = node->sched.next_max_node; gpir_debug("create move %d for %d\n", move->node.index, node->index); + + ctx->ready_list_slots--; + list_del(&node->list); + node->sched.max_node = false; + node->sched.next_max_node = false; + node->sched.ready = false; + node->sched.inserted = false; + gpir_node_replace_succ(&move->node, node); + gpir_node_add_dep(&move->node, node, GPIR_DEP_INPUT); + schedule_insert_ready_list(ctx, &move->node); return &move->node; } -static gpir_node *gpir_sched_node(gpir_instr *instr, gpir_node *node) + +/* Once we schedule the successor, would the predecessor be fully ready? */ +static bool pred_almost_ready(gpir_dep *dep) { - if (node->op == gpir_op_mov) { - gpir_node *child = gpir_node_to_alu(node)->children[0]; - gpir_node_foreach_succ_safe(node, dep) { - gpir_node *succ = dep->succ; - if (succ->sched.instr < 0 || - instr->index < succ->sched.instr + gpir_get_min_dist(dep)) { - gpir_node_replace_pred(dep, child); - if (dep->type == GPIR_DEP_INPUT) - gpir_node_replace_child(succ, node, child); + bool fully_ready = true; + gpir_node_foreach_succ(dep->pred, other_dep) { + gpir_node *succ = other_dep->succ; + if (!succ->sched.instr && dep->succ != other_dep->succ) { + fully_ready = false; + break; + } + } + + return fully_ready; +} + +/* Recursively try to schedule a node and all its dependent nodes that can fit + * in the same instruction. There is a simple heuristic scoring system to try + * to group together nodes that load different components of the same input, + * to avoid bottlenecking for operations like matrix multiplies that are + * mostly input-bound. 
+ */ + +static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative) +{ + if (!schedule_try_place_node(ctx, node, speculative)) + return INT_MIN; + + int score = 0; + + gpir_node_foreach_pred(node, dep) { + if (!gpir_is_input_node(dep->pred)) + continue; + + int pred_score = INT_MIN; + if (pred_almost_ready(dep)) { + if (dep->pred->type == gpir_node_type_load || + node->type == gpir_node_type_store) { + pred_score = _schedule_try_node(ctx, dep->pred, speculative); + } + } + if (dep->pred->type == gpir_node_type_load || + node->type == gpir_node_type_store) { + if (pred_score == INT_MIN) { + if (node->op == gpir_op_mov) { + /* The only moves on the ready list are for loads that we + * couldn't schedule immediately, as created below. If we + * couldn't schedule the load, there's no point scheduling + * the move. The normal move threading logic will ensure + * that another move is created if we're about to go too far + * from the uses of this move. + */ + assert(speculative); + return INT_MIN; + } else if (!speculative && dep->pred->type == gpir_node_type_load) { + /* We couldn't schedule the load right away, so it will have + * to happen in some earlier instruction and then be moved + * into a value register and threaded to the use by "node". + * We create the move right away, so that later we'll fail + * to schedule it if there isn't a slot for a move + * available. + */ + create_move(ctx, dep->pred); + } + /* Penalize nodes whose dependent ops we couldn't schedule. + */ + score--; + } else { + score += pred_score; + continue; } } - MAYBE_UNUSED bool result = schedule_try_place_node(instr, node); - assert(result); - return node; } - else { - gpir_node *move = schedule_create_move_node(node); + + return score; +} + +/* If we speculatively tried a node, undo everything. + */ + +static void schedule_undo_node(sched_ctx *ctx, gpir_node *node) +{ + gpir_instr_remove_node(ctx->instr, node); + + gpir_node_foreach_pred(node, dep) { + gpir_node *pred = dep->pred; + if (pred->sched.instr) { + schedule_undo_node(ctx, pred); + } + } +} + +/* Try to schedule a node. We also try to schedule any predecessors that can + * be part of the same instruction. If "speculative" is true, then we don't + * actually change any state, only returning the score were the node to be + * scheduled, with INT_MIN meaning "cannot be scheduled at all". + */ +static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative) +{ + int prev_slots = ctx->ready_list_slots; + + int score = _schedule_try_node(ctx, node, speculative); + + if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) { + assert(speculative); + ctx->total_spill_needed = MAX2(ctx->total_spill_needed, + ctx->ready_list_slots - GPIR_VALUE_REG_NUM); + score = INT_MIN; + } + + if (speculative) { + ctx->ready_list_slots = prev_slots; + if (node->sched.instr) + schedule_undo_node(ctx, node); + } + + return score; +} + +/* This is called when we want to spill "node" by inserting loads at its uses. + * It returns all the possible registers we can use so that all the loads will + * successfully be inserted. Also return the first instruction we'll need to + * insert a load for. + */ + +static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node, + int *min_index) +{ + uint64_t available = ~0ull; + gpir_node_foreach_succ(node, dep) { + if (dep->type != GPIR_DEP_INPUT) + continue; + + gpir_node *use = dep->succ; + gpir_instr *instr = use->sched.instr; + + if (!instr) { + /* This use isn't scheduled, so no need to spill it. 
*/ + continue; + } + + if (use->type == gpir_node_type_store) { + /* We're trying to spill something that was recently stored... just + * bail out. + */ + return 0; + } + + if (use->op == gpir_op_mov && instr == ctx->instr) { + /* We try to spill the sources of this move, so we can free up space + * in the current instruction. + * + * TODO: should we go back further? It might let us schedule the + * write earlier in some cases, but then we might fail to spill. + */ + available &= get_available_regs(ctx, use, min_index); + } else { + if (instr->index < *min_index) + *min_index = instr->index; + + uint64_t use_available = 0; + + if (instr->reg0_use_count == 0) + use_available = ~0ull; + else if (!instr->reg0_is_attr) + use_available = 0xfull << (4 * instr->reg0_index); + + if (instr->reg1_use_count == 0) + use_available = ~0ull; + else + use_available |= 0xfull << (4 * instr->reg1_index); + + available &= use_available; + } + } + + return available; +} + +/* Using "min_index" returned by get_available_regs(), figure out which + * registers are killed by a write after or during the current instruction and + * hence we can't use for spilling. Writes that haven't been scheduled yet + * should be reflected in live_physregs. + */ + +static uint64_t get_killed_regs(sched_ctx *ctx, int min_index) +{ + uint64_t killed = 0; + + list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) { + if (instr->index <= min_index) + break; + + for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3; + slot++) { + if (!instr->slots[slot]) + continue; + + gpir_store_node *store = gpir_node_to_store(instr->slots[slot]); + if (store->node.op != gpir_op_store_reg) + continue; + + killed |= 1ull << (4 * store->index + store->component); + } + } + + return killed; +} + +/* Actually spill a node so that it is no longer in the ready list. Note that + * this must exactly follow the logic of get_available_regs() or else the + * loads could fail to schedule. + */ + +static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store) +{ + gpir_node_foreach_succ_safe(node, dep) { + if (dep->type != GPIR_DEP_INPUT) + continue; + + gpir_node *use = dep->succ; + gpir_instr *instr = use->sched.instr; + + if (!instr) + continue; + + if (use->op == gpir_op_mov && instr == ctx->instr) { + spill_node(ctx, use, store); + } else { + gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg); + load->index = store->index; + load->component = store->component; + list_add(&load->node.list, &ctx->block->node_list); + gpir_node_replace_child(dep->succ, dep->pred, &load->node); + gpir_node_replace_pred(dep, &load->node); + gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE); + gpir_debug("spilling use %d of node %d to load node %d\n", + use->index, node->index, load->node.index); + MAYBE_UNUSED bool result = _try_place_node(ctx, use->sched.instr, &load->node); + assert(result); + } + } + + if (node->op == gpir_op_mov) { + /* We replaced all the uses of the move, so it's dead now. */ + gpir_instr_remove_node(node->sched.instr, node); + gpir_node_delete(node); + } else { + /* We deleted all the uses of the node except the store, so it's not + * live anymore. 
+ */ list_del(&node->list); - node->sched.ready = false; node->sched.inserted = false; - gpir_node_replace_succ(move, node); - gpir_node_add_dep(move, node, GPIR_DEP_INPUT); - return move; + ctx->ready_list_slots--; + if (node->sched.max_node) { + node->sched.max_node = false; + ctx->instr->alu_num_slot_needed_by_max--; + } + if (node->sched.next_max_node) { + node->sched.next_max_node = false; + ctx->instr->alu_num_slot_needed_by_next_max--; + } } } -static bool gpir_is_input_node(gpir_node *node) +static bool used_by_store(gpir_node *node, gpir_instr *instr) { gpir_node_foreach_succ(node, dep) { - if (dep->type == GPIR_DEP_INPUT) + if (dep->type != GPIR_DEP_INPUT) + continue; + + if (dep->succ->type == gpir_node_type_store && + dep->succ->sched.instr == instr) return true; } + return false; } -static int gpir_get_min_scheduled_succ(gpir_node *node) + + +static bool try_spill_node(sched_ctx *ctx, gpir_node *node) +{ + assert(node->op != gpir_op_mov); + + if (used_by_store(node, ctx->instr)) + return false; + + gpir_debug("trying to spill %d\n", node->index); + + int min_instr = INT_MAX; + uint64_t available = get_available_regs(ctx, node, &min_instr); + available &= ~get_killed_regs(ctx, min_instr); + + if (node->sched.physreg_store) { + gpir_store_node *store = node->sched.physreg_store; + if (!(available & (1ull << (4 * store->index + store->component)))) + return false; + } else { + available &= ~ctx->live_physregs; + + if (available == 0) + return false; + + /* TODO: use a better heuristic for choosing an available register? */ + int physreg = ffsll(available) - 1; + + ctx->live_physregs |= (1ull << physreg); + + /* TODO: when we support multiple basic blocks, there may be register + * loads/stores to this register other than this one that haven't been + * scheduled yet so we may need to insert write-after-read dependencies. + */ + gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg); + store->index = physreg / 4; + store->component = physreg % 4; + store->child = node; + store->node.sched.max_node = false; + store->node.sched.next_max_node = false; + store->node.sched.pos = -1; + store->node.sched.instr = NULL; + store->node.sched.inserted = false; + store->node.sched.dist = node->sched.dist; + if (node->op == gpir_op_complex1) { + /* Complex1 cannot be directly stored, and has a latency of 2 */ + store->node.sched.dist += 2; + } + node->sched.physreg_store = store; + gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT); + node->sched.ready = false; + schedule_insert_ready_list(ctx, &store->node); + } + + gpir_debug("spilling %d to $%d.%c, store %d\n", node->index, + node->sched.physreg_store->index, + "xyzw"[node->sched.physreg_store->component], + node->sched.physreg_store->node.index); + + spill_node(ctx, node, node->sched.physreg_store); + + return true; +} + +static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node) +{ + /* First, try to spill max nodes. */ + list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) { + if (ctx->max_node_spill_needed <= 0) + break; + + /* orig_node is the node we're trying to schedule, so spilling it makes + * no sense. Also don't try to spill any nodes in front of it, since + * they might be scheduled instead. + */ + if (node == orig_node) + break; + + if (node->op == gpir_op_mov) { + /* Don't try to spill loads, since that only adds another load and + * store which is likely pointless. 
+          */
+         continue;
+      }
+
+      if (!gpir_is_input_node(node) || !node->sched.max_node)
+         continue;
+
+      if (try_spill_node(ctx, node)) {
+         ctx->max_node_spill_needed--;
+         ctx->total_spill_needed--;
+      }
+   }
+
+   /* Now, try to spill the remaining nodes. */
+   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
+      if (ctx->total_spill_needed <= 0)
+         break;
+
+      if (node == orig_node)
+         break;
+
+      if (node->op == gpir_op_mov)
+         continue;
+
+      if (!gpir_is_input_node(node) ||
+          !(node->sched.max_node || node->sched.next_max_node))
+         continue;
+
+      if (try_spill_node(ctx, node))
+         ctx->total_spill_needed--;
+   }
+
+   return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
+}
+
+static int gpir_get_curr_ready_list_slots(sched_ctx *ctx)
+{
+   int total = 0;
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      total += gpir_get_slots_required(node);
+   }
+
+   return total;
+}
+
+/* What gpir_get_min_end() would return if node were replaced with a move
+ * instruction not in the complex slot. Normally this is 2 + min_end, except
+ * for some store instructions which must have the move node in the same
+ * instruction.
+ */
+static int gpir_get_min_end_as_move(gpir_node *node)
 {
    int min = INT_MAX;
    gpir_node_foreach_succ(node, dep) {
       gpir_node *succ = dep->succ;
-      if (succ->sched.instr >= 0 && dep->type == GPIR_DEP_INPUT) {
-         if (min > succ->sched.instr)
-            min = succ->sched.instr;
+      if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
+         switch (succ->op) {
+         case gpir_op_store_temp:
+         case gpir_op_store_reg:
+         case gpir_op_store_varying:
+            continue;
+         default:
+            break;
+         }
+         if (min > succ->sched.instr->index + 2)
+            min = succ->sched.instr->index + 2;
       }
    }
    return min;
 }
 
-static gpir_node *gpir_sched_instr_pass(gpir_instr *instr,
-                                        struct list_head *ready_list)
+/* Initialize node->sched.max_node and node->sched.next_max_node for every
+ * input node on the ready list. We should only need to do this once per
+ * instruction, at the beginning, since we never add max nodes to the ready
+ * list.
+ */
+
+static void sched_find_max_nodes(sched_ctx *ctx)
 {
-   /* fully ready node reach its max dist with any of its successor */
-   list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-      if (node->sched.ready) {
-         int end = gpir_get_min_end(node);
-         assert(end >= instr->index);
-         if (instr->index < end)
-            continue;
+   ctx->instr->alu_num_slot_needed_by_next_max = -5;
+   ctx->instr->alu_num_slot_needed_by_max = 0;
 
-         gpir_debug("fully ready max node %d\n", node->index);
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      if (!gpir_is_input_node(node))
+         continue;
 
-         if (schedule_try_place_node(instr, node))
-            return node;
+      int min_end_move = gpir_get_min_end_as_move(node);
+      node->sched.max_node = (min_end_move == ctx->instr->index);
+      node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
 
-         return gpir_sched_node(instr, node);
-      }
+      if (node->sched.max_node)
+         ctx->instr->alu_num_slot_needed_by_max++;
+      if (node->sched.next_max_node)
+         ctx->instr->alu_num_slot_needed_by_next_max++;
    }
+}
 
-   /* partially ready node reach its max dist with any of its successor */
-   list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-      if (!node->sched.ready) {
-         int end = gpir_get_min_end(node);
-         assert(end >= instr->index);
-         if (instr->index < end)
-            continue;
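
To make the max/next-max distinction concrete: the scheduler works bottom-up, and an ALU result can only feed slots for two further instructions, which is where the "+ 2" in gpir_get_min_end_as_move() comes from. A worked example with assumed indices:

    #include <assert.h>

    /* Hypothetical numbers: the use of a value was placed in instruction
     * index 5, so min_end_as_move == 7.
     *
     *   - while filling instruction 6: 7 == 6 + 1, a next-max node
     *   - while filling instruction 7: 7 == 7, a max node; the value must
     *     be scheduled, or receive a move, in this very instruction
     */
    static void max_node_example(void)
    {
       int use_instr_index = 5;
       int min_end_as_move = use_instr_index + 2;
       assert(min_end_as_move == 7);
    }
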
+/* Verify the invariants described in gpir.h, as well as making sure the
+ * counts are correct.
+ */
+static void verify_max_nodes(sched_ctx *ctx)
+{
+   int alu_num_slot_needed_by_max = 0;
+   int alu_num_slot_needed_by_next_max = -5;
+   int alu_num_slot_needed_by_store = 0;
 
-         gpir_debug("partially ready max node %d\n", node->index);
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      if (!gpir_is_input_node(node))
+         continue;
 
-         return gpir_sched_node(instr, node);
-      }
+      if (node->sched.max_node)
+         alu_num_slot_needed_by_max++;
+      if (node->sched.next_max_node)
+         alu_num_slot_needed_by_next_max++;
+      if (used_by_store(node, ctx->instr))
+         alu_num_slot_needed_by_store++;
    }
 
-   /* schedule node used by previous instr when count > 5 */
-   int count = 0;
-   gpir_node *two_slot_node = NULL;
-   list_for_each_entry(gpir_node, node, ready_list, list) {
-      if (gpir_is_input_node(node)) {
-         int min = gpir_get_min_scheduled_succ(node);
-         assert(min >= instr->index - 1);
-         if (min == instr->index - 1) {
-            if (gpir_op_infos[node->op].may_consume_two_slots) {
-               two_slot_node = node;
-               count += 2;
+   assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
+   assert(ctx->instr->alu_num_slot_needed_by_next_max == alu_num_slot_needed_by_next_max);
+   assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
+   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store +
+          alu_num_slot_needed_by_max + alu_num_slot_needed_by_next_max);
+   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max);
+}
+
+static bool try_node(sched_ctx *ctx)
+{
+   gpir_node *best_node = NULL;
+   int best_score = INT_MIN;
+
+   /* Spilling will delete arbitrary nodes after the current one in the ready
+    * list, which means that we always need to look up the next node in the
+    * list at the end of each iteration. While list_for_each_entry() works for
+    * this purpose, its sanity checking assumes that you don't want to modify
+    * the list at all. We know better here, so we have to open-code
+    * list_for_each_entry() without the check in order to not assert.
+    */
+   for (gpir_node *node = LIST_ENTRY(gpir_node, ctx->ready_list.next, list);
+        &node->list != &ctx->ready_list;
+        node = LIST_ENTRY(gpir_node, node->list.next, list)) {
+      if (best_score != INT_MIN) {
+         if (node->sched.dist < best_node->sched.dist)
+            break;
+      }
+
+      if (node->sched.ready) {
+         ctx->total_spill_needed = 0;
+         ctx->max_node_spill_needed = 0;
+         int score = schedule_try_node(ctx, node, true);
+         if (score == INT_MIN) {
+            if (ctx->total_spill_needed > 0 &&
+                try_spill_nodes(ctx, node)) {
+               score = schedule_try_node(ctx, node, true);
+               if (score == INT_MIN)
+                  continue;
             }
-            else
-               count++;
+         }
+
+         if (score > best_score) {
+            best_score = score;
+            best_node = node;
+         }
       }
    }
 
-   if (count > 5) {
-      /* When no slot avaible, must schedule a move for two slot node
-       * to reduce the count. This results from the dummy_m/f method.
-       */
-      if (gpir_instr_alu_slot_is_full(instr)) {
-         assert(two_slot_node);
-         gpir_debug("instr is full, schedule move node for two slot node %d\n",
-                    two_slot_node->index);
-         return gpir_sched_node(instr, two_slot_node);
-      }
-
-      /* schedule fully ready node first */
-      list_for_each_entry(gpir_node, node, ready_list, list) {
-         if (gpir_is_input_node(node)) {
-            int min = gpir_get_min_scheduled_succ(node);
-            if (min == instr->index - 1 && node->sched.ready) {
-               gpir_debug(">5 ready node %d\n", node->index);
-
-               if (schedule_try_place_node(instr, node))
-                  return node;
-            }
-         }
-      }
+   if (best_node) {
+      gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
+                 best_score, best_node->sched.max_node ? " (max)" : "");
+      MAYBE_UNUSED int score = schedule_try_node(ctx, best_node, false);
+      assert(score != INT_MIN);
+      return true;
+   }
+
+   return false;
+}
+
+static void place_move(sched_ctx *ctx, gpir_node *node)
+{
+   gpir_node *move = create_move(ctx, node);
+   gpir_node_foreach_succ_safe(move, dep) {
+      gpir_node *succ = dep->succ;
+      if (!succ->sched.instr ||
+          ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
+         gpir_node_replace_pred(dep, node);
+         if (dep->type == GPIR_DEP_INPUT)
+            gpir_node_replace_child(succ, move, node);
+      }
+   }
+   MAYBE_UNUSED int score = schedule_try_node(ctx, move, false);
+   assert(score != INT_MIN);
+}
-
-      /* no fully ready node be scheduled, schedule partially ready node */
-      list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-         if (gpir_is_input_node(node)) {
-            int min = gpir_get_min_scheduled_succ(node);
-            if (min == instr->index - 1 && !node->sched.ready) {
-               gpir_debug(">5 partially ready node %d\n", node->index);
-
-               return gpir_sched_node(instr, node);
-            }
-         }
-      }
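
place_move() above rewires each use of the freshly created move: successors the original node can still reach keep reading it directly, and everything else is switched over to the move. The test, pulled out as a hypothetical predicate that is not part of the patch:

    /* True if `succ` (the use behind `dep`) may keep reading the original
     * node: it is either unscheduled, or the instruction currently being
     * filled is still within the dependency's minimum distance of it. */
    static bool succ_can_still_read_node(sched_ctx *ctx, gpir_dep *dep)
    {
       gpir_node *succ = dep->succ;
       return !succ->sched.instr ||
              ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep);
    }
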
+static bool sched_move(sched_ctx *ctx)
+{
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      if (node->sched.max_node) {
+         place_move(ctx, node);
+         return true;
+      }
+   }
+
+   if (ctx->instr->alu_num_slot_needed_by_store > 0) {
+      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (used_by_store(node, ctx->instr)) {
+            place_move(ctx, node);
+            /* If we have a store of a load, then we need to make sure that we
+             * immediately schedule the dependent load, or create a move
+             * instruction for it, like we would with a normal instruction.
+             * The rest of the code isn't set up to handle load nodes in the
+             * ready list -- see the comments in _schedule_try_node().
+             */
+            if (node->type == gpir_node_type_load) {
+               if (!schedule_try_place_node(ctx, node, false)) {
+                  create_move(ctx, node);
+               }
+            }
+            return true;
+         }
+      }
+   }
-
-      /* finally schedule move for fully ready node */
-      list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-         if (gpir_is_input_node(node)) {
-            int min = gpir_get_min_scheduled_succ(node);
-            if (min == instr->index - 1 && node->sched.ready) {
-               gpir_debug(">5 fully ready move node %d\n", node->index);
-
-               return gpir_sched_node(instr, node);
-            }
-         }
-      }
-   }
 
+   /* complex1 is a bit of a special case, since it has a latency of 2 cycles.
+    * Once it is fully ready, we need to group all its uses in the same
+    * instruction, and then we need to avoid creating any moves in the next
+    * cycle in order to get it scheduled. Failing to do any of these things
+    * could result in a cycle penalty, or even worse, an infinite loop of
+    * inserting moves. If it is a next-max node and ready, then it has a use
+    * in the previous cycle. If it has a use in the current cycle as well,
+    * then we want to insert a move node to make it ready in two cycles -- if
+    * we don't, then there will be at least a one cycle penalty. Otherwise, it
+    * will be ready next cycle, and we shouldn't insert a move node, or else
+    * we'll also have a one cycle penalty.
+    */
+   if (ctx->instr->alu_num_slot_free > 0) {
+      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
+             node->sched.ready) {
+            bool skip = true;
+            gpir_node_foreach_succ(node, dep) {
+               if (dep->type != GPIR_DEP_INPUT)
+                  continue;
+
+               gpir_node *succ = dep->succ;
+
+               if (!succ->sched.instr ||
+                   succ->sched.instr->index != ctx->instr->index - 1) {
+                  skip = false;
+                  break;
+               }
+            }
+
+            if (skip)
+               continue;
+
+            place_move(ctx, node);
+            return true;
+         }
+      }
+   }
-
-   /* schedule remain fully ready nodes */
-   list_for_each_entry(gpir_node, node, ready_list, list) {
-      if (node->sched.ready) {
-         gpir_debug("remain fully ready node %d\n", node->index);
-
-         if (schedule_try_place_node(instr, node))
-            return node;
-      }
-   }
 
+   /* Once we've made all the required moves, we're free to use any extra
+    * slots to schedule more moves for next max nodes. Besides sometimes being
+    * necessary, this can free up extra space in the next instruction. We walk
+    * from back to front so that we pick the nodes least likely to be
+    * scheduled next, where an extra move would be unnecessary. But make sure
+    * not to re-handle the complex1 case dealt with above.
+    */
+   if (ctx->instr->alu_num_slot_free > 0) {
+      list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
+         if (node->sched.next_max_node &&
+             !(node->op == gpir_op_complex1 && node->sched.ready)) {
+            place_move(ctx, node);
+            return true;
+         }
+      }
+   }
+
+   /* We may have skipped complex1 above, but if we run out of space, we still
+    * need to insert the move.
+    */
+   if (ctx->instr->alu_num_slot_needed_by_next_max > 0) {
+      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (node->sched.next_max_node) {
+            place_move(ctx, node);
+            return true;
+         }
+      }
+   }
-
-   return NULL;
+
+   return false;
 }
 
-static void schedule_print_pre_one_instr(gpir_instr *instr,
-                                         struct list_head *ready_list)
+static bool gpir_sched_instr_pass(sched_ctx *ctx)
+{
+   if (try_node(ctx))
+      return true;
+
+   if (sched_move(ctx))
+      return true;
+
+   return false;
+}
+
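
sched_move() above treats complex1 specially because of its two-cycle latency. Restated as a hypothetical helper (assuming the same semantics as the open-coded `skip` loop in the patch):

    /* True when every scheduled use of this ready complex1 next-max node
     * sits in the immediately previous instruction: the node itself can be
     * scheduled next cycle, so inserting a move now would only add a cycle
     * of penalty (or, at worst, loop forever inserting moves). */
    static bool complex1_move_unneeded(sched_ctx *ctx, gpir_node *node)
    {
       gpir_node_foreach_succ(node, dep) {
          if (dep->type != GPIR_DEP_INPUT)
             continue;

          gpir_node *succ = dep->succ;
          if (!succ->sched.instr ||
              succ->sched.instr->index != ctx->instr->index - 1)
             return false;   /* a use elsewhere: a move is still wanted */
       }
       return true;           /* all uses were last cycle: skip the move */
    }
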
"next" : "none")); + } + printf("\nlive physregs: "); + for (unsigned i = 0; i < 16; i++) { + if (ctx->live_physregs & (0xfull << (4 * i))) { + printf("$%d.", i); + for (unsigned j = 0; j < 4; j++) { + if (ctx->live_physregs & (1ull << (4 * i + j))) + printf("%c", "xyzw"[j]); + } + printf(" "); + } } printf("\n"); } @@ -569,30 +1359,23 @@ static void schedule_print_post_one_instr(gpir_instr *instr) } -static bool schedule_one_instr(gpir_block *block, struct list_head *ready_list) +static bool schedule_one_instr(sched_ctx *ctx) { - gpir_instr *instr = gpir_instr_create(block); + gpir_instr *instr = gpir_instr_create(ctx->block); if (unlikely(!instr)) return false; - schedule_print_pre_one_instr(instr, ready_list); - - while (true) { - gpir_node *node = gpir_sched_instr_pass(instr, ready_list); - if (!node) - break; + ctx->instr = instr; - if (node->sched.instr < 0) - schedule_insert_ready_list(ready_list, node); - else { - list_del(&node->list); - list_add(&node->list, &block->node_list); + sched_find_max_nodes(ctx); + schedule_print_pre_one_instr(ctx); - gpir_node_foreach_pred(node, dep) { - gpir_node *pred = dep->pred; - schedule_insert_ready_list(ready_list, pred); - } - } + while (gpir_sched_instr_pass(ctx)) { + assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx)); +#ifndef NDEBUG + verify_max_nodes(ctx); + verify_ready_list(ctx); +#endif } schedule_print_post_one_instr(instr); @@ -607,45 +1390,33 @@ static bool schedule_block(gpir_block *block) schedule_update_distance(node); } - struct list_head ready_list; - list_inithead(&ready_list); + sched_ctx ctx; + list_inithead(&ctx.ready_list); + ctx.block = block; + ctx.ready_list_slots = 0; + /* TODO initialize with block live out once we have proper liveness + * tracking + */ + ctx.live_physregs = 0; /* construct the ready list from root nodes */ list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { if (gpir_node_is_root(node)) - schedule_insert_ready_list(&ready_list, node); + schedule_insert_ready_list(&ctx, node); } list_inithead(&block->node_list); - while (!list_empty(&ready_list)) { - if (!schedule_one_instr(block, &ready_list)) + while (!list_empty(&ctx.ready_list)) { + if (!schedule_one_instr(&ctx)) return false; } return true; } -static void schedule_build_vreg_dependency(gpir_block *block) +static void schedule_build_dependency(gpir_block *block) { - gpir_node *regs[GPIR_VALUE_REG_NUM] = {0}; - list_for_each_entry(gpir_node, node, &block->node_list, list) { - /* store node has no value reg assigned */ - if (node->value_reg < 0) - continue; - - gpir_node *reg = regs[node->value_reg]; - if (reg) { - gpir_node_foreach_succ(reg, dep) { - /* write after read dep should only apply to real 'read' */ - if (dep->type != GPIR_DEP_INPUT) - continue; - - gpir_node *succ = dep->succ; - gpir_node_add_dep(node, succ, GPIR_DEP_VREG_WRITE_AFTER_READ); - } - } - regs[node->value_reg] = node; - } + gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0}; /* merge dummy_f/m to the node created from */ list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { @@ -666,76 +1437,49 @@ static void schedule_build_vreg_dependency(gpir_block *block) gpir_node_delete(node); } } -} -static void schedule_build_preg_dependency(gpir_compiler *comp) -{ - /* merge reg with the same index */ - gpir_reg *regs[GPIR_VALUE_REG_NUM] = {0}; - list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) { - if (!regs[reg->index]) - regs[reg->index] = reg; - else { - list_splicetail(®->defs_list, 
-         list_splicetail(&reg->uses_list, &regs[reg->index]->uses_list);
-      }
-   }
-
+   /* Forward dependencies. We only need to add these for register loads,
+    * since value registers already have an input dependency.
+    */
+   list_for_each_entry(gpir_node, node, &block->node_list, list) {
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         unsigned index = 4 * load->index + load->component;
+         if (last_written[index]) {
+            gpir_node_add_dep(node, last_written[index],
+                              GPIR_DEP_READ_AFTER_WRITE);
+         }
+      }
+
+      if (node->value_reg >= 0)
+         last_written[node->value_reg] = node;
+   }
+
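Both walks use last_written[] the same way: it maps each value-register or physical-register slot to its most recent writer, so the forward walk above can draw read-after-write edges, while the reverse walk below draws the mirror-image write-after-read edges. A worked example with hypothetical nodes n1..n3 (not from the patch) that all touch $0.x, in program order n1 writes it, n2 reads it, n3 overwrites it:

    gpir_node_add_dep(n2, n1, GPIR_DEP_READ_AFTER_WRITE);  /* forward walk */
    gpir_node_add_dep(n3, n2, GPIR_DEP_WRITE_AFTER_READ);  /* reverse walk */

The first edge keeps the read after the write it depends on; the second keeps the next write from being scheduled past the read.
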
-   /* calculate physical reg read/write dependency for load/store nodes */
-   for (int i = 0; i < GPIR_VALUE_REG_NUM; i++) {
-      gpir_reg *reg = regs[i];
-      if (!reg)
-         continue;
+   memset(last_written, 0, sizeof(last_written));
 
-      /* sort reg write */
-      struct list_head tmp_list;
-      list_replace(&reg->defs_list, &tmp_list);
-      list_inithead(&reg->defs_list);
-      list_for_each_entry_safe(gpir_store_node, store, &tmp_list, reg_link) {
-         struct list_head *insert_pos = &reg->defs_list;
-         list_for_each_entry(gpir_store_node, st, &reg->defs_list, reg_link) {
-            if (st->node.sched.index > store->node.sched.index) {
-               insert_pos = &st->reg_link;
-               break;
-            }
-         }
-         list_del(&store->reg_link);
-         list_addtail(&store->reg_link, insert_pos);
-      }
-
-      /* sort reg read */
-      list_replace(&reg->uses_list, &tmp_list);
-      list_inithead(&reg->uses_list);
-      list_for_each_entry_safe(gpir_load_node, load, &tmp_list, reg_link) {
-         struct list_head *insert_pos = &reg->uses_list;
-         list_for_each_entry(gpir_load_node, ld, &reg->uses_list, reg_link) {
-            if (ld->node.sched.index > load->node.sched.index) {
-               insert_pos = &ld->reg_link;
-               break;
-            }
-         }
-         list_del(&load->reg_link);
-         list_addtail(&load->reg_link, insert_pos);
-      }
+   /* False dependencies. For value registers, these exist only to make sure
+    * that the maximum pressure isn't exceeded and are hence "fake".
+    */
+   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         unsigned index = 4 * load->index + load->component;
+         if (last_written[index]) {
+            gpir_node_add_dep(last_written[index], node,
+                              GPIR_DEP_WRITE_AFTER_READ);
+         }
+      } else {
+         gpir_node_foreach_pred(node, dep) {
+            if (dep->type == GPIR_DEP_INPUT) {
+               int index = dep->pred->value_reg;
+               if (index >= 0 && last_written[index]) {
+                  gpir_node_add_dep(last_written[index], node,
+                                    GPIR_DEP_WRITE_AFTER_READ);
+               }
+            }
+         }
+      }
-
-      /* insert dependency */
-      gpir_store_node *store =
-         list_first_entry(&reg->defs_list, gpir_store_node, reg_link);
-      gpir_store_node *next = store->reg_link.next != &reg->defs_list ?
-         list_first_entry(&store->reg_link, gpir_store_node, reg_link) : NULL;
-
-      list_for_each_entry(gpir_load_node, load, &reg->uses_list, reg_link) {
-         /* loop until load is between store and next */
-         while (next && next->node.sched.index < load->node.sched.index) {
-            store = next;
-            next = store->reg_link.next != &reg->defs_list ?
-               list_first_entry(&store->reg_link, gpir_store_node, reg_link) : NULL;
-         }
-
-         gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
-         if (next)
-            gpir_node_add_dep(&next->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
-      }
-   }
+
+      if (node->value_reg >= 0)
+         last_written[node->value_reg] = node;
+   }
 }
@@ -792,20 +1536,25 @@ bool gpir_schedule_prog(gpir_compiler *comp)
    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
       block->sched.instr_index = 0;
       list_for_each_entry(gpir_node, node, &block->node_list, list) {
-         node->sched.instr = -1;
+         node->sched.instr = NULL;
          node->sched.pos = -1;
         node->sched.index = index++;
         node->sched.dist = -1;
+         /* TODO: when we support multiple basic blocks, we need a way to
+          * keep track of this for physregs allocated before the scheduler.
+          */
+         node->sched.physreg_store = NULL;
         node->sched.ready = false;
         node->sched.inserted = false;
+         node->sched.max_node = false;
+         node->sched.next_max_node = false;
       }
    }
 
-   /* build fake/virtual dependency */
+   /* build dependency */
    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
-      schedule_build_vreg_dependency(block);
+      schedule_build_dependency(block);
    }
-   schedule_build_preg_dependency(comp);
 
 //gpir_debug("after scheduler build reg dependency\n");
 //gpir_node_print_prog_dep(comp);
diff --git a/src/gallium/drivers/lima/meson.build b/src/gallium/drivers/lima/meson.build
index 1db8c7ea8f9..0f13b9c7272 100644
--- a/src/gallium/drivers/lima/meson.build
+++ b/src/gallium/drivers/lima/meson.build
@@ -29,8 +29,7 @@ files_lima = files(
   'ir/gp/codegen.h',
   'ir/gp/codegen.c',
   'ir/gp/reduce_scheduler.c',
-  'ir/gp/value_regalloc.c',
-  'ir/gp/physical_regalloc.c',
+  'ir/gp/regalloc.c',
   'ir/gp/disasm.c',
   'ir/pp/ppir.h',