Diffstat (limited to 'src/gallium/drivers/lima')
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/codegen.c | 8
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/gpir.h | 50
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/instr.c | 80
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/nir.c | 5
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/node.c | 2
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/physical_regalloc.c | 135
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/regalloc.c (renamed from src/gallium/drivers/lima/ir/gp/value_regalloc.c) | 85
-rw-r--r-- | src/gallium/drivers/lima/ir/gp/scheduler.c | 1379
-rw-r--r-- | src/gallium/drivers/lima/meson.build | 3
9 files changed, 1187 insertions, 560 deletions
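The core of this change is the pair of slot-reservation invariants documented in the new gpir.h comment and enforced in instr.c below. As a reading aid, here is a minimal standalone C sketch of that accounting; the struct and MAX2 macro are restated here purely for illustration, and only the field names and the two invariants come from the patch itself:

#include <stdbool.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* Restatement of the new gpir_instr counters (illustrative only). */
struct slot_state {
   int alu_num_slot_free;              /* starts at 6 per instruction */
   int alu_non_cplx_slot_free;         /* starts at 5 (complex slot excluded) */
   int alu_num_slot_needed_by_store;
   int alu_num_slot_needed_by_max;
   int alu_num_slot_needed_by_next_max;
};

/* Returns true if consuming `consume` ALU slots (`non_cplx_consume` of them
 * outside the complex slot) still leaves room for the moves that stores,
 * max nodes, and next-max nodes may later require. */
static bool invariants_hold(const struct slot_state *s,
                            int consume, int non_cplx_consume)
{
   /* Invariant (1): the free slots must cover all outstanding reservations. */
   int needed = s->alu_num_slot_needed_by_store +
                s->alu_num_slot_needed_by_max +
                MAX2(s->alu_num_slot_needed_by_next_max, 0);
   if (s->alu_num_slot_free - consume < needed)
      return false;

   /* Invariant (2): a max node's move must reach a use two cycles away,
    * which the complex slot's one-deep FIFO cannot do, so max nodes are
    * only counted against the non-complex slots. */
   return s->alu_non_cplx_slot_free - non_cplx_consume >=
          s->alu_num_slot_needed_by_max;
}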
diff --git a/src/gallium/drivers/lima/ir/gp/codegen.c b/src/gallium/drivers/lima/ir/gp/codegen.c index 1b7c8903c97..9bc279e0119 100644 --- a/src/gallium/drivers/lima/ir/gp/codegen.c +++ b/src/gallium/drivers/lima/ir/gp/codegen.c @@ -76,9 +76,13 @@ static gpir_codegen_src gpir_get_alu_input(gpir_node *parent, gpir_node *child) gpir_codegen_src_load_w, gpir_codegen_src_unused, gpir_codegen_src_unused }, }; - assert(child->sched.instr - parent->sched.instr < 3); + int diff = child->sched.instr->index - parent->sched.instr->index; + assert(diff < 3); + assert(diff >= 0); - return slot_to_src[child->sched.pos][child->sched.instr - parent->sched.instr]; + int src = slot_to_src[child->sched.pos][diff]; + assert(src != gpir_codegen_src_unused); + return src; } static void gpir_codegen_mul0_slot(gpir_codegen_instr *code, gpir_instr *instr) diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h index e7f199033cd..6a1d533977b 100644 --- a/src/gallium/drivers/lima/ir/gp/gpir.h +++ b/src/gallium/drivers/lima/ir/gp/gpir.h @@ -131,8 +131,6 @@ typedef struct { GPIR_DEP_OFFSET, /* def is the offset of use (i.e. temp store) */ GPIR_DEP_READ_AFTER_WRITE, GPIR_DEP_WRITE_AFTER_READ, - GPIR_DEP_VREG_READ_AFTER_WRITE, - GPIR_DEP_VREG_WRITE_AFTER_READ, } type; /* node execute before succ */ @@ -146,6 +144,9 @@ typedef struct { struct list_head succ_link; } gpir_dep; +struct gpir_instr; +struct gpir_store_node; + typedef struct gpir_node { struct list_head list; gpir_op op; @@ -165,12 +166,14 @@ typedef struct gpir_node { int value_reg; union { struct { - int instr; + struct gpir_instr *instr; + struct gpir_store_node *physreg_store; int pos; int dist; int index; bool ready; bool inserted; + bool max_node, next_max_node; } sched; struct { int parent_index; @@ -223,7 +226,7 @@ typedef struct { struct list_head reg_link; } gpir_load_node; -typedef struct { +typedef struct gpir_store_node { gpir_node node; unsigned index; @@ -266,14 +269,43 @@ enum gpir_instr_slot { GPIR_INSTR_SLOT_DIST_TWO_END = GPIR_INSTR_SLOT_PASS, }; -typedef struct { +typedef struct gpir_instr { int index; struct list_head list; gpir_node *slots[GPIR_INSTR_SLOT_NUM]; + /* The number of ALU slots free for moves. */ int alu_num_slot_free; + + /* The number of ALU slots free for moves, except for the complex slot. */ + int alu_non_cplx_slot_free; + + /* We need to make sure that we can insert moves in the following cases: + * (1) There was a use of a value two cycles ago. + * (2) There were more than 5 uses of a value 1 cycle ago (or else we can't + * possibly satisfy (1) for the next cycle). + * (3) There is a store instruction scheduled, but not its child. + * + * The complex slot cannot be used for a move in case (1), since it only + * has a FIFO depth of 1, but it can be used for (2) and (3). In order to + * ensure that we have enough space for all three, we maintain the + * following invariants: + * + * (1) alu_num_slot_free >= alu_num_slot_needed_by_store + + * alu_num_slot_needed_by_max + + * alu_num_slot_needed_by_next_max + * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + */ int alu_num_slot_needed_by_store; + int alu_num_slot_needed_by_max; + int alu_num_slot_needed_by_next_max; + + /* Used to communicate to the scheduler how many slots need to be cleared + * up in order to satisfy the invariants. 
+ */ + int slot_difference; + int non_cplx_slot_difference; int reg0_use_count; bool reg0_is_attr; @@ -387,18 +419,12 @@ bool gpir_instr_try_insert_node(gpir_instr *instr, gpir_node *node); void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node); void gpir_instr_print_prog(gpir_compiler *comp); -static inline bool gpir_instr_alu_slot_is_full(gpir_instr *instr) -{ - return instr->alu_num_slot_free <= instr->alu_num_slot_needed_by_store; -} - bool gpir_codegen_acc_same_op(gpir_op op1, gpir_op op2); bool gpir_pre_rsched_lower_prog(gpir_compiler *comp); bool gpir_post_rsched_lower_prog(gpir_compiler *comp); bool gpir_reduce_reg_pressure_schedule_prog(gpir_compiler *comp); -bool gpir_value_regalloc_prog(gpir_compiler *comp); -bool gpir_physical_regalloc_prog(gpir_compiler *comp); +bool gpir_regalloc_prog(gpir_compiler *comp); bool gpir_schedule_prog(gpir_compiler *comp); bool gpir_codegen_prog(gpir_compiler *comp); diff --git a/src/gallium/drivers/lima/ir/gp/instr.c b/src/gallium/drivers/lima/ir/gp/instr.c index d3190d39d4c..228d1459280 100644 --- a/src/gallium/drivers/lima/ir/gp/instr.c +++ b/src/gallium/drivers/lima/ir/gp/instr.c @@ -36,6 +36,7 @@ gpir_instr *gpir_instr_create(gpir_block *block) instr->index = block->sched.instr_index++; instr->alu_num_slot_free = 6; + instr->alu_non_cplx_slot_free = 5; list_add(&instr->list, &block->instr_list); return instr; @@ -85,6 +86,11 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node) return false; int consume_slot = gpir_instr_get_consume_slot(instr, node); + int non_cplx_consume_slot = + node->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 0 : consume_slot; + int store_reduce_slot = 0; + int max_reduce_slot = node->sched.max_node ? 1 : 0; + int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0; /* check if this node is child of one store node. 
* complex1 won't be any of this instr's store node's child, * @@ -93,25 +99,40 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node) for (int i = GPIR_INSTR_SLOT_STORE0; i <= GPIR_INSTR_SLOT_STORE3; i++) { gpir_store_node *s = gpir_node_to_store(instr->slots[i]); if (s && s->child == node) { - /* acc node may consume 2 slots, so even it's the child of a - * store node, it may not be inserted successfully, in which - * case we need a move node for it */ - if (instr->alu_num_slot_free - consume_slot < - instr->alu_num_slot_needed_by_store - 1) - return false; - - instr->alu_num_slot_needed_by_store--; - instr->alu_num_slot_free -= consume_slot; - return true; + store_reduce_slot = 1; + break; } } - /* not a child of any store node, so must reserve alu slot for store node */ - if (instr->alu_num_slot_free - consume_slot < - instr->alu_num_slot_needed_by_store) + /* Check that the invariants will be maintained after we adjust everything + */ + + int slot_difference = + instr->alu_num_slot_needed_by_store - store_reduce_slot + + instr->alu_num_slot_needed_by_max - max_reduce_slot + + MAX2(instr->alu_num_slot_needed_by_next_max - next_max_reduce_slot, 0) - + (instr->alu_num_slot_free - consume_slot); + if (slot_difference > 0) { + gpir_debug("failed %d because of alu slot\n", node->index); + instr->slot_difference = slot_difference; + } + + int non_cplx_slot_difference = + instr->alu_num_slot_needed_by_max - max_reduce_slot - + (instr->alu_non_cplx_slot_free - non_cplx_consume_slot); + if (non_cplx_slot_difference > 0) { + gpir_debug("failed %d because of alu non-complex slot\n", node->index); + instr->non_cplx_slot_difference = non_cplx_slot_difference; + } + + if (slot_difference > 0 || non_cplx_slot_difference > 0) return false; instr->alu_num_slot_free -= consume_slot; + instr->alu_non_cplx_slot_free -= non_cplx_consume_slot; + instr->alu_num_slot_needed_by_store -= store_reduce_slot; + instr->alu_num_slot_needed_by_max -= max_reduce_slot; + instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot; return true; } @@ -123,12 +144,17 @@ static void gpir_instr_remove_alu(gpir_instr *instr, gpir_node *node) gpir_store_node *s = gpir_node_to_store(instr->slots[i]); if (s && s->child == node) { instr->alu_num_slot_needed_by_store++; - instr->alu_num_slot_free += consume_slot; - return; + break; } } instr->alu_num_slot_free += consume_slot; + if (node->sched.pos != GPIR_INSTR_SLOT_COMPLEX) + instr->alu_non_cplx_slot_free += consume_slot; + if (node->sched.max_node) + instr->alu_num_slot_needed_by_max++; + if (node->sched.next_max_node) + instr->alu_num_slot_needed_by_next_max++; } static bool gpir_instr_insert_reg0_check(gpir_instr *instr, gpir_node *node) @@ -269,12 +295,18 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node) goto out; } - /* no store node has the same child as this node, and child is not - * already in this instr's alu slot, so instr must have some free - * alu slot to insert this node's child + /* Check the invariants documented in gpir.h, similar to the ALU case. + * Since the only thing that changes is alu_num_slot_needed_by_store, we + * can get away with just checking the first one. 
*/ - if (gpir_instr_alu_slot_is_full(instr)) + int slot_difference = instr->alu_num_slot_needed_by_store + 1 + + instr->alu_num_slot_needed_by_max + + MAX2(instr->alu_num_slot_needed_by_next_max, 0) - + instr->alu_num_slot_free; + if (slot_difference > 0) { + instr->slot_difference = slot_difference; return false; + } instr->alu_num_slot_needed_by_store++; @@ -299,6 +331,9 @@ static void gpir_instr_remove_store(gpir_instr *instr, gpir_node *node) int other_slot = GPIR_INSTR_SLOT_STORE0 + (component ^ 1); for (int j = GPIR_INSTR_SLOT_STORE0; j <= GPIR_INSTR_SLOT_STORE3; j++) { + if (j == node->sched.pos) + continue; + gpir_store_node *s = gpir_node_to_store(instr->slots[j]); if (s && s->child == store->child) goto out; @@ -369,6 +404,9 @@ static bool gpir_instr_slot_free(gpir_instr *instr, gpir_node *node) bool gpir_instr_try_insert_node(gpir_instr *instr, gpir_node *node) { + instr->slot_difference = 0; + instr->non_cplx_slot_difference = 0; + if (!gpir_instr_slot_free(instr, node)) return false; @@ -413,7 +451,7 @@ void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node) /* This can happen if we merge duplicate loads in the scheduler. */ if (instr->slots[node->sched.pos] != node) { node->sched.pos = -1; - node->sched.instr = -1; + node->sched.instr = NULL; return; } @@ -439,7 +477,7 @@ void gpir_instr_remove_node(gpir_instr *instr, gpir_node *node) instr->slots[GPIR_INSTR_SLOT_MUL1] = NULL; node->sched.pos = -1; - node->sched.instr = -1; + node->sched.instr = NULL; } void gpir_instr_print_prog(gpir_compiler *comp) diff --git a/src/gallium/drivers/lima/ir/gp/nir.c b/src/gallium/drivers/lima/ir/gp/nir.c index 902d27a3149..d1da7ed3754 100644 --- a/src/gallium/drivers/lima/ir/gp/nir.c +++ b/src/gallium/drivers/lima/ir/gp/nir.c @@ -422,10 +422,7 @@ bool gpir_compile_nir(struct lima_vs_shader_state *prog, struct nir_shader *nir) if (!gpir_post_rsched_lower_prog(comp)) goto err_out0; - if (!gpir_value_regalloc_prog(comp)) - goto err_out0; - - if (!gpir_physical_regalloc_prog(comp)) + if (!gpir_regalloc_prog(comp)) goto err_out0; if (!gpir_schedule_prog(comp)) diff --git a/src/gallium/drivers/lima/ir/gp/node.c b/src/gallium/drivers/lima/ir/gp/node.c index eda6ae7ed6f..decda5f1246 100644 --- a/src/gallium/drivers/lima/ir/gp/node.c +++ b/src/gallium/drivers/lima/ir/gp/node.c @@ -436,8 +436,6 @@ static void gpir_node_print_node(gpir_node *node, int type, int space) [GPIR_DEP_OFFSET] = "offset", [GPIR_DEP_READ_AFTER_WRITE] = "RaW", [GPIR_DEP_WRITE_AFTER_READ] = "WaR", - [GPIR_DEP_VREG_READ_AFTER_WRITE] = "vRaW", - [GPIR_DEP_VREG_WRITE_AFTER_READ] = "vWaR", }; for (int i = 0; i < space; i++) diff --git a/src/gallium/drivers/lima/ir/gp/physical_regalloc.c b/src/gallium/drivers/lima/ir/gp/physical_regalloc.c deleted file mode 100644 index 87d88a8f9b7..00000000000 --- a/src/gallium/drivers/lima/ir/gp/physical_regalloc.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2017 Lima Project - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sub license, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * 
of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - */ - -#include <limits.h> - -#include "gpir.h" - -/* Linear scan register alloc for physical reg alloc of each - * load/store node - */ - -static void regalloc_print_result(gpir_compiler *comp) -{ - if (!(lima_debug & LIMA_DEBUG_GP)) - return; - - int index = 0; - printf("======== physical regalloc ========\n"); - list_for_each_entry(gpir_block, block, &comp->block_list, list) { - list_for_each_entry(gpir_node, node, &block->node_list, list) { - if (node->op == gpir_op_load_reg) { - gpir_load_node *load = gpir_node_to_load(node); - printf("%03d: load %d use reg %d\n", index, node->index, load->reg->index); - } - else if (node->op == gpir_op_store_reg) { - gpir_store_node *store = gpir_node_to_store(node); - printf("%03d: store %d use reg %d\n", index, node->index, store->reg->index); - } - index++; - } - printf("----------------------------\n"); - } -} - -bool gpir_physical_regalloc_prog(gpir_compiler *comp) -{ - int index = 0; - list_for_each_entry(gpir_block, block, &comp->block_list, list) { - list_for_each_entry(gpir_node, node, &block->node_list, list) { - node->preg.index = index++; - } - } - - /* calculate each reg liveness interval */ - list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) { - reg->start = INT_MAX; - list_for_each_entry(gpir_store_node, store, ®->defs_list, reg_link) { - if (store->node.preg.index < reg->start) - reg->start = store->node.preg.index; - } - - reg->end = 0; - list_for_each_entry(gpir_load_node, load, ®->uses_list, reg_link) { - if (load->node.preg.index > reg->end) - reg->end = load->node.preg.index; - } - } - - /* sort reg list by start value */ - struct list_head reg_list; - list_replace(&comp->reg_list, ®_list); - list_inithead(&comp->reg_list); - list_for_each_entry_safe(gpir_reg, reg, ®_list, list) { - struct list_head *insert_pos = &comp->reg_list; - list_for_each_entry(gpir_reg, creg, &comp->reg_list, list) { - if (creg->start > reg->start) { - insert_pos = &creg->list; - break; - } - } - list_del(®->list); - list_addtail(®->list, insert_pos); - } - - /* do linear scan reg alloc */ - gpir_reg *active[GPIR_PHYSICAL_REG_NUM] = {0}; - list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) { - int i; - - /* if some reg is expired */ - for (i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) { - if (active[i] && active[i]->end <= reg->start) - active[i] = NULL; - } - - /* find a free reg value for this reg */ - for (i = 0; i < GPIR_PHYSICAL_REG_NUM; i++) { - if (!active[i]) { - active[i] = reg; - reg->index = i; - break; - } - } - - /* TODO: support spill to temp memory */ - assert(i < GPIR_PHYSICAL_REG_NUM); - } - - /* update load/store node info for the real reg */ - list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) { - list_for_each_entry(gpir_store_node, store, ®->defs_list, reg_link) { - store->index = reg->index >> 2; - store->component = reg->index % 4; - } - - list_for_each_entry(gpir_load_node, load, ®->uses_list, reg_link) { - load->index = reg->index >> 2; - load->index = reg->index % 4; - } - } - - regalloc_print_result(comp); - 
return true; -} diff --git a/src/gallium/drivers/lima/ir/gp/value_regalloc.c b/src/gallium/drivers/lima/ir/gp/regalloc.c index f633b949932..c145bfbee81 100644 --- a/src/gallium/drivers/lima/ir/gp/value_regalloc.c +++ b/src/gallium/drivers/lima/ir/gp/regalloc.c @@ -24,60 +24,17 @@ #include "gpir.h" -/* Linear scan register alloc for value reg alloc of each node */ - -static int regalloc_spill_active_node(gpir_node *active[]) -{ - gpir_node *spill = NULL; - for (int i = 0; i < GPIR_VALUE_REG_NUM; i++) { - if (gpir_op_infos[active[i]->op].spillless) - continue; - - /* spill farest node */ - if (!spill || - spill->vreg.last->vreg.index < active[i]->vreg.last->vreg.index) { - spill = active[i]; - } - } - - assert(spill); - gpir_debug("value regalloc spill node %d for value reg %d\n", - spill->index, spill->value_reg); - - /* create store node for spilled node */ - gpir_store_node *store = gpir_node_create(spill->block, gpir_op_store_reg); - store->child = spill; - /* no need to calculate other vreg values because store & spill won't - * be used in the following schedule again */ - store->node.value_reg = spill->value_reg; - list_addtail(&store->node.list, &spill->list); - - gpir_reg *reg = gpir_create_reg(spill->block->comp); - store->reg = reg; - list_addtail(&store->reg_link, ®->defs_list); - - gpir_node_foreach_succ_safe(spill, dep) { - gpir_node *succ = dep->succ; - gpir_load_node *load = gpir_node_create(succ->block, gpir_op_load_reg); - gpir_node_replace_pred(dep, &load->node); - gpir_node_replace_child(succ, spill, &load->node); - list_addtail(&load->node.list, &succ->list); - - /* only valid for succ already scheduled, succ not scheduled will - * re-write this value */ - load->node.value_reg = spill->value_reg; - load->node.vreg.index = - (list_first_entry(&load->node.list, gpir_node, list)->vreg.index + - list_last_entry(&load->node.list, gpir_node, list)->vreg.index) / 2.0f; - load->node.vreg.last = succ; - - load->reg = reg; - list_addtail(&load->reg_link, ®->uses_list); - } - - gpir_node_add_dep(&store->node, spill, GPIR_DEP_INPUT); - return spill->value_reg; -} +/* Register allocation + * + * TODO: This needs to be rewritten when we support multiple basic blocks. We + * need to do proper liveness analysis, combined with either linear scan, + * graph coloring, or SSA-based allocation. We should also support spilling to + * temporaries. + * + * For now, this only assigns fake registers to values, used to build the fake + * dependencies that the scheduler relies on. In the future we should also be + * assigning actual physreg numbers to load_reg/store_reg nodes. 
+ */ static void regalloc_block(gpir_block *block) { @@ -99,7 +56,7 @@ static void regalloc_block(gpir_block *block) /* do linear scan regalloc */ int reg_search_start = 0; - gpir_node *active[GPIR_VALUE_REG_NUM] = {0}; + gpir_node *active[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0}; list_for_each_entry(gpir_node, node, &block->node_list, list) { /* if some reg is expired */ gpir_node_foreach_pred(node, dep) { @@ -116,9 +73,9 @@ static void regalloc_block(gpir_block *block) /* find a free reg for this node */ int i; - for (i = 0; i < GPIR_VALUE_REG_NUM; i++) { + for (i = 0; i < GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; i++) { /* round robin reg select to reduce false dep when schedule */ - int reg = (reg_search_start + i) % GPIR_VALUE_REG_NUM; + int reg = (reg_search_start + i) % (GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM); if (!active[reg]) { active[reg] = node; node->value_reg = reg; @@ -127,14 +84,8 @@ static void regalloc_block(gpir_block *block) } } - /* need spill */ - if (i == GPIR_VALUE_REG_NUM) { - int spilled_reg = regalloc_spill_active_node(active); - active[spilled_reg] = node; - node->value_reg = spilled_reg; - gpir_debug("value regalloc node %d reuse reg %d\n", - node->index, spilled_reg); - } + /* TODO: spill */ + assert(i != GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM); } } @@ -144,7 +95,7 @@ static void regalloc_print_result(gpir_compiler *comp) return; int index = 0; - printf("======== value regalloc ========\n"); + printf("======== regalloc ========\n"); list_for_each_entry(gpir_block, block, &comp->block_list, list) { list_for_each_entry(gpir_node, node, &block->node_list, list) { printf("%03d: %d/%d %s ", index++, node->index, node->value_reg, @@ -159,7 +110,7 @@ static void regalloc_print_result(gpir_compiler *comp) } } -bool gpir_value_regalloc_prog(gpir_compiler *comp) +bool gpir_regalloc_prog(gpir_compiler *comp) { list_for_each_entry(gpir_block, block, &comp->block_list, list) { regalloc_block(block); diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c index 3d7e640ada2..60076daa3c4 100644 --- a/src/gallium/drivers/lima/ir/gp/scheduler.c +++ b/src/gallium/drivers/lima/ir/gp/scheduler.c @@ -27,58 +27,196 @@ #include "gpir.h" /* - * GP schedule algorithm (by Connor Abbott <[email protected]>) + * GP scheduling algorithm (by Connor Abbott <[email protected]>) + * + * The GP pipeline has three main stages: * - * Pre schedule phase: - * 1. order all nodes in a sequence - * 2. convert the real reg read/write to GP load/store node, now all - * variable is SSA - * 3. do reg alloc for all SSA with 11 reg (value reg) and spill with - * load/store to real reg if needed - * 4. add fake dependency like this: - * after step 3, node sequence is - * 01: r1=r2+r3 - * 02: r4=r1+r2 - * 03: r1=r5+r6 - * we should add a fake dependency of node 3 to node 2 like a - * write-after-read dep. But this is not really write-after-read - * dep because there's no r1 really, because it's a value register. - * We need this fake dep in the schedule phase to make sure in any - * schedule point, there're only <=11 input needed by the past - * scheduled nodes. - * 5. 
build DAG according to all the real and fake dep + * -------------------------------------------------------- + * | | + * | Register/Attr/Temp Fetch | + * | | + * -------------------------------------------------------- + * | | | | | | | + * | Mul0 | Mul1 | Add0 | Add1 | Cplx | Pass | + * | | | | | | | + * -------------------------------------------------------- + * | | | | + * | Complex1 | Temp/Register/Varying | Pass | + * | Stage 2 | Store | Stage 2 | + * | | | | + * -------------------------------------------------------- * - * Schedule phase: - * 1. Compute the nodes ready to schedule, if no nodes, exit - * 2. Create a new GP instruction, and call it as current instr - * 3. For any nodes with a use 2 cycles ago with a definition ready to - * schedule, schedule that definition immediately if possible, or else - * schedule a move. - * 4. For any nodes with a use 2 cycles ago but the definition not - * scheduled and not ready to schedule, schedule a move immediately - * to prevent the value from falling off the queue. - * 5. Calculate the number of remaining nodes with a use 1 cycle ago but - * the definition not yet scheduled, and if there are more than 5, - * schedule moves or definitions for the rest now. - * 6. Schedule the rest of the available nodes using your favorite heuristic - * to current instr. - * 7. go to step 1 + * Because of this setup, storing a register has a latency of three cycles. + * Also, the register file is organized into 4-component vectors, and the + * load stage can only load two vectors at a time. Aside from these highly + * constrained register load/store units, there is an explicit bypass + * network, where each unit (mul0/mul1/etc.) can access the results of + * any unit from the previous two cycles directly, except for the complex + * unit whose result can only be accessed for one cycle (since it's expected + * to be used directly by the complex2 instruction in the following cycle). * - * Step 5 for the current instruction guarantees that steps 3 and 4 for - * the next instruction will always succeed, so it's only step 5 that can - * possibly fail. Now, note that the nodes whose definitions have not yet - * been scheduled but one or more use has been scheduled, are exactly the - * nodes that are live in the final schedule. Therefore there will never - * be more than 11 of them (guarenteed by the 11 value reg alloc and the - * fake dep added before schedule). The worst case for step 5 is that all of - * these nodes had a use 1 cycle ago, which means that none of them hit - * case 3 or 4 already, so there are 6 slots still available so step 5 - * will always succeed. In general, even if there are exactly 11 values - * live, if n are scheduled in steps 3 and 4, there are 11-n left in step - * 4 so at most 11-n-5 = 6-n are scheduled in step 5 and therefore 6 are - * scheduled total, below the limit. So the algorithm will always succeed. + * Because of the very restricted register file, and because only rarely are + * all the units in use at the same time, it can be very beneficial to use + * the unused units to "thread" a value from source to destination by using + * moves in the otherwise-unused units, without involving the register file + * at all. It's very difficult to fully exploit this with a traditional + * scheduler, so we need to do something a little un-traditional. The 512 + * instruction limit means that for more complex shaders, we need to do as + * well as possible or else the app won't even work. 
+ * + * The scheduler works by considering the bypass network as a kind of + * register file. It's a quite unusual register file, since registers have to + * be assigned "on the fly" as we schedule operations, but with some care, we + * can use something conceptually similar to a linear-scan allocator to + * successfully schedule nodes to instructions without running into + * conflicts. + * + * Values in the IR are separated into normal values, or "value registers", + * which is what normal nodes like add, mul, etc. produce, and which only + * live inside one basic block, and registers, which can span multiple basic + * blocks but have to be accessed via special load_reg/store_reg nodes. RA + * assigns physical registers to both value registers and normal registers, + * treating load_reg/store_reg as a move instruction, but these are only used + * directly for normal registers -- the physreg assigned to a value register + * is "fake," and is only used inside the scheduler. Before scheduling we + * insert write-after-read dependencies, even for value registers, as if + * we're going to use those, but then we throw them away. For example, if we + * had something like: + * + * (*)r2 = add (*)r1, (*)r2 + * (*)r1 = load_reg r0 + * + * we'd insert a write-after-read dependency between the add and load_reg, + * even though the starred registers aren't actually used by the scheduler + * after this step. This step is crucial since it guarantees that during any + * point in the schedule, the number of live registers + live value registers + * will never exceed the capacity of the register file and the bypass network + * combined. This is because each live register/value register will have a + * different fake number, thanks to the fake dependencies inserted before + * scheduling. This allows us to not have to worry about spilling to + * temporaries, which is only done ahead of time. + * + * The scheduler is a bottom-up scheduler. It keeps track of each live value + * register, and decides on-the-fly which value registers to keep in the + * bypass network and which to "spill" to registers. Of particular importance + * is the "ready list," which consists of "input nodes" (nodes that produce a + * value that can be consumed via the bypass network), both "partially ready" + * (only some of the uses have been scheduled) and "fully ready" (all uses + * have been scheduled), as well as other non-input nodes like register + * stores. Each input node on the ready list represents a live value register + * before the current instruction. There must be at most 11 such input nodes + * at all times, since there are only 11 slots in the next two instructions + * which can reach the current instruction. + * + * An input node is a "max node" if it has a use two cycles ago, which must be + * connected to a definition this cycle. Otherwise it may be a "next max node" + * if it will be a max node on the next instruction (i.e. it has a use at most + * one cycle ago), or it may be neither if all of its uses are this cycle. As + * we keep adding instructions to the front, input nodes graduate from + * neither, to next max, to max, unless we decide to insert a move to keep it + * alive longer, at which point any uses after the current instruction are + * rewritten to be uses of the move so that the original node returns to + * neither. 
The scheduler decides which nodes to try freely, but we have to + * reserve slots for two different reasons: (1) out of the 5 non-complex + * slots, we reserve a slot for each max node, so that we can connect a + * definition to the use 2 cycles ago. (2) Out of all 6 slots, we reserve a + * slot for every next-max node above 5, so that for the next instruction + * there are no more than 5 max nodes. When a max or next-max node gets + * scheduled, the corresponding reservation is reduced by one. At the end, we + * insert moves for every slot that was reserved. The reservation is actually + * managed by gpir_instr, and all we have to do is tell it how many to reserve + * at the beginning and then tell it which nodes are max/next-max nodes. When + * we start scheduling an instruction, there will be at most 5 max nodes + * thanks to the previous instruction's next-max reservation/move insertion. + * Since there are at most 11 total input nodes, if there are N max nodes, + * there are at most 11 - N next-max nodes, and therefore at most 11 - N - 5 = + * 6 - N slots need to be reserved for next-max nodes, and so at most + * 6 - N + N = 6 slots need to be reserved in total, exactly the total number + * of slots. So, thanks to the total input node restriction, we will never + * need to reserve too many slots. + * + * It sometimes happens that scheduling a given node will violate this total + * input node restriction, or that a reservation will mean that we can't + * schedule it. We first schedule a node "speculatively" to see if this is a + * problem. If some of the node's sources are loads, then we can schedule + * the node and its dependent loads in one swoop to avoid going over the + * pressure limit. If that fails, we can try to spill a ready or + * partially-ready input node to a register by rewriting all of its uses to + * refer to a register load. This removes it from the list of ready and + * partially ready input nodes as all of its uses are now unscheduled. If + * successful, we can then proceed with scheduling the original node. All of + * this happens "speculatively," meaning that afterwards the node is removed + * and the entire state of the scheduler is reverted to before it was tried, to + * ensure that we never get into an invalid state and run out of spots for + * moves. In try_node(), we try to schedule each node speculatively on the + * ready list, keeping only the nodes that could be successfully scheduled, so + * that when we finally decide which node to actually schedule, we know it + * will succeed. This is how we decide on the fly which values go in + * registers and which go in the bypass network. Note that "unspilling" a + * value is simply a matter of scheduling the store_reg instruction created + * when we spill. + * + * The careful accounting of live value registers, reservations for moves, and + * speculative scheduling guarantee that we never run into a failure case + * while scheduling. However, we need to make sure that this scheduler will + * not get stuck in an infinite loop, i.e. that we'll always make forward + * progress by eventually scheduling a non-move node. If we run out of value + * registers, then we may have to spill a node to a register. If we + * were to schedule one of the fully-ready nodes, then we'd have 11 + N live + * value registers before the current instruction. 
But since there are at most + * 64+11 live registers and register values total thanks to the fake + * dependencies we inserted before scheduling, there are at most 64 - N live + * physical registers, and therefore there are at least N registers available + * for spilling. Not all these registers will be available immediately, since + * in order to spill a node to a given register we have to ensure that there + * are slots available to rewrite every use to a load instruction, and that + * may not be the case. There may also be intervening writes which prevent + * some registers from being used. However, these are all temporary problems, + * since as we create each instruction, we create additional register load + * slots that can be freely used for spilling, and we create more move nodes + * which means that the uses of the nodes we're trying to spill keep moving + * forward. This means that eventually, these problems will go away, at which + * point we'll be able to spill a node successfully, so eventually we'll be + * able to schedule the first node on the ready list. */ +typedef struct { + /* This is the list of ready and partially-ready nodes. A partially-ready + * node must have at least one input dependency already scheduled. + */ + struct list_head ready_list; + + /* The number of ready or partially-ready nodes with at least one input + * dependency already scheduled. In other words, the number of live value + * registers. This must be at most 11. + */ + int ready_list_slots; + + /* The physical registers live into the current instruction. */ + uint64_t live_physregs; + + /* The current instruction. */ + gpir_instr *instr; + + /* The current basic block. */ + gpir_block *block; + + /* True if at least one node failed to schedule due to lack of available + * value registers. + */ + bool try_spill_all; + + /* The number of max nodes needed to spill to successfully schedule the + * instruction. + */ + int max_node_spill_needed; + + /* The number of max and next-max nodes needed to spill to successfully + * schedule the instruction. + */ + int total_spill_needed; +} sched_ctx; + static int gpir_min_dist_alu(gpir_dep *dep) { switch (dep->pred->op) { @@ -104,8 +242,13 @@ static int gpir_get_min_dist(gpir_dep *dep) case gpir_op_store_temp: case gpir_op_store_reg: case gpir_op_store_varying: - /* store must use alu node as input */ - if (dep->pred->type == gpir_node_type_load) + /* Stores must use an alu node as input. Also, complex1 takes two + * cycles, which means that its result cannot be stored to a register + * as part of the normal path, and therefore it must also have a move + * inserted. 
+ */ + if (dep->pred->type == gpir_node_type_load || + dep->pred->op == gpir_op_complex1) return INT_MAX >> 2; else return 0; @@ -119,44 +262,24 @@ static int gpir_get_min_dist(gpir_dep *dep) return gpir_min_dist_alu(dep); case GPIR_DEP_READ_AFTER_WRITE: - switch (dep->succ->op) { - case gpir_op_load_temp: - assert(dep->pred->op == gpir_op_store_temp); + if (dep->succ->op == gpir_op_load_temp && + dep->pred->op == gpir_op_store_temp) { return 4; - case gpir_op_load_reg: - assert(dep->pred->op == gpir_op_store_reg); + } else if (dep->succ->op == gpir_op_load_reg && + dep->pred->op == gpir_op_store_reg) { return 3; - case gpir_op_load_uniform: - assert(dep->pred->op == gpir_op_store_temp_load_off0 || - dep->pred->op == gpir_op_store_temp_load_off1 || - dep->pred->op == gpir_op_store_temp_load_off2); + } else if ((dep->pred->op == gpir_op_store_temp_load_off0 || + dep->pred->op == gpir_op_store_temp_load_off1 || + dep->pred->op == gpir_op_store_temp_load_off2) && + dep->succ->op == gpir_op_load_uniform) { return 4; - default: - assert(0); + } else { + /* Fake dependency */ + return 0; } case GPIR_DEP_WRITE_AFTER_READ: - switch (dep->pred->op) { - case gpir_op_load_temp: - assert(dep->succ->op == gpir_op_store_temp); - return -3; - case gpir_op_load_reg: - assert(dep->succ->op == gpir_op_store_reg); - return -2; - case gpir_op_load_uniform: - assert(dep->succ->op == gpir_op_store_temp_load_off0 || - dep->succ->op == gpir_op_store_temp_load_off1 || - dep->succ->op == gpir_op_store_temp_load_off2); - return -3; - default: - assert(0); - } - - case GPIR_DEP_VREG_WRITE_AFTER_READ: return 0; - - case GPIR_DEP_VREG_READ_AFTER_WRITE: - assert(0); /* not possible, this is GPIR_DEP_INPUT */ } return 0; @@ -230,13 +353,64 @@ static void schedule_update_distance(gpir_node *node) if (pred->sched.dist < 0) schedule_update_distance(pred); - int dist = pred->sched.dist + 1; + int dist = pred->sched.dist + gpir_min_dist_alu(dep); if (node->sched.dist < dist) node->sched.dist = dist; } } -static void schedule_insert_ready_list(struct list_head *ready_list, +static bool gpir_is_input_node(gpir_node *node) +{ + gpir_node_foreach_succ(node, dep) { + if (dep->type == GPIR_DEP_INPUT) + return true; + } + return false; +} + + +/* Get the number of slots required for a node on the ready list. + */ +static int gpir_get_slots_required(gpir_node *node) +{ + if (!gpir_is_input_node(node)) + return 0; + + /* Note that we assume every node only consumes one slot, even dual-slot + * instructions. While dual-slot instructions may consume more than one + * slot, we can always safely insert a move if it turns out that there + * isn't enough space for them. There's the risk that we get stuck in an + * infinite loop if all the fully ready nodes are dual-slot nodes, but we + * rely on spilling to registers to save us here. + */ + return 1; +} + +static void verify_ready_list(sched_ctx *ctx) +{ + list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { + if (!gpir_is_input_node(node)) { + assert(node->sched.ready); + } + + if (node->sched.ready) { + /* Every successor must have been scheduled */ + gpir_node_foreach_succ(node, dep) { + assert(dep->succ->sched.instr); + } + } else { + /* There must be at least one successor that's not scheduled. 
*/ + bool unscheduled = false; + gpir_node_foreach_succ(node, dep) { + unscheduled |= !(dep->succ->sched.instr); + } + + assert(unscheduled); + } + } +} + +static void schedule_insert_ready_list(sched_ctx *ctx, gpir_node *insert_node) { /* if this node is fully ready or partially ready @@ -250,7 +424,7 @@ static void schedule_insert_ready_list(struct list_head *ready_list, bool ready = true, insert = false; gpir_node_foreach_succ(insert_node, dep) { gpir_node *succ = dep->succ; - if (succ->sched.instr >= 0) { + if (succ->sched.instr) { if (dep->type == GPIR_DEP_INPUT) insert = true; } @@ -265,8 +439,8 @@ if (!insert || insert_node->sched.inserted) return; - struct list_head *insert_pos = ready_list; - list_for_each_entry(gpir_node, node, ready_list, list) { + struct list_head *insert_pos = &ctx->ready_list; + list_for_each_entry(gpir_node, node, &ctx->ready_list, list) { if (insert_node->sched.dist > node->sched.dist) { insert_pos = &node->list; break; @@ -275,6 +449,7 @@ list_addtail(&insert_node->list, insert_pos); insert_node->sched.inserted = true; + ctx->ready_list_slots += gpir_get_slots_required(insert_node); } static int gpir_get_max_start(gpir_node *node) @@ -284,10 +459,10 @@ /* find the max start instr constrainted by all successors */ gpir_node_foreach_succ(node, dep) { gpir_node *succ = dep->succ; - if (succ->sched.instr < 0) + if (!succ->sched.instr) continue; - int start = succ->sched.instr + gpir_get_min_dist(dep); + int start = succ->sched.instr->index + gpir_get_min_dist(dep); if (start > max_start) max_start = start; } @@ -302,10 +477,10 @@ static int gpir_get_min_end(gpir_node *node) /* find the min end instr constrainted by all successors */ gpir_node_foreach_succ(node, dep) { gpir_node *succ = dep->succ; - if (succ->sched.instr < 0) + if (!succ->sched.instr) continue; - int end = succ->sched.instr + gpir_get_max_dist(dep); + int end = succ->sched.instr->index + gpir_get_max_dist(dep); if (end < min_end) min_end = end; } @@ -330,11 +505,24 @@ static gpir_node *gpir_sched_instr_has_load(gpir_instr *instr, gpir_node *node) return NULL; } -static bool schedule_try_place_node(gpir_instr *instr, gpir_node *node) +/* Simply place the node into the given instruction without trying to deal + * with liveness or the ready list. This will only fail if the node + * cannot be placed due to a lack of available slots. In addition to normal + * node placement, this is also used for placing loads when spilling to + * registers. + */ +static bool _try_place_node(sched_ctx *ctx, gpir_instr *instr, gpir_node *node) { if (node->type == gpir_node_type_load) { gpir_node *load = gpir_sched_instr_has_load(instr, node); if (load) { + /* This node may have a store as a successor, in which case we have to + * fail it exactly like below in order to later create a move node in + * between. 
+ */ + if (instr->index < gpir_get_max_start(node)) + return false; + gpir_debug("same load %d in instr %d for node %d\n", load->index, instr->index, node->index); @@ -345,23 +533,100 @@ static bool schedule_try_place_node(gpir_instr *instr, gpir_node *node) } } - node->sched.instr = instr->index; + node->sched.instr = instr; + int max_node_spill_needed = INT_MAX; + int total_spill_needed = INT_MAX; int *slots = gpir_op_infos[node->op].slots; for (int i = 0; slots[i] != GPIR_INSTR_SLOT_END; i++) { node->sched.pos = slots[i]; - if (node->sched.instr >= gpir_get_max_start(node) && - node->sched.instr <= gpir_get_min_end(node) && + if (instr->index >= gpir_get_max_start(node) && + instr->index <= gpir_get_min_end(node) && gpir_instr_try_insert_node(instr, node)) return true; + if (ctx->instr->non_cplx_slot_difference || + ctx->instr->slot_difference) { + /* If one of these fields is non-zero, then we could insert the node + * here after spilling. To get an accurate count of how many nodes we + * need to spill, we need to choose one of the positions where there + * were nonzero slot differences, preferably one with the smallest + * difference (so we don't have to spill as much). + */ + if (ctx->instr->non_cplx_slot_difference < max_node_spill_needed || + ctx->instr->slot_difference < total_spill_needed) { + max_node_spill_needed = ctx->instr->non_cplx_slot_difference; + total_spill_needed = ctx->instr->slot_difference; + } + } } - node->sched.instr = -1; + if (max_node_spill_needed != INT_MAX) { + /* Indicate how many spill nodes are needed. */ + ctx->max_node_spill_needed = MAX2(ctx->max_node_spill_needed, + max_node_spill_needed); + ctx->total_spill_needed = MAX2(ctx->total_spill_needed, + total_spill_needed); + } + node->sched.instr = NULL; node->sched.pos = -1; return false; } -static gpir_node *schedule_create_move_node(gpir_node *node) +/* Try to place just the node given, updating the ready list. If "speculative" + * is true, then this is part of the pre-commit phase. If false, then we have + * committed to placing this node, so update liveness and ready list + * information. + */ + +static bool schedule_try_place_node(sched_ctx *ctx, gpir_node *node, + bool speculative) +{ + if (!_try_place_node(ctx, ctx->instr, node)) { + if (!speculative) + gpir_debug("failed to place %d\n", node->index); + return false; + } + + ctx->ready_list_slots -= gpir_get_slots_required(node); + + if (!speculative) { + gpir_debug("placed node %d\n", node->index); + + /* We assume here that writes are placed before reads. If this changes, + * then this needs to be updated. 
+ */ + if (node->op == gpir_op_store_reg) { + gpir_store_node *store = gpir_node_to_store(node); + ctx->live_physregs &= + ~(1ull << (4 * store->index + store->component)); + if (store->child->sched.physreg_store == store) + store->child->sched.physreg_store = NULL; + } + + if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + ctx->live_physregs |= + (1ull << (4 * load->index + load->component)); + } + + list_del(&node->list); + list_add(&node->list, &ctx->block->node_list); + gpir_node_foreach_pred(node, dep) { + gpir_node *pred = dep->pred; + schedule_insert_ready_list(ctx, pred); + } + } else { + gpir_node_foreach_pred(node, dep) { + gpir_node *pred = dep->pred; + if (!pred->sched.inserted && dep->type == GPIR_DEP_INPUT) + ctx->ready_list_slots += gpir_get_slots_required(pred); + } + } + + return true; +} + +static gpir_node *create_move(sched_ctx *ctx, gpir_node *node) { gpir_alu_node *move = gpir_node_create(node->block, gpir_op_mov); if (unlikely(!move)) @@ -370,187 +635,712 @@ static gpir_node *schedule_create_move_node(gpir_node *node) move->children[0] = node; move->num_child = 1; - move->node.sched.instr = -1; + move->node.sched.instr = NULL; move->node.sched.pos = -1; move->node.sched.dist = node->sched.dist; + move->node.sched.max_node = node->sched.max_node; + move->node.sched.next_max_node = node->sched.next_max_node; gpir_debug("create move %d for %d\n", move->node.index, node->index); + + ctx->ready_list_slots--; + list_del(&node->list); + node->sched.max_node = false; + node->sched.next_max_node = false; + node->sched.ready = false; + node->sched.inserted = false; + gpir_node_replace_succ(&move->node, node); + gpir_node_add_dep(&move->node, node, GPIR_DEP_INPUT); + schedule_insert_ready_list(ctx, &move->node); return &move->node; } -static gpir_node *gpir_sched_node(gpir_instr *instr, gpir_node *node) + +/* Once we schedule the successor, would the predecessor be fully ready? */ +static bool pred_almost_ready(gpir_dep *dep) { - if (node->op == gpir_op_mov) { - gpir_node *child = gpir_node_to_alu(node)->children[0]; - gpir_node_foreach_succ_safe(node, dep) { - gpir_node *succ = dep->succ; - if (succ->sched.instr < 0 || - instr->index < succ->sched.instr + gpir_get_min_dist(dep)) { - gpir_node_replace_pred(dep, child); - if (dep->type == GPIR_DEP_INPUT) - gpir_node_replace_child(succ, node, child); + bool fully_ready = true; + gpir_node_foreach_succ(dep->pred, other_dep) { + gpir_node *succ = other_dep->succ; + if (!succ->sched.instr && dep->succ != other_dep->succ) { + fully_ready = false; + break; + } + } + + return fully_ready; +} + +/* Recursively try to schedule a node and all its dependent nodes that can fit + * in the same instruction. There is a simple heuristic scoring system to try + * to group together nodes that load different components of the same input, + * to avoid bottlenecking for operations like matrix multiplies that are + * mostly input-bound. 
+ */ + +static int _schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative) +{ + if (!schedule_try_place_node(ctx, node, speculative)) + return INT_MIN; + + int score = 0; + + gpir_node_foreach_pred(node, dep) { + if (!gpir_is_input_node(dep->pred)) + continue; + + int pred_score = INT_MIN; + if (pred_almost_ready(dep)) { + if (dep->pred->type == gpir_node_type_load || + node->type == gpir_node_type_store) { + pred_score = _schedule_try_node(ctx, dep->pred, speculative); + } + } + if (dep->pred->type == gpir_node_type_load || + node->type == gpir_node_type_store) { + if (pred_score == INT_MIN) { + if (node->op == gpir_op_mov) { + /* The only moves on the ready list are for loads that we + * couldn't schedule immediately, as created below. If we + * couldn't schedule the load, there's no point scheduling + * the move. The normal move threading logic will ensure + * that another move is created if we're about to go too far + * from the uses of this move. + */ + assert(speculative); + return INT_MIN; + } else if (!speculative && dep->pred->type == gpir_node_type_load) { + /* We couldn't schedule the load right away, so it will have + * to happen in some earlier instruction and then be moved + * into a value register and threaded to the use by "node". + * We create the move right away, so that later we'll fail + * to schedule it if there isn't a slot for a move + * available. + */ + create_move(ctx, dep->pred); + } + /* Penalize nodes whose dependent ops we couldn't schedule. + */ + score--; + } else { + score += pred_score; + continue; } } - MAYBE_UNUSED bool result = schedule_try_place_node(instr, node); - assert(result); - return node; } - else { - gpir_node *move = schedule_create_move_node(node); + + return score; +} + +/* If we speculatively tried a node, undo everything. + */ + +static void schedule_undo_node(sched_ctx *ctx, gpir_node *node) +{ + gpir_instr_remove_node(ctx->instr, node); + + gpir_node_foreach_pred(node, dep) { + gpir_node *pred = dep->pred; + if (pred->sched.instr) { + schedule_undo_node(ctx, pred); + } + } +} + +/* Try to schedule a node. We also try to schedule any predecessors that can + * be part of the same instruction. If "speculative" is true, then we don't + * actually change any state, only returning the score were the node to be + * scheduled, with INT_MIN meaning "cannot be scheduled at all". + */ +static int schedule_try_node(sched_ctx *ctx, gpir_node *node, bool speculative) +{ + int prev_slots = ctx->ready_list_slots; + + int score = _schedule_try_node(ctx, node, speculative); + + if (ctx->ready_list_slots > GPIR_VALUE_REG_NUM) { + assert(speculative); + ctx->total_spill_needed = MAX2(ctx->total_spill_needed, + ctx->ready_list_slots - GPIR_VALUE_REG_NUM); + score = INT_MIN; + } + + if (speculative) { + ctx->ready_list_slots = prev_slots; + if (node->sched.instr) + schedule_undo_node(ctx, node); + } + + return score; +} + +/* This is called when we want to spill "node" by inserting loads at its uses. + * It returns all the possible registers we can use so that all the loads will + * successfully be inserted. Also return the first instruction we'll need to + * insert a load for. + */ + +static uint64_t get_available_regs(sched_ctx *ctx, gpir_node *node, + int *min_index) +{ + uint64_t available = ~0ull; + gpir_node_foreach_succ(node, dep) { + if (dep->type != GPIR_DEP_INPUT) + continue; + + gpir_node *use = dep->succ; + gpir_instr *instr = use->sched.instr; + + if (!instr) { + /* This use isn't scheduled, so no need to spill it. 
*/ + continue; + } + + if (use->type == gpir_node_type_store) { + /* We're trying to spill something that was recently stored... just + * bail out. + */ + return 0; + } + + if (use->op == gpir_op_mov && instr == ctx->instr) { + /* We try to spill the sources of this move, so we can free up space + * in the current instruction. + * + * TODO: should we go back further? It might let us schedule the + * write earlier in some cases, but then we might fail to spill. + */ + available &= get_available_regs(ctx, use, min_index); + } else { + if (instr->index < *min_index) + *min_index = instr->index; + + uint64_t use_available = 0; + + if (instr->reg0_use_count == 0) + use_available = ~0ull; + else if (!instr->reg0_is_attr) + use_available = 0xfull << (4 * instr->reg0_index); + + if (instr->reg1_use_count == 0) + use_available = ~0ull; + else + use_available |= 0xfull << (4 * instr->reg1_index); + + available &= use_available; + } + } + + return available; +} + +/* Using "min_index" returned by get_available_regs(), figure out which + * registers are killed by a write after or during the current instruction and + * hence we can't use for spilling. Writes that haven't been scheduled yet + * should be reflected in live_physregs. + */ + +static uint64_t get_killed_regs(sched_ctx *ctx, int min_index) +{ + uint64_t killed = 0; + + list_for_each_entry(gpir_instr, instr, &ctx->block->instr_list, list) { + if (instr->index <= min_index) + break; + + for (int slot = GPIR_INSTR_SLOT_STORE0; slot <= GPIR_INSTR_SLOT_STORE3; + slot++) { + if (!instr->slots[slot]) + continue; + + gpir_store_node *store = gpir_node_to_store(instr->slots[slot]); + if (store->node.op != gpir_op_store_reg) + continue; + + killed |= 1ull << (4 * store->index + store->component); + } + } + + return killed; +} + +/* Actually spill a node so that it is no longer in the ready list. Note that + * this must exactly follow the logic of get_available_regs() or else the + * loads could fail to schedule. + */ + +static void spill_node(sched_ctx *ctx, gpir_node *node, gpir_store_node *store) +{ + gpir_node_foreach_succ_safe(node, dep) { + if (dep->type != GPIR_DEP_INPUT) + continue; + + gpir_node *use = dep->succ; + gpir_instr *instr = use->sched.instr; + + if (!instr) + continue; + + if (use->op == gpir_op_mov && instr == ctx->instr) { + spill_node(ctx, use, store); + } else { + gpir_load_node *load = gpir_node_create(ctx->block, gpir_op_load_reg); + load->index = store->index; + load->component = store->component; + list_add(&load->node.list, &ctx->block->node_list); + gpir_node_replace_child(dep->succ, dep->pred, &load->node); + gpir_node_replace_pred(dep, &load->node); + gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE); + gpir_debug("spilling use %d of node %d to load node %d\n", + use->index, node->index, load->node.index); + MAYBE_UNUSED bool result = _try_place_node(ctx, use->sched.instr, &load->node); + assert(result); + } + } + + if (node->op == gpir_op_mov) { + /* We replaced all the uses of the move, so it's dead now. */ + gpir_instr_remove_node(node->sched.instr, node); + gpir_node_delete(node); + } else { + /* We deleted all the uses of the node except the store, so it's not + * live anymore. 
+ */ list_del(&node->list); - node->sched.ready = false; node->sched.inserted = false; - gpir_node_replace_succ(move, node); - gpir_node_add_dep(move, node, GPIR_DEP_INPUT); - return move; + ctx->ready_list_slots--; + if (node->sched.max_node) { + node->sched.max_node = false; + ctx->instr->alu_num_slot_needed_by_max--; + } + if (node->sched.next_max_node) { + node->sched.next_max_node = false; + ctx->instr->alu_num_slot_needed_by_next_max--; + } } } -static bool gpir_is_input_node(gpir_node *node) +static bool used_by_store(gpir_node *node, gpir_instr *instr) { gpir_node_foreach_succ(node, dep) { - if (dep->type == GPIR_DEP_INPUT) + if (dep->type != GPIR_DEP_INPUT) + continue; + + if (dep->succ->type == gpir_node_type_store && + dep->succ->sched.instr == instr) return true; } + return false; } -static int gpir_get_min_scheduled_succ(gpir_node *node) + + +static bool try_spill_node(sched_ctx *ctx, gpir_node *node) +{ + assert(node->op != gpir_op_mov); + + if (used_by_store(node, ctx->instr)) + return false; + + gpir_debug("trying to spill %d\n", node->index); + + int min_instr = INT_MAX; + uint64_t available = get_available_regs(ctx, node, &min_instr); + available &= ~get_killed_regs(ctx, min_instr); + + if (node->sched.physreg_store) { + gpir_store_node *store = node->sched.physreg_store; + if (!(available & (1ull << (4 * store->index + store->component)))) + return false; + } else { + available &= ~ctx->live_physregs; + + if (available == 0) + return false; + + /* TODO: use a better heuristic for choosing an available register? */ + int physreg = ffsll(available) - 1; + + ctx->live_physregs |= (1ull << physreg); + + /* TODO: when we support multiple basic blocks, there may be register + * loads/stores to this register other than this one that haven't been + * scheduled yet so we may need to insert write-after-read dependencies. + */ + gpir_store_node *store = gpir_node_create(ctx->block, gpir_op_store_reg); + store->index = physreg / 4; + store->component = physreg % 4; + store->child = node; + store->node.sched.max_node = false; + store->node.sched.next_max_node = false; + store->node.sched.pos = -1; + store->node.sched.instr = NULL; + store->node.sched.inserted = false; + store->node.sched.dist = node->sched.dist; + if (node->op == gpir_op_complex1) { + /* Complex1 cannot be directly stored, and has a latency of 2 */ + store->node.sched.dist += 2; + } + node->sched.physreg_store = store; + gpir_node_add_dep(&store->node, node, GPIR_DEP_INPUT); + node->sched.ready = false; + schedule_insert_ready_list(ctx, &store->node); + } + + gpir_debug("spilling %d to $%d.%c, store %d\n", node->index, + node->sched.physreg_store->index, + "xyzw"[node->sched.physreg_store->component], + node->sched.physreg_store->node.index); + + spill_node(ctx, node, node->sched.physreg_store); + + return true; +} + +static bool try_spill_nodes(sched_ctx *ctx, gpir_node *orig_node) +{ + /* First, try to spill max nodes. */ + list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) { + if (ctx->max_node_spill_needed <= 0) + break; + + /* orig_node is the node we're trying to schedule, so spilling it makes + * no sense. Also don't try to spill any nodes in front of it, since + * they might be scheduled instead. + */ + if (node == orig_node) + break; + + if (node->op == gpir_op_mov) { + /* Don't try to spill loads, since that only adds another load and + * store which is likely pointless. 
+          */
+         continue;
+      }
+
+      if (!gpir_is_input_node(node) || !node->sched.max_node)
+         continue;
+
+      if (try_spill_node(ctx, node)) {
+         ctx->max_node_spill_needed--;
+         ctx->total_spill_needed--;
+      }
+   }
+
+   /* Now, try to spill the remaining nodes. */
+   list_for_each_entry_safe_rev(gpir_node, node, &ctx->ready_list, list) {
+      if (ctx->total_spill_needed <= 0)
+         break;
+
+      if (node == orig_node)
+         break;
+
+      if (node->op == gpir_op_mov)
+         continue;
+
+      if (!gpir_is_input_node(node) ||
+          !(node->sched.max_node || node->sched.next_max_node))
+         continue;
+
+      if (try_spill_node(ctx, node))
+         ctx->total_spill_needed--;
+   }
+
+   return ctx->total_spill_needed <= 0 && ctx->max_node_spill_needed <= 0;
+}
+
+static int gpir_get_curr_ready_list_slots(sched_ctx *ctx)
+{
+   int total = 0;
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      total += gpir_get_slots_required(node);
+   }
+
+   return total;
+}
+
+/* What gpir_get_min_end() would return if node were replaced with a move
+ * instruction not in the complex slot. Normally this is 2 + min_end, except
+ * for some store instructions which must have the move node in the same
+ * instruction.
+ */
+static int gpir_get_min_end_as_move(gpir_node *node)
 {
    int min = INT_MAX;
    gpir_node_foreach_succ(node, dep) {
       gpir_node *succ = dep->succ;
-      if (succ->sched.instr >= 0 && dep->type == GPIR_DEP_INPUT) {
-         if (min > succ->sched.instr)
-            min = succ->sched.instr;
+      if (succ->sched.instr && dep->type == GPIR_DEP_INPUT) {
+         switch (succ->op) {
+         case gpir_op_store_temp:
+         case gpir_op_store_reg:
+         case gpir_op_store_varying:
+            continue;
+         default:
+            break;
+         }
+         if (min > succ->sched.instr->index + 2)
+            min = succ->sched.instr->index + 2;
       }
    }
    return min;
 }
 
-static gpir_node *gpir_sched_instr_pass(gpir_instr *instr,
-                                        struct list_head *ready_list)
+/* Initialize node->sched.max_node and node->sched.next_max_node for every
+ * input node on the ready list. We should only need to do this once per
+ * instruction, at the beginning, since we never add max nodes to the ready
+ * list.
+ */
+
+static void sched_find_max_nodes(sched_ctx *ctx)
 {
-   /* fully ready node reach its max dist with any of its successor */
-   list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-      if (node->sched.ready) {
-         int end = gpir_get_min_end(node);
-         assert(end >= instr->index);
-         if (instr->index < end)
-            continue;
+   ctx->instr->alu_num_slot_needed_by_next_max = -5;
+   ctx->instr->alu_num_slot_needed_by_max = 0;
 
-         gpir_debug("fully ready max node %d\n", node->index);
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      if (!gpir_is_input_node(node))
+         continue;
 
-         if (schedule_try_place_node(instr, node))
-            return node;
+      int min_end_move = gpir_get_min_end_as_move(node);
+      node->sched.max_node = (min_end_move == ctx->instr->index);
+      node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
 
-         return gpir_sched_node(instr, node);
-      }
+      if (node->sched.max_node)
+         ctx->instr->alu_num_slot_needed_by_max++;
+      if (node->sched.next_max_node)
+         ctx->instr->alu_num_slot_needed_by_next_max++;
    }
+}
 
-   /* partially ready node reach its max dist with any of its successor */
-   list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-      if (!node->sched.ready) {
-         int end = gpir_get_min_end(node);
-         assert(end >= instr->index);
-         if (instr->index < end)
-            continue;
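
To make the max/next-max distinction concrete: the scheduler works bottom-up, and an ALU result can only feed slots for two further instructions, which is where the "+ 2" in gpir_get_min_end_as_move() comes from. A worked example with assumed indices:

    #include <assert.h>

    /* Hypothetical numbers: the use of a value was placed in instruction
     * index 5, so min_end_as_move == 7.
     *
     *   - while filling instruction 6: 7 == 6 + 1, a next-max node
     *   - while filling instruction 7: 7 == 7, a max node; the value must
     *     be scheduled, or receive a move, in this very instruction
     */
    static void max_node_example(void)
    {
       int use_instr_index = 5;
       int min_end_as_move = use_instr_index + 2;
       assert(min_end_as_move == 7);
    }
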
+/* Verify the invariants described in gpir.h, as well as making sure the
+ * counts are correct.
+ */
+static void verify_max_nodes(sched_ctx *ctx)
+{
+   int alu_num_slot_needed_by_max = 0;
+   int alu_num_slot_needed_by_next_max = -5;
+   int alu_num_slot_needed_by_store = 0;
 
-         gpir_debug("partially ready max node %d\n", node->index);
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      if (!gpir_is_input_node(node))
+         continue;
 
-         return gpir_sched_node(instr, node);
-      }
+      if (node->sched.max_node)
+         alu_num_slot_needed_by_max++;
+      if (node->sched.next_max_node)
+         alu_num_slot_needed_by_next_max++;
+      if (used_by_store(node, ctx->instr))
+         alu_num_slot_needed_by_store++;
    }
 
-   /* schedule node used by previous instr when count > 5 */
-   int count = 0;
-   gpir_node *two_slot_node = NULL;
-   list_for_each_entry(gpir_node, node, ready_list, list) {
-      if (gpir_is_input_node(node)) {
-         int min = gpir_get_min_scheduled_succ(node);
-         assert(min >= instr->index - 1);
-         if (min == instr->index - 1) {
-            if (gpir_op_infos[node->op].may_consume_two_slots) {
-               two_slot_node = node;
-               count += 2;
+   assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
+   assert(ctx->instr->alu_num_slot_needed_by_next_max == alu_num_slot_needed_by_next_max);
+   assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
+   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store +
+          alu_num_slot_needed_by_max + alu_num_slot_needed_by_next_max);
+   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max);
+}
+
+static bool try_node(sched_ctx *ctx)
+{
+   gpir_node *best_node = NULL;
+   int best_score = INT_MIN;
+
+   /* Spilling will delete arbitrary nodes after the current one in the ready
+    * list, which means that we always need to look up the next node in the
+    * list at the end of each iteration. While list_for_each_entry() works for
+    * this purpose, its sanity checking assumes that you don't want to modify
+    * the list at all. We know better here, so we have to open-code
+    * list_for_each_entry() without the check in order to not assert.
+    */
+   for (gpir_node *node = LIST_ENTRY(gpir_node, ctx->ready_list.next, list);
+        &node->list != &ctx->ready_list;
+        node = LIST_ENTRY(gpir_node, node->list.next, list)) {
+      if (best_score != INT_MIN) {
+         if (node->sched.dist < best_node->sched.dist)
+            break;
+      }
+
+      if (node->sched.ready) {
+         ctx->total_spill_needed = 0;
+         ctx->max_node_spill_needed = 0;
+         int score = schedule_try_node(ctx, node, true);
+         if (score == INT_MIN) {
+            if (ctx->total_spill_needed > 0 &&
+                try_spill_nodes(ctx, node)) {
+               score = schedule_try_node(ctx, node, true);
+               if (score == INT_MIN)
+                  continue;
             }
-            else
-               count++;
+         }
+
+         if (score > best_score) {
+            best_score = score;
+            best_node = node;
+         }
       }
    }
 
-   if (count > 5) {
-      /* When no slot avaible, must schedule a move for two slot node
-       * to reduce the count. This results from the dummy_m/f method.
-       */
-      if (gpir_instr_alu_slot_is_full(instr)) {
-         assert(two_slot_node);
-         gpir_debug("instr is full, schedule move node for two slot node %d\n",
-                    two_slot_node->index);
-         return gpir_sched_node(instr, two_slot_node);
-      }
-
-      /* schedule fully ready node first */
-      list_for_each_entry(gpir_node, node, ready_list, list) {
-         if (gpir_is_input_node(node)) {
-            int min = gpir_get_min_scheduled_succ(node);
-            if (min == instr->index - 1 && node->sched.ready) {
-               gpir_debug(">5 ready node %d\n", node->index);
-
-               if (schedule_try_place_node(instr, node))
-                  return node;
-            }
-         }
-      }
+   if (best_node) {
+      gpir_debug("scheduling %d (score = %d)%s\n", best_node->index,
+                 best_score, best_node->sched.max_node ? " (max)" : "");
+      MAYBE_UNUSED int score = schedule_try_node(ctx, best_node, false);
+      assert(score != INT_MIN);
+      return true;
+   }
+
+   return false;
+}
+
+static void place_move(sched_ctx *ctx, gpir_node *node)
+{
+   gpir_node *move = create_move(ctx, node);
+   gpir_node_foreach_succ_safe(move, dep) {
+      gpir_node *succ = dep->succ;
+      if (!succ->sched.instr ||
+          ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep)) {
+         gpir_node_replace_pred(dep, node);
+         if (dep->type == GPIR_DEP_INPUT)
+            gpir_node_replace_child(succ, move, node);
+      }
+   }
+   MAYBE_UNUSED int score = schedule_try_node(ctx, move, false);
+   assert(score != INT_MIN);
+}
-
-      /* no fully ready node be scheduled, schedule partially ready node */
-      list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-         if (gpir_is_input_node(node)) {
-            int min = gpir_get_min_scheduled_succ(node);
-            if (min == instr->index - 1 && !node->sched.ready) {
-               gpir_debug(">5 partially ready node %d\n", node->index);
-
-               return gpir_sched_node(instr, node);
-            }
-         }
-      }
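
place_move() above rewires each use of the freshly created move: successors the original node can still reach keep reading it directly, and everything else is switched over to the move. The test, pulled out as a hypothetical predicate that is not part of the patch:

    /* True if `succ` (the use behind `dep`) may keep reading the original
     * node: it is either unscheduled, or the instruction currently being
     * filled is still within the dependency's minimum distance of it. */
    static bool succ_can_still_read_node(sched_ctx *ctx, gpir_dep *dep)
    {
       gpir_node *succ = dep->succ;
       return !succ->sched.instr ||
              ctx->instr->index < succ->sched.instr->index + gpir_get_min_dist(dep);
    }
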
+static bool sched_move(sched_ctx *ctx)
+{
+   list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+      if (node->sched.max_node) {
+         place_move(ctx, node);
+         return true;
+      }
+   }
+
+   if (ctx->instr->alu_num_slot_needed_by_store > 0) {
+      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (used_by_store(node, ctx->instr)) {
+            place_move(ctx, node);
+            /* If we have a store of a load, then we need to make sure that we
+             * immediately schedule the dependent load, or create a move
+             * instruction for it, like we would with a normal instruction.
+             * The rest of the code isn't set up to handle load nodes in the
+             * ready list -- see the comments in _schedule_try_node().
+             */
+            if (node->type == gpir_node_type_load) {
+               if (!schedule_try_place_node(ctx, node, false)) {
+                  create_move(ctx, node);
+               }
+            }
+            return true;
+         }
+      }
+   }
-
-      /* finally schedule move for fully ready node */
-      list_for_each_entry_safe(gpir_node, node, ready_list, list) {
-         if (gpir_is_input_node(node)) {
-            int min = gpir_get_min_scheduled_succ(node);
-            if (min == instr->index - 1 && node->sched.ready) {
-               gpir_debug(">5 fully ready move node %d\n", node->index);
-
-               return gpir_sched_node(instr, node);
-            }
-         }
-      }
-   }
 
+   /* complex1 is a bit of a special case, since it has a latency of 2 cycles.
+    * Once it is fully ready, we need to group all its uses in the same
+    * instruction, and then we need to avoid creating any moves in the next
+    * cycle in order to get it scheduled. Failing to do any of these things
+    * could result in a cycle penalty, or even worse, an infinite loop of
+    * inserting moves. If it is a next-max node and ready, then it has a use
+    * in the previous cycle. If it has a use in the current cycle as well,
+    * then we want to insert a move node to make it ready in two cycles -- if
+    * we don't, then there will be at least a one cycle penalty. Otherwise, it
+    * will be ready next cycle, and we shouldn't insert a move node, or else
+    * we'll also have a one cycle penalty.
+    */
+   if (ctx->instr->alu_num_slot_free > 0) {
+      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
+             node->sched.ready) {
+            bool skip = true;
+            gpir_node_foreach_succ(node, dep) {
+               if (dep->type != GPIR_DEP_INPUT)
+                  continue;
+
+               gpir_node *succ = dep->succ;
+
+               if (!succ->sched.instr ||
+                   succ->sched.instr->index != ctx->instr->index - 1) {
+                  skip = false;
+                  break;
+               }
+            }
+
+            if (skip)
+               continue;
+
+            place_move(ctx, node);
+            return true;
+         }
+      }
+   }
-
-   /* schedule remain fully ready nodes */
-   list_for_each_entry(gpir_node, node, ready_list, list) {
-      if (node->sched.ready) {
-         gpir_debug("remain fully ready node %d\n", node->index);
-
-         if (schedule_try_place_node(instr, node))
-            return node;
-      }
-   }
 
+   /* Once we've made all the required moves, we're free to use any extra
+    * slots to schedule more moves for next max nodes. Besides sometimes being
+    * necessary, this can free up extra space in the next instruction. We walk
+    * from back to front so that we pick the nodes least likely to be
+    * scheduled next, where an extra move would be unnecessary. But make sure
+    * not to re-handle the complex1 case dealt with above.
+    */
+   if (ctx->instr->alu_num_slot_free > 0) {
+      list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
+         if (node->sched.next_max_node &&
+             !(node->op == gpir_op_complex1 && node->sched.ready)) {
+            place_move(ctx, node);
+            return true;
+         }
+      }
+   }
+
+   /* We may have skipped complex1 above, but if we run out of space, we still
+    * need to insert the move.
+    */
+   if (ctx->instr->alu_num_slot_needed_by_next_max > 0) {
+      list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (node->sched.next_max_node) {
+            place_move(ctx, node);
+            return true;
+         }
+      }
+   }
-
-   return NULL;
+
+   return false;
 }
 
-static void schedule_print_pre_one_instr(gpir_instr *instr,
-                                         struct list_head *ready_list)
+static bool gpir_sched_instr_pass(sched_ctx *ctx)
+{
+   if (try_node(ctx))
+      return true;
+
+   if (sched_move(ctx))
+      return true;
+
+   return false;
+}
+
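
sched_move() above treats complex1 specially because of its two-cycle latency. Restated as a hypothetical helper (assuming the same semantics as the open-coded `skip` loop in the patch):

    /* True when every scheduled use of this ready complex1 next-max node
     * sits in the immediately previous instruction: the node itself can be
     * scheduled next cycle, so inserting a move now would only add a cycle
     * of penalty (or, at worst, loop forever inserting moves). */
    static bool complex1_move_unneeded(sched_ctx *ctx, gpir_node *node)
    {
       gpir_node_foreach_succ(node, dep) {
          if (dep->type != GPIR_DEP_INPUT)
             continue;

          gpir_node *succ = dep->succ;
          if (!succ->sched.instr ||
              succ->sched.instr->index != ctx->instr->index - 1)
             return false;   /* a use elsewhere: a move is still wanted */
       }
       return true;           /* all uses were last cycle: skip the move */
    }
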
"next" : "none")); + } + printf("\nlive physregs: "); + for (unsigned i = 0; i < 16; i++) { + if (ctx->live_physregs & (0xfull << (4 * i))) { + printf("$%d.", i); + for (unsigned j = 0; j < 4; j++) { + if (ctx->live_physregs & (1ull << (4 * i + j))) + printf("%c", "xyzw"[j]); + } + printf(" "); + } } printf("\n"); } @@ -569,30 +1359,23 @@ static void schedule_print_post_one_instr(gpir_instr *instr) } -static bool schedule_one_instr(gpir_block *block, struct list_head *ready_list) +static bool schedule_one_instr(sched_ctx *ctx) { - gpir_instr *instr = gpir_instr_create(block); + gpir_instr *instr = gpir_instr_create(ctx->block); if (unlikely(!instr)) return false; - schedule_print_pre_one_instr(instr, ready_list); - - while (true) { - gpir_node *node = gpir_sched_instr_pass(instr, ready_list); - if (!node) - break; + ctx->instr = instr; - if (node->sched.instr < 0) - schedule_insert_ready_list(ready_list, node); - else { - list_del(&node->list); - list_add(&node->list, &block->node_list); + sched_find_max_nodes(ctx); + schedule_print_pre_one_instr(ctx); - gpir_node_foreach_pred(node, dep) { - gpir_node *pred = dep->pred; - schedule_insert_ready_list(ready_list, pred); - } - } + while (gpir_sched_instr_pass(ctx)) { + assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx)); +#ifndef NDEBUG + verify_max_nodes(ctx); + verify_ready_list(ctx); +#endif } schedule_print_post_one_instr(instr); @@ -607,45 +1390,33 @@ static bool schedule_block(gpir_block *block) schedule_update_distance(node); } - struct list_head ready_list; - list_inithead(&ready_list); + sched_ctx ctx; + list_inithead(&ctx.ready_list); + ctx.block = block; + ctx.ready_list_slots = 0; + /* TODO initialize with block live out once we have proper liveness + * tracking + */ + ctx.live_physregs = 0; /* construct the ready list from root nodes */ list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { if (gpir_node_is_root(node)) - schedule_insert_ready_list(&ready_list, node); + schedule_insert_ready_list(&ctx, node); } list_inithead(&block->node_list); - while (!list_empty(&ready_list)) { - if (!schedule_one_instr(block, &ready_list)) + while (!list_empty(&ctx.ready_list)) { + if (!schedule_one_instr(&ctx)) return false; } return true; } -static void schedule_build_vreg_dependency(gpir_block *block) +static void schedule_build_dependency(gpir_block *block) { - gpir_node *regs[GPIR_VALUE_REG_NUM] = {0}; - list_for_each_entry(gpir_node, node, &block->node_list, list) { - /* store node has no value reg assigned */ - if (node->value_reg < 0) - continue; - - gpir_node *reg = regs[node->value_reg]; - if (reg) { - gpir_node_foreach_succ(reg, dep) { - /* write after read dep should only apply to real 'read' */ - if (dep->type != GPIR_DEP_INPUT) - continue; - - gpir_node *succ = dep->succ; - gpir_node_add_dep(node, succ, GPIR_DEP_VREG_WRITE_AFTER_READ); - } - } - regs[node->value_reg] = node; - } + gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0}; /* merge dummy_f/m to the node created from */ list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { @@ -666,76 +1437,49 @@ static void schedule_build_vreg_dependency(gpir_block *block) gpir_node_delete(node); } } -} -static void schedule_build_preg_dependency(gpir_compiler *comp) -{ - /* merge reg with the same index */ - gpir_reg *regs[GPIR_VALUE_REG_NUM] = {0}; - list_for_each_entry(gpir_reg, reg, &comp->reg_list, list) { - if (!regs[reg->index]) - regs[reg->index] = reg; - else { - list_splicetail(®->defs_list, 
-         list_splicetail(&reg->uses_list, &regs[reg->index]->uses_list);
-      }
-   }
-
+   /* Forward dependencies. We only need to add these for register loads,
+    * since value registers already have an input dependency.
+    */
+   list_for_each_entry(gpir_node, node, &block->node_list, list) {
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         unsigned index = 4 * load->index + load->component;
+         if (last_written[index]) {
+            gpir_node_add_dep(node, last_written[index],
+                              GPIR_DEP_READ_AFTER_WRITE);
+         }
+      }
+
+      if (node->value_reg >= 0)
+         last_written[node->value_reg] = node;
+   }
+
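Both walks use last_written[] the same way: it maps each value-register or physical-register slot to its most recent writer, so the forward walk above can draw read-after-write edges, while the reverse walk below draws the mirror-image write-after-read edges. A worked example with hypothetical nodes n1..n3 (not from the patch) that all touch $0.x, in program order n1 writes it, n2 reads it, n3 overwrites it:

    gpir_node_add_dep(n2, n1, GPIR_DEP_READ_AFTER_WRITE);  /* forward walk */
    gpir_node_add_dep(n3, n2, GPIR_DEP_WRITE_AFTER_READ);  /* reverse walk */

The first edge keeps the read after the write it depends on; the second keeps the next write from being scheduled past the read.
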
-   /* calculate physical reg read/write dependency for load/store nodes */
-   for (int i = 0; i < GPIR_VALUE_REG_NUM; i++) {
-      gpir_reg *reg = regs[i];
-      if (!reg)
-         continue;
+   memset(last_written, 0, sizeof(last_written));
 
-      /* sort reg write */
-      struct list_head tmp_list;
-      list_replace(&reg->defs_list, &tmp_list);
-      list_inithead(&reg->defs_list);
-      list_for_each_entry_safe(gpir_store_node, store, &tmp_list, reg_link) {
-         struct list_head *insert_pos = &reg->defs_list;
-         list_for_each_entry(gpir_store_node, st, &reg->defs_list, reg_link) {
-            if (st->node.sched.index > store->node.sched.index) {
-               insert_pos = &st->reg_link;
-               break;
-            }
-         }
-         list_del(&store->reg_link);
-         list_addtail(&store->reg_link, insert_pos);
-      }
-
-      /* sort reg read */
-      list_replace(&reg->uses_list, &tmp_list);
-      list_inithead(&reg->uses_list);
-      list_for_each_entry_safe(gpir_load_node, load, &tmp_list, reg_link) {
-         struct list_head *insert_pos = &reg->uses_list;
-         list_for_each_entry(gpir_load_node, ld, &reg->uses_list, reg_link) {
-            if (ld->node.sched.index > load->node.sched.index) {
-               insert_pos = &ld->reg_link;
-               break;
-            }
-         }
-         list_del(&load->reg_link);
-         list_addtail(&load->reg_link, insert_pos);
-      }
+   /* False dependencies. For value registers, these exist only to make sure
+    * that the maximum pressure isn't exceeded and are hence "fake".
+    */
+   list_for_each_entry_rev(gpir_node, node, &block->node_list, list) {
+      if (node->op == gpir_op_load_reg) {
+         gpir_load_node *load = gpir_node_to_load(node);
+         unsigned index = 4 * load->index + load->component;
+         if (last_written[index]) {
+            gpir_node_add_dep(last_written[index], node,
+                              GPIR_DEP_WRITE_AFTER_READ);
+         }
+      } else {
+         gpir_node_foreach_pred(node, dep) {
+            if (dep->type == GPIR_DEP_INPUT) {
+               int index = dep->pred->value_reg;
+               if (index >= 0 && last_written[index]) {
+                  gpir_node_add_dep(last_written[index], node,
+                                    GPIR_DEP_WRITE_AFTER_READ);
+               }
+            }
+         }
+      }
-
-      /* insert dependency */
-      gpir_store_node *store =
-         list_first_entry(&reg->defs_list, gpir_store_node, reg_link);
-      gpir_store_node *next = store->reg_link.next != &reg->defs_list ?
-         list_first_entry(&store->reg_link, gpir_store_node, reg_link) : NULL;
-
-      list_for_each_entry(gpir_load_node, load, &reg->uses_list, reg_link) {
-         /* loop until load is between store and next */
-         while (next && next->node.sched.index < load->node.sched.index) {
-            store = next;
-            next = store->reg_link.next != &reg->defs_list ?
-               list_first_entry(&store->reg_link, gpir_store_node, reg_link) : NULL;
-         }
-
-         gpir_node_add_dep(&load->node, &store->node, GPIR_DEP_READ_AFTER_WRITE);
-         if (next)
-            gpir_node_add_dep(&next->node, &load->node, GPIR_DEP_WRITE_AFTER_READ);
-      }
-   }
+
+      if (node->value_reg >= 0)
+         last_written[node->value_reg] = node;
+   }
 }
@@ -792,20 +1536,25 @@ bool gpir_schedule_prog(gpir_compiler *comp)
    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
       block->sched.instr_index = 0;
       list_for_each_entry(gpir_node, node, &block->node_list, list) {
-         node->sched.instr = -1;
+         node->sched.instr = NULL;
          node->sched.pos = -1;
         node->sched.index = index++;
         node->sched.dist = -1;
+         /* TODO: when we support multiple basic blocks, we need a way to
+          * keep track of this for physregs allocated before the scheduler.
+          */
+         node->sched.physreg_store = NULL;
         node->sched.ready = false;
         node->sched.inserted = false;
+         node->sched.max_node = false;
+         node->sched.next_max_node = false;
       }
    }
 
-   /* build fake/virtual dependency */
+   /* build dependency */
    list_for_each_entry(gpir_block, block, &comp->block_list, list) {
-      schedule_build_vreg_dependency(block);
+      schedule_build_dependency(block);
    }
-   schedule_build_preg_dependency(comp);
 
 //gpir_debug("after scheduler build reg dependency\n");
 //gpir_node_print_prog_dep(comp);
diff --git a/src/gallium/drivers/lima/meson.build b/src/gallium/drivers/lima/meson.build
index 1db8c7ea8f9..0f13b9c7272 100644
--- a/src/gallium/drivers/lima/meson.build
+++ b/src/gallium/drivers/lima/meson.build
@@ -29,8 +29,7 @@ files_lima = files(
   'ir/gp/codegen.h',
   'ir/gp/codegen.c',
   'ir/gp/reduce_scheduler.c',
-  'ir/gp/value_regalloc.c',
-  'ir/gp/physical_regalloc.c',
+  'ir/gp/regalloc.c',
   'ir/gp/disasm.c',
   'ir/pp/ppir.h',