From 99b2c8570ea6f46c6564681631f0e0750a0641cc Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 19 Oct 2010 09:25:51 -0700
Subject: i965: Add support for register spilling.

It can be tested with if (0) replaced with if (1) to force spilling for all
virtual GRFs.  Some simple tests work, but large texturing tests fail.
---
 src/mesa/drivers/dri/i965/brw_eu.h                |  15 +-
 src/mesa/drivers/dri/i965/brw_eu_emit.c           |  96 +++++++++----
 src/mesa/drivers/dri/i965/brw_fs.cpp              |  73 +++++++++-
 src/mesa/drivers/dri/i965/brw_fs.h                |  15 +-
 src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 163 +++++++++++++++++++++-
 src/mesa/drivers/dri/i965/brw_wm_emit.c           |   8 +-
 6 files changed, 315 insertions(+), 55 deletions(-)

(limited to 'src/mesa/drivers/dri')

diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 8ffa7c760ac..0e3ccfa46c8 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -897,9 +897,11 @@ void brw_math2(struct brw_compile *p,
 	       struct brw_reg src0,
 	       struct brw_reg src1);
 
-void brw_dp_READ_16( struct brw_compile *p,
-		     struct brw_reg dest,
-		     GLuint scratch_offset );
+void brw_oword_block_read(struct brw_compile *p,
+			  struct brw_reg dest,
+			  struct brw_reg mrf,
+			  int num_regs,
+			  GLuint offset);
 
 void brw_dp_READ_4( struct brw_compile *p,
                     struct brw_reg dest,
@@ -918,9 +920,10 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
 			       GLuint offset,
 			       GLuint bind_table_index);
 
-void brw_dp_WRITE_16( struct brw_compile *p,
-		      struct brw_reg src,
-		      GLuint scratch_offset );
+void brw_oword_block_write(struct brw_compile *p,
+			   struct brw_reg mrf,
+			   int num_regs,
+			   GLuint offset);
 
 /* If/else/endif.  Works by manipulating the execution flags on each
  * channel.
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 399b99c9607..63ab5c26f36 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -1353,16 +1353,29 @@ void brw_math_16( struct brw_compile *p,
 
 
 /**
- * Write block of 16 dwords/floats to the data port Render Cache scratch buffer.
- * Scratch offset should be a multiple of 64.
- * Used for register spilling.
+ * Write a block of OWORDs (half a GRF each) from the scratch buffer,
+ * using a constant offset per channel.
+ *
+ * The offset must be aligned to oword size (16 bytes).  Used for
+ * register spilling.
  */
-void brw_dp_WRITE_16( struct brw_compile *p,
-		      struct brw_reg src,
-		      GLuint scratch_offset )
+void brw_oword_block_write(struct brw_compile *p,
+			   struct brw_reg mrf,
+			   int num_regs,
+			   GLuint offset)
 {
    struct intel_context *intel = &p->brw->intel;
-   GLuint msg_reg_nr = 1;
+   uint32_t msg_control;
+   int mlen;
+
+   if (num_regs == 1) {
+      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
+      mlen = 2;
+   } else {
+      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
+      mlen = 3;
+   }
+
    {
       brw_push_insn_state(p);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
@@ -1371,20 +1384,24 @@ void brw_dp_WRITE_16( struct brw_compile *p,
       /* set message header global offset field (reg 0, element 2) */
       brw_MOV(p,
 	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
-	      brw_imm_d(scratch_offset));
+	      brw_imm_d(offset));
 
       brw_pop_insn_state(p);
    }
 
    {
-      GLuint msg_length = 3;
       struct brw_reg dest;
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
       int send_commit_msg;
+      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
+					 BRW_REGISTER_TYPE_UW);
 
-      insn->header.predicate_control = 0; /* XXX */
-      insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditionalmod = msg_reg_nr;
+      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
+	 insn->header.compression_control = BRW_COMPRESSION_NONE;
+	 src_header = vec16(src_header);
+      }
+      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
+      insn->header.destreg__conditionalmod = mrf.nr;
 
       /* Until gen6, writes followed by reads from the same location
        * are not guaranteed to be ordered unless write_commit is set.
@@ -1400,19 +1417,19 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
 	 send_commit_msg = 0;
       } else {
-	 dest = brw_uw16_grf(0, 0);
+	 dest = src_header;
 	 send_commit_msg = 1;
       }
 
       brw_set_dest(insn, dest);
-      brw_set_src0(insn, src);
+      brw_set_src0(insn, src_header);
 
       brw_set_dp_write_message(p->brw,
 			       insn,
 			       255, /* binding table index (255=stateless) */
-			       BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
+			       msg_control,
 			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
-			       msg_length,
+			       mlen,
 			       GL_TRUE, /* header_present */
 			       0, /* pixel scoreboard */
 			       send_commit_msg, /* response_length */
@@ -1423,15 +1440,32 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 
 
 /**
- * Read block of 16 dwords/floats from the data port Render Cache scratch buffer.
- * Scratch offset should be a multiple of 64.
- * Used for register spilling.
+ * Read a block of owords (half a GRF each) from the scratch buffer
+ * using a constant index per channel.
+ *
+ * Offset must be aligned to oword size (16 bytes).  Used for register
+ * spilling.
  */
-void brw_dp_READ_16( struct brw_compile *p,
-		      struct brw_reg dest,
-		      GLuint scratch_offset )
+void
+brw_oword_block_read(struct brw_compile *p,
+		     struct brw_reg dest,
+		     struct brw_reg mrf,
+		     int num_regs,
+		     GLuint offset)
 {
-   GLuint msg_reg_nr = 1;
+   uint32_t msg_control;
+   int rlen;
+
+   dest = retype(dest, BRW_REGISTER_TYPE_UW);
+
+   if (num_regs == 1) {
+      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
+      rlen = 1;
+   } else {
+      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
+      rlen = 2;
+   }
+
    {
       brw_push_insn_state(p);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
@@ -1440,29 +1474,29 @@ void brw_dp_READ_16( struct brw_compile *p,
       /* set message header global offset field (reg 0, element 2) */
       brw_MOV(p,
 	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
-	      brw_imm_d(scratch_offset));
+	      brw_imm_d(offset));
 
       brw_pop_insn_state(p);
    }
 
    {
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   
-      insn->header.predicate_control = 0; /* XXX */
-      insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditionalmod = msg_reg_nr;
-  
+
+      assert(insn->header.predicate_control == 0);
+      insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.destreg__conditionalmod = mrf.nr;
+
       brw_set_dest(insn, dest);	/* UW? */
       brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
 
       brw_set_dp_read_message(p->brw,
 			      insn,
 			      255, /* binding table index (255=stateless) */
-			      BRW_DATAPORT_OWORD_BLOCK_4_OWORDS,
+			      msg_control,
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
 			      1, /* target cache (render/scratch) */
 			      1, /* msg_length */
-			      2, /* response_length */
+			      rlen,
 			      0); /* eot */
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index bc39d1c29a1..f38976aa8a8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -179,10 +179,6 @@ type_size(const struct glsl_type *type)
    }
 }
 
-static const fs_reg reg_undef;
-static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F);
-static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D);
-
 int
 fs_visitor::virtual_grf_alloc(int size)
 {
@@ -2226,6 +2222,47 @@ fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
    brw_pop_insn_state(p);
 }
 
+void
+fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
+{
+   assert(inst->mlen != 0);
+
+   brw_MOV(p,
+	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
+	   retype(src, BRW_REGISTER_TYPE_UD));
+   brw_oword_block_write(p, brw_message_reg(inst->base_mrf), 1, inst->offset);
+}
+
+void
+fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
+{
+   assert(inst->mlen != 0);
+
+   /* Clear any post destination dependencies that would be ignored by
+    * the block read.  See the B-Spec for pre-gen5 send instruction.
+    *
+    * This could use a better solution, since texture sampling and
+    * math reads could potentially run into it as well -- anywhere
+    * that we have a SEND with a destination that is a register that
+    * was written but not read within the last N instructions (what's
+    * N?  unsure).  This is rare because of dead code elimination, but
+    * not impossible.
+    */
+   if (intel->gen == 4 && !intel->is_g4x)
+      brw_MOV(p, brw_null_reg(), dst);
+
+   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 1,
+			inst->offset);
+
+   if (intel->gen == 4 && !intel->is_g4x) {
+      /* gen4 errata: destination from a send can't be used as a
+       * destination until it's been read.  Just read it so we don't
+       * have to worry.
+       */
+      brw_MOV(p, brw_null_reg(), dst);
+   }
+}
+
 void
 fs_visitor::assign_curb_setup()
 {
@@ -3040,6 +3077,15 @@ fs_visitor::generate_code()
       case FS_OPCODE_DDY:
 	 generate_ddy(inst, dst, src[0]);
 	 break;
+
+      case FS_OPCODE_SPILL:
+	 generate_spill(inst, src[0]);
+	 break;
+
+      case FS_OPCODE_UNSPILL:
+	 generate_unspill(inst, dst);
+	 break;
+
       case FS_OPCODE_FB_WRITE:
 	 generate_fb_write(inst);
 	 break;
@@ -3145,10 +3191,25 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
 	 progress = v.dead_code_eliminate() || progress;
       } while (progress);
 
+      if (0) {
+	 /* Debug of register spilling: Go spill everything. */
+	 int virtual_grf_count = v.virtual_grf_next;
+	 for (int i = 1; i < virtual_grf_count; i++) {
+	    v.spill_reg(i);
+	 }
+	 v.calculate_live_intervals();
+      }
+
       if (0)
 	 v.assign_regs_trivial();
-      else
-	 v.assign_regs();
+      else {
+	 while (!v.assign_regs()) {
+	    if (v.fail)
+	       break;
+
+	    v.calculate_live_intervals();
+	 }
+      }
    }
 
    if (!v.fail)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index b8126083f9e..de7137a7db3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -74,6 +74,8 @@ enum fs_opcodes {
    FS_OPCODE_TXL,
    FS_OPCODE_DISCARD_NOT,
    FS_OPCODE_DISCARD_AND,
+   FS_OPCODE_SPILL,
+   FS_OPCODE_UNSPILL,
 };
 
 
@@ -196,6 +198,7 @@ public:
       this->shadow_compare = false;
       this->mlen = 0;
       this->base_mrf = 0;
+      this->offset = 0;
    }
 
    fs_inst()
@@ -281,6 +284,7 @@ public:
    bool eot;
    bool header_present;
    bool shadow_compare;
+   uint32_t offset; /* spill/unspill offset */
 
    /** @{
     * Annotation for the generated IR.  One of the two can be set.
@@ -359,8 +363,10 @@ public:
    void assign_curb_setup();
    void calculate_urb_setup();
    void assign_urb_setup();
-   void assign_regs();
+   bool assign_regs();
    void assign_regs_trivial();
+   int choose_spill_reg(struct ra_graph *g);
+   void spill_reg(int spill_reg);
    void split_virtual_grfs();
    void calculate_live_intervals();
    bool propagate_constants();
@@ -378,6 +384,8 @@ public:
    void generate_discard_and(fs_inst *inst, struct brw_reg temp);
    void generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src);
    void generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src);
+   void generate_spill(fs_inst *inst, struct brw_reg src);
+   void generate_unspill(fs_inst *inst, struct brw_reg dst);
 
    void emit_dummy_fs();
    fs_reg *emit_fragcoord_interpolation(ir_variable *ir);
@@ -391,6 +399,7 @@ public:
    fs_inst *emit_math(fs_opcodes op, fs_reg dst, fs_reg src0, fs_reg src1);
    void emit_bool_to_cond_code(ir_rvalue *condition);
    void emit_if_gen6(ir_if *ir);
+   void emit_unspill(fs_inst *inst, fs_reg reg, uint32_t spill_offset);
 
    void emit_fb_writes();
    void emit_assignment_writes(fs_reg &l, fs_reg &r,
@@ -444,5 +453,9 @@ public:
    int grf_used;
 };
 
+static const fs_reg reg_undef;
+static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F);
+static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D);
+
 GLboolean brw_do_channel_expressions(struct exec_list *instructions);
 GLboolean brw_do_vector_splitting(struct exec_list *instructions);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 2dda8acb387..b5bfd00d5fe 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -84,7 +84,7 @@ fs_visitor::assign_regs_trivial()
    this->grf_used = last_grf + 1;
 }
 
-void
+bool
 fs_visitor::assign_regs()
 {
    int last_grf = 0;
@@ -220,11 +220,22 @@ fs_visitor::assign_regs()
       }
    }
 
-   /* FINISHME: Handle spilling */
    if (!ra_allocate_no_spills(g)) {
-      fprintf(stderr, "Failed to allocate registers.\n");
-      this->fail = true;
-      return;
+      /* Failed to allocate registers.  Spill a reg, and the caller will
+       * loop back into here to try again.
+       */
+      int reg = choose_spill_reg(g);
+      if (reg == -1) {
+	 this->fail = true;
+      } else {
+	 spill_reg(reg);
+      }
+
+
+      talloc_free(g);
+      talloc_free(regs);
+
+      return false;
    }
 
    /* Get the chosen virtual registers for each node, and map virtual
@@ -262,4 +273,146 @@ fs_visitor::assign_regs()
 
    talloc_free(g);
    talloc_free(regs);
+
+   return true;
+}
+
+void
+fs_visitor::emit_unspill(fs_inst *inst, fs_reg dst, uint32_t spill_offset)
+{
+   int size = virtual_grf_sizes[dst.reg];
+   dst.reg_offset = 0;
+
+   for (int chan = 0; chan < size; chan++) {
+      fs_inst *unspill_inst = new(mem_ctx) fs_inst(FS_OPCODE_UNSPILL,
+						   dst);
+      dst.reg_offset++;
+      unspill_inst->offset = spill_offset + chan * REG_SIZE;
+      unspill_inst->ir = inst->ir;
+      unspill_inst->annotation = inst->annotation;
+
+      /* Choose a MRF that won't conflict with an MRF that's live across the
+       * spill.  Nothing else will make it up to MRF 14/15.
+       */
+      unspill_inst->base_mrf = 14;
+      unspill_inst->mlen = 1; /* header contains offset */
+      inst->insert_before(unspill_inst);
+   }
+}
+
+int
+fs_visitor::choose_spill_reg(struct ra_graph *g)
+{
+   float loop_scale = 1.0;
+   float spill_costs[this->virtual_grf_next];
+   bool no_spill[this->virtual_grf_next];
+
+   for (int i = 0; i < this->virtual_grf_next; i++) {
+      spill_costs[i] = 0.0;
+      no_spill[i] = false;
+   }
+
+   /* Calculate costs for spilling nodes.  Call it a cost of 1 per
+    * spill/unspill we'll have to do, and guess that the insides of
+    * loops run 10 times.
+    */
+   foreach_iter(exec_list_iterator, iter, this->instructions) {
+      fs_inst *inst = (fs_inst *)iter.get();
+
+      for (unsigned int i = 0; i < 3; i++) {
+	 if (inst->src[i].file == GRF) {
+	    int size = virtual_grf_sizes[inst->src[i].reg];
+	    spill_costs[inst->src[i].reg] += size * loop_scale;
+	 }
+      }
+
+      if (inst->dst.file == GRF) {
+	 int size = virtual_grf_sizes[inst->dst.reg];
+	 spill_costs[inst->dst.reg] += size * loop_scale;
+      }
+
+      switch (inst->opcode) {
+
+      case BRW_OPCODE_DO:
+	 loop_scale *= 10;
+	 break;
+
+      case BRW_OPCODE_WHILE:
+	 loop_scale /= 10;
+	 break;
+
+      case FS_OPCODE_SPILL:
+	 if (inst->src[0].file == GRF)
+	    no_spill[inst->src[0].reg] = true;
+	 break;
+
+      case FS_OPCODE_UNSPILL:
+	 if (inst->dst.file == GRF)
+	    no_spill[inst->dst.reg] = true;
+	 break;
+      }
+   }
+
+   for (int i = 0; i < this->virtual_grf_next; i++) {
+      if (!no_spill[i])
+	 ra_set_node_spill_cost(g, i, spill_costs[i]);
+   }
+
+   return ra_get_best_spill_node(g);
+}
+
+void
+fs_visitor::spill_reg(int spill_reg)
+{
+   int size = virtual_grf_sizes[spill_reg];
+   unsigned int spill_offset = c->last_scratch;
+   assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
+   c->last_scratch += size * REG_SIZE;
+
+   /* Generate spill/unspill instructions for the objects being
+    * spilled.  Right now, we spill or unspill the whole thing to a
+    * virtual grf of the same size.  For most instructions, though, we
+    * could just spill/unspill the GRF being accessed.
+    */
+   foreach_iter(exec_list_iterator, iter, this->instructions) {
+      fs_inst *inst = (fs_inst *)iter.get();
+
+      for (unsigned int i = 0; i < 3; i++) {
+	 if (inst->src[i].file == GRF &&
+	     inst->src[i].reg == spill_reg) {
+	    inst->src[i].reg = virtual_grf_alloc(size);
+	    emit_unspill(inst, inst->src[i], spill_offset);
+	 }
+      }
+
+      if (inst->dst.file == GRF &&
+	  inst->dst.reg == spill_reg) {
+	 inst->dst.reg = virtual_grf_alloc(size);
+
+	 /* Since we spill/unspill the whole thing even if we access
+	  * just a component, we may need to unspill before the
+	  * instruction we're spilling for.
+	  */
+	 if (size != 1 || inst->predicated) {
+	    emit_unspill(inst, inst->dst, spill_offset);
+	 }
+
+	 fs_reg spill_src = inst->dst;
+	 spill_src.reg_offset = 0;
+	 spill_src.abs = false;
+	 spill_src.negate = false;
+
+	 for (int chan = 0; chan < size; chan++) {
+	    fs_inst *spill_inst = new(mem_ctx) fs_inst(FS_OPCODE_SPILL,
+						       reg_null_f, spill_src);
+	    spill_src.reg_offset++;
+	    spill_inst->offset = spill_offset + chan * REG_SIZE;
+	    spill_inst->ir = inst->ir;
+	    spill_inst->annotation = inst->annotation;
+	    spill_inst->base_mrf = 14;
+	    spill_inst->mlen = 2; /* header, value */
+	    inst->insert_after(spill_inst);
+	 }
+      }
+   }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index cb71c665b47..88bc64e5dd9 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -1576,9 +1576,7 @@ static void emit_spill( struct brw_wm_compile *c,
      mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
      send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
    */
-   brw_dp_WRITE_16(p, 
-		   retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
-		   slot);
+   brw_oword_block_write(p, brw_message_reg(1), 2, slot);
 }
 
 
@@ -1603,9 +1601,7 @@ static void emit_unspill( struct brw_wm_compile *c,
      send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
    */
 
-   brw_dp_READ_16(p,
-		  retype(vec16(reg), BRW_REGISTER_TYPE_UW),
-		  slot);
+   brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
 }
 
 
-- 
cgit v1.2.3