i965/fs: Improve performance of shaders that start out with a discard.

I had tried this in the past, but ran into trouble with applications that sample from undiscarded pixels in the same subspan. To fix that issue, only jump to the end for an entire subspan at a time. Improves GLbenchmark 2.7 (1024x768) performance by 7.9 +/- 1.5% (n=8). v2: Drop the br variable in the jump instruction -- if I ever do jumps pre-gen6, it'll be a different code block anyway since we don't have HALT until gen6. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
author: Eric Anholt <eric@anholt.net> 2012-12-06 10:15:08 -0800
committer: Eric Anholt <eric@anholt.net> 2012-12-11 10:13:15 -0800
commit: beafced21c3c11315a8b94f20508562729453175 (patch)
tree: 72e4c1b18972ac7f05310eab9a0515cf7e1f61d4
parent: d5016495cc1b50b1673d0d3ab8e6af8249b071d5 (diff)
6 files changed, 148 insertions, 7 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 2b77ae62ff3..40571a4d54d 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -679,6 +679,7 @@ enum opcode {
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
+   FS_OPCODE_DISCARD_JUMP,
 
    VS_OPCODE_URB_WRITE,
    VS_OPCODE_SCRATCH_READ,
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index adefcfd9bad..b2fa44864db 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -1032,6 +1032,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p);
 struct brw_instruction *brw_BREAK(struct brw_compile *p);
 struct brw_instruction *brw_CONT(struct brw_compile *p);
 struct brw_instruction *gen6_CONT(struct brw_compile *p);
+struct brw_instruction *gen6_HALT(struct brw_compile *p);
 /* Forward jumps:
  */
 void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx);
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index fb1255f728c..dd91a3087c8 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -1461,6 +1461,24 @@ struct brw_instruction *brw_CONT(struct brw_compile *p)
    return insn;
 }
 
+struct brw_instruction *gen6_HALT(struct brw_compile *p)
+{
+   struct brw_instruction *insn;
+
+   insn = next_insn(p, BRW_OPCODE_HALT);
+   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
+
+   if (p->compressed) {
+      insn->header.execution_size = BRW_EXECUTE_16;
+   } else {
+      insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.execution_size = BRW_EXECUTE_8;
+   }
+   return insn;
+}
+
 /* DO/WHILE loop:
  *
  * The DO/WHILE is just an unterminated loop -- break or continue are
@@ -2302,8 +2320,8 @@ brw_find_next_block_end(struct brw_compile *p, int start)
 	 return ip;
       }
    }
-   assert(!"not reached");
-   return start + 1;
+
+   return 0;
 }
 
 /* There is no DO instruction on gen6, so to find the end of the loop
@@ -2336,7 +2354,7 @@ brw_find_loop_end(struct brw_compile *p, int start)
 }
 
 /* After program generation, go back and update the UIP and JIP of
- * BREAK and CONT instructions to their correct locations.
+ * BREAK, CONT, and HALT instructions to their correct locations.
  */
 void
 brw_set_uip_jip(struct brw_compile *p)
@@ -2360,24 +2378,45 @@ brw_set_uip_jip(struct brw_compile *p)
 	 continue;
       }
 
+      int block_end_ip = brw_find_next_block_end(p, ip);
       switch (insn->header.opcode) {
       case BRW_OPCODE_BREAK:
-	 insn->bits3.break_cont.jip =
-            (brw_find_next_block_end(p, ip) - ip) / scale;
+         assert(block_end_ip != 0);
+	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
 	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
 	 insn->bits3.break_cont.uip =
 	    (brw_find_loop_end(p, ip) - ip +
              (intel->gen == 6 ? 16 : 0)) / scale;
 	 break;
       case BRW_OPCODE_CONTINUE:
-	 insn->bits3.break_cont.jip =
-            (brw_find_next_block_end(p, ip) - ip) / scale;
+         assert(block_end_ip != 0);
+	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
 	 insn->bits3.break_cont.uip =
             (brw_find_loop_end(p, ip) - ip) / scale;
 
 	 assert(insn->bits3.break_cont.uip != 0);
 	 assert(insn->bits3.break_cont.jip != 0);
 	 break;
+      case BRW_OPCODE_HALT:
+	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
+	  *
+	  *    "In case of the halt instruction not inside any conditional
+	  *     code block, the value of <JIP> and <UIP> should be the
+	  *     same. In case of the halt instruction inside conditional code
+	  *     block, the <UIP> should be the end of the program, and the
+	  *     <JIP> should be end of the most inner conditional code block."
+	  *
+	  * The uip will have already been set by whoever set up the
+	  * instruction.
+	  */
+	 if (block_end_ip == 0) {
+	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
+	 } else {
+	    insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
+	 }
+	 assert(insn->bits3.break_cont.uip != 0);
+	 assert(insn->bits3.break_cont.jip != 0);
+	 break;
       }
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index b00755f953d..d4ddb478fd2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -129,6 +129,26 @@ static const fs_reg reg_undef;
 static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F);
 static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D);
 
+class ip_record : public exec_node {
+public:
+   static void* operator new(size_t size, void *ctx)
+   {
+      void *node;
+
+      node = rzalloc_size(ctx, size);
+      assert(node != NULL);
+
+      return node;
+   }
+
+   ip_record(int ip)
+   {
+      this->ip = ip;
+   }
+
+   int ip;
+};
+
 class fs_inst : public backend_instruction {
 public:
    /* Callers of this ralloc-based new need not call delete. It's
@@ -516,6 +536,9 @@ private:
                                                  struct brw_reg index,
                                                  struct brw_reg offset);
    void generate_mov_dispatch_to_flags(fs_inst *inst);
+   void generate_discard_jump(fs_inst *inst);
+
+   void patch_discard_jumps_to_fb_writes();
 
    struct brw_context *brw;
    struct intel_context *intel;
@@ -530,6 +553,7 @@ private:
 
    unsigned dispatch_width; /**< 8 or 16 */
 
+   exec_list discard_halt_patches;
    bool dual_source_output;
    void *mem_ctx;
 };
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index f185eb52104..9a891414e62 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -61,12 +61,53 @@ fs_generator::~fs_generator()
 }
 
 void
+fs_generator::patch_discard_jumps_to_fb_writes()
+{
+   if (intel->gen < 6 || this->discard_halt_patches.is_empty())
+      return;
+
+   /* There is a somewhat strange undocumented requirement of using
+    * HALT, according to the simulator.  If some channel has HALTed to
+    * a particular UIP, then by the end of the program, every channel
+    * must have HALTed to that UIP.  Furthermore, the tracking is a
+    * stack, so you can't do the final halt of a UIP after starting
+    * halting to a new UIP.
+    *
+    * Symptoms of not emitting this instruction on actual hardware
+    * included GPU hangs and sparkly rendering on the piglit discard
+    * tests.
+    */
+   struct brw_instruction *last_halt = gen6_HALT(p);
+   last_halt->bits3.break_cont.uip = 2;
+   last_halt->bits3.break_cont.jip = 2;
+
+   int ip = p->nr_insn;
+
+   foreach_list(node, &this->discard_halt_patches) {
+      ip_record *patch_ip = (ip_record *)node;
+      struct brw_instruction *patch = &p->store[patch_ip->ip];
+
+      assert(patch->header.opcode == BRW_OPCODE_HALT);
+      /* HALT takes a half-instruction distance from the pre-incremented IP. */
+      patch->bits3.break_cont.uip = (ip - patch_ip->ip) * 2;
+   }
+
+   this->discard_halt_patches.make_empty();
+}
+
+void
 fs_generator::generate_fb_write(fs_inst *inst)
 {
    bool eot = inst->eot;
    struct brw_reg implied_header;
    uint32_t msg_control;
 
+   /* Note that the jumps emitted to this point mean that the g0 ->
+    * base_mrf setup must be inside of this function, so that we jump
+    * to a point containing it.
+    */
+   patch_discard_jumps_to_fb_writes();
+
    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
     * move, here's g1.
     */
@@ -525,6 +566,23 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src
 }
 
 void
+fs_generator::generate_discard_jump(fs_inst *inst)
+{
+   assert(intel->gen >= 6);
+
+   /* This HALT will be patched up at FB write time to point UIP at the end of
+    * the program, and at brw_uip_jip() JIP will be set to the end of the
+    * current block (or the program).
+    */
+   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
+
+   brw_push_insn_state(p);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   gen6_HALT(p);
+   brw_pop_insn_state(p);
+}
+
+void
 fs_generator::generate_spill(fs_inst *inst, struct brw_reg src)
 {
    assert(inst->mlen != 0);
@@ -1085,6 +1143,10 @@ fs_generator::generate_code(exec_list *instructions)
          generate_mov_dispatch_to_flags(inst);
          break;
 
+      case FS_OPCODE_DISCARD_JUMP:
+         generate_discard_jump(inst);
+         break;
+
       case SHADER_OPCODE_SHADER_TIME_ADD:
          brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 98cd064aceb..56d43309f9d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1446,6 +1446,20 @@ fs_visitor::visit(ir_discard *ir)
                            BRW_CONDITIONAL_NZ));
    cmp->predicate = BRW_PREDICATE_NORMAL;
    cmp->flag_subreg = 1;
+
+   if (intel->gen >= 6) {
+      /* For performance, after a discard, jump to the end of the shader.
+       * However, many people will do foliage by discarding based on a
+       * texture's alpha mask, and then continue on to texture with the
+       * remaining pixels.  To avoid trashing the derivatives for those
+       * texture samples, we'll only jump if all of the pixels in the subspan
+       * have been discarded.
+       */
+      fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
+      discard_jump->flag_subreg = 1;
+      discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
+      discard_jump->predicate_inverse = true;
+   }
 }
 
 void
author	Eric Anholt <eric@anholt.net>	2012-12-06 10:15:08 -0800
committer	Eric Anholt <eric@anholt.net>	2012-12-11 10:13:15 -0800
commit	beafced21c3c11315a8b94f20508562729453175 (patch)
tree	72e4c1b18972ac7f05310eab9a0515cf7e1f61d4
parent	d5016495cc1b50b1673d0d3ab8e6af8249b071d5 (diff)