intel/fs: Emit HALT for discard on Gen4-5

Using HALT to immediately jump to the end of the shader is required to implement GL_EXT_gpu_shader4 and OpenGL 3.0. However, vanilla OpenGL 1.2 doesn't forbid it and it likely makes something somewhere faster. We should be consistent and implement the same discard behavior on all hardware if we can. The rules for HALT on Gen4-5 are a bit different from Gen6+. On the older hardware, there is no stack for HALT; instead it's up to software to save and restore mask registers. However, there's no real saving needed since we only use HALT to jump to the end of the program where we're about about to do our FB writes. All we need to do is reset AMask to DMask, the value it was initialized to at the start of the thread. Reviewed-by: Kenneth Graunke <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5244>
author: Jason Ekstrand <[email protected]> 2020-04-25 14:59:30 -0500
committer: Marge Bot <[email protected]> 2020-05-30 06:21:15 +0000
commit: c48f42e178a1cc484870367c0cfe5fbbf71d86cc (patch)
tree: 306f90e708bf4adafddb2c0b23e0ab325dcc87ad
parent: 94aa7997e45b5314d169bbee5bf22ad368c2fd25 (diff)
6 files changed, 107 insertions, 30 deletions
diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c
index ff46cb9549a..486f55d7bec 100644
--- a/src/intel/compiler/brw_disasm.c
+++ b/src/intel/compiler/brw_disasm.c
@@ -708,6 +708,9 @@ reg(FILE *file, unsigned _reg_file, unsigned _reg_nr)
          format(file, "mask%d", _reg_nr & 0x0f);
          break;
       case BRW_ARF_MASK_STACK:
+         format(file, "ms%d", _reg_nr & 0x0f);
+         break;
+      case BRW_ARF_MASK_STACK_DEPTH:
          format(file, "msd%d", _reg_nr & 0x0f);
          break;
       case BRW_ARF_STATE:
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 262c527b2e9..9ed481e10e6 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1102,7 +1102,7 @@ brw_inst *brw_WHILE(struct brw_codegen *p);
 
 brw_inst *brw_BREAK(struct brw_codegen *p);
 brw_inst *brw_CONT(struct brw_codegen *p);
-brw_inst *gen6_HALT(struct brw_codegen *p);
+brw_inst *brw_HALT(struct brw_codegen *p);
 
 /* Forward jumps:
  */
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 12042131999..cc1bc8cc13f 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1725,14 +1725,23 @@ brw_CONT(struct brw_codegen *p)
 }
 
 brw_inst *
-gen6_HALT(struct brw_codegen *p)
+brw_HALT(struct brw_codegen *p)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    brw_inst *insn;
 
    insn = next_insn(p, BRW_OPCODE_HALT);
    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
-   if (devinfo->gen < 8) {
+   if (devinfo->gen < 6) {
+      /* From the Gen4 PRM:
+       *
+       *    "IP register must be put (for example, by the assembler) at <dst>
+       *    and <src0> locations.
+       */
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(p, insn, brw_ip_reg());
+      brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
+   } else if (devinfo->gen < 8) {
       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
    } else if (devinfo->gen < 12) {
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 5825e0770d4..bee9816d815 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -224,25 +224,27 @@ public:
 bool
 fs_generator::patch_discard_jumps_to_fb_writes()
 {
-   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
+   if (this->discard_halt_patches.is_empty())
       return false;
 
    int scale = brw_jump_scale(p->devinfo);
 
-   /* There is a somewhat strange undocumented requirement of using
-    * HALT, according to the simulator.  If some channel has HALTed to
-    * a particular UIP, then by the end of the program, every channel
-    * must have HALTed to that UIP.  Furthermore, the tracking is a
-    * stack, so you can't do the final halt of a UIP after starting
-    * halting to a new UIP.
-    *
-    * Symptoms of not emitting this instruction on actual hardware
-    * included GPU hangs and sparkly rendering on the piglit discard
-    * tests.
-    */
-   brw_inst *last_halt = gen6_HALT(p);
-   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
-   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+   if (devinfo->gen >= 6) {
+      /* There is a somewhat strange undocumented requirement of using
+       * HALT, according to the simulator.  If some channel has HALTed to
+       * a particular UIP, then by the end of the program, every channel
+       * must have HALTed to that UIP.  Furthermore, the tracking is a
+       * stack, so you can't do the final halt of a UIP after starting
+       * halting to a new UIP.
+       *
+       * Symptoms of not emitting this instruction on actual hardware
+       * included GPU hangs and sparkly rendering on the piglit discard
+       * tests.
+       */
+      brw_inst *last_halt = brw_HALT(p);
+      brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
+      brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+   }
 
    int ip = p->nr_insn;
 
@@ -250,11 +252,67 @@ fs_generator::patch_discard_jumps_to_fb_writes()
       brw_inst *patch = &p->store[patch_ip->ip];
 
       assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
-      /* HALT takes a half-instruction distance from the pre-incremented IP. */
-      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+      if (devinfo->gen >= 6) {
+         /* HALT takes a half-instruction distance from the pre-incremented IP. */
+         brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+      } else {
+         brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale));
+      }
    }
 
    this->discard_halt_patches.make_empty();
+
+   if (devinfo->gen < 6) {
+      /* From the g965 PRM:
+       *
+       *    "As DMask is not automatically reloaded into AMask upon completion
+       *    of this instruction, software has to manually restore AMask upon
+       *    completion."
+       *
+       * DMask lives in the bottom 16 bits of sr0.1.
+       */
+      brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK),
+                                   retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW));
+      brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1);
+      brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE);
+      brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE);
+      brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH);
+   }
+
+   if (devinfo->gen == 4 && !devinfo->is_g4x) {
+      /* From the g965 PRM:
+       *
+       *    "[DevBW, DevCL] Erratum: The subfields in mask stack register are
+       *    reset to zero during graphics reset, however, they are not
+       *    initialized at thread dispatch. These subfields will retain the
+       *    values from the previous thread. Software should make sure the
+       *    mask stack is empty (reset to zero) before terminating the thread.
+       *    In case that this is not practical, software may have to reset the
+       *    mask stack at the beginning of each kernel, which will impact the
+       *    performance."
+       *
+       * Luckily we can rely on:
+       *
+       *    "[DevBW, DevCL] This register access restriction is not
+       *    applicable, hardware does ensure execution pipeline coherency,
+       *    when a mask stack register is used as an explicit source and/or
+       *    destination."
+       */
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+      brw_set_default_exec_size(p, BRW_EXECUTE_2);
+      brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0));
+
+      brw_set_default_exec_size(p, BRW_EXECUTE_16);
+      /* Reset the if stack. */
+      brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW),
+              brw_imm_uw(0));
+
+      brw_pop_insn_state(p);
+   }
+
    return true;
 }
 
@@ -1362,14 +1420,12 @@ fs_generator::generate_ddy(const fs_inst *inst,
 void
 fs_generator::generate_discard_jump(fs_inst *)
 {
-   assert(devinfo->gen >= 6);
-
    /* This HALT will be patched up at FB write time to point UIP at the end of
     * the program, and at brw_uip_jip() JIP will be set to the end of the
     * current block (or the program).
     */
    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
-   gen6_HALT(p);
+   brw_HALT(p);
 }
 
 void
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 8916f9b876e..37c1d2f4cc9 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3482,13 +3482,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
       cmp->predicate = BRW_PREDICATE_NORMAL;
       cmp->flag_subreg = sample_mask_flag_subreg(this);
 
-      if (devinfo->gen >= 6) {
-         /* Due to the way we implement discard, the jump will only happen
-          * when the whole quad is discarded.  So we can do this even for
-          * demote as it won't break its uniformity promises.
-          */
-         emit_discard_jump();
-      }
+      emit_discard_jump();
 
       if (devinfo->gen < 7)
          limit_dispatch_width(
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
index 865cd9e0f36..620b8722d92 100644
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@@ -940,6 +940,21 @@ brw_dmask_reg()
 }
 
 static inline struct brw_reg
+brw_mask_stack_reg(unsigned subnr)
+{
+   return suboffset(retype(brw_vec16_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                                         BRW_ARF_MASK_STACK, 0),
+                           BRW_REGISTER_TYPE_UB), subnr);
+}
+
+static inline struct brw_reg
+brw_mask_stack_depth_reg(unsigned subnr)
+{
+   return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                      BRW_ARF_MASK_STACK_DEPTH, subnr);
+}
+
+static inline struct brw_reg
 brw_message_reg(unsigned nr)
 {
    return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, nr, 0);
author	Jason Ekstrand <[email protected]>	2020-04-25 14:59:30 -0500
committer	Marge Bot <[email protected]>	2020-05-30 06:21:15 +0000
commit	c48f42e178a1cc484870367c0cfe5fbbf71d86cc (patch)
tree	306f90e708bf4adafddb2c0b23e0ab325dcc87ad
parent	94aa7997e45b5314d169bbee5bf22ad368c2fd25 (diff)