intel/fs/gen7+: Implement discard/demote for SIMD32 programs.

At this point this simply involves fixing the initialization of the sample mask flag register to take the right dispatch mask from the thread payload, and fixing sample_mask_reg() to return f1.1 for the second half of a SIMD32 thread.

This improves Manhattan 3.1 performance by 2.4%±0.31% (N>40) on my ICL with SIMD32 enabled relative to falling back to SIMD16 for the shaders that use discard.

Reviewed-by: Kenneth Graunke <[email protected]>
author: Francisco Jerez <[email protected]> 2020-01-04 16:16:24 -0800
committer: Francisco Jerez <[email protected]> 2020-02-14 14:31:49 -0800
commit: 8d3b86e34a7b0f77613c7f5669891e54d76f0cbf (patch)
tree: 9be8a8bc1d12f5084df7c66b51660eda136a599a /src/intel/compiler
parent: 04c7d3d4b19610cae8250102fefd0012b7233d9e (diff)
2 files changed, 14 insertions, 8 deletions
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 963d1c18155..fd9217b24b2 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -4285,8 +4285,8 @@ sample_mask_reg(const fs_builder &bld)
    if (v->stage != MESA_SHADER_FRAGMENT) {
       return brw_imm_ud(0xffffffff);
    } else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
-      assert(bld.group() < 16 && bld.dispatch_width() <= 16);
-      return brw_flag_subreg(sample_mask_flag_subreg(v));
+      assert(bld.dispatch_width() <= 16);
+      return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16);
    } else {
       assert(v->devinfo->gen >= 6 && bld.dispatch_width() <= 16);
       return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
@@ -8171,11 +8171,15 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
        * Initialize it with the dispatched pixels.
        */
       if (wm_prog_data->uses_kill) {
-         const fs_reg dispatch_mask =
-            devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0);
-         bld.exec_all().group(1, 0)
-            .MOV(sample_mask_reg(bld),
-                 retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
+         const unsigned lower_width = MIN2(dispatch_width, 16);
+         for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
+            const fs_reg dispatch_mask =
+               devinfo->gen >= 6 ? brw_vec1_grf((i ? 2 : 1), 7) :
+               brw_vec1_grf(0, 0);
+            bld.exec_all().group(1, 0)
+               .MOV(sample_mask_reg(bld.group(lower_width, i)),
+                    retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
+         }
       }
 
       emit_nir_code();
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 5d66ead4a24..3b34c407f51 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3562,7 +3562,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
          emit_discard_jump();
       }
 
-      limit_dispatch_width(16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
+      if (devinfo->gen < 7)
+         limit_dispatch_width(
+            16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
       break;
    }
author	Francisco Jerez <[email protected]>	2020-01-04 16:16:24 -0800
committer	Francisco Jerez <[email protected]>	2020-02-14 14:31:49 -0800
commit	8d3b86e34a7b0f77613c7f5669891e54d76f0cbf (patch)
tree	9be8a8bc1d12f5084df7c66b51660eda136a599a /src/intel/compiler
parent	04c7d3d4b19610cae8250102fefd0012b7233d9e (diff)