intel/fs: Handle surface opcode sample masks via predication.

The main motivation is to enable HDC surface opcodes on ICL, which no longer allows the sample mask to be provided in a message header. However, this is enabled all the way back to IVB when possible, because it significantly decreases the instruction count of some shaders using HDC messages. E.g. one of the SynMark2 CSDof compute shaders decreases instruction count by about 40% due to the removal of header setup boilerplate, which in turn makes a number of send message payloads more easily CSE-able.

Shader-db results on SKL:

  total instructions in shared programs: 15325319 -> 15314384 (-0.07%)
  instructions in affected programs: 311532 -> 300597 (-3.51%)
  helped: 491
  HURT: 1

Shader-db results on BDW, where the optimization needs to be disabled in some cases due to hardware restrictions:

  total instructions in shared programs: 15604794 -> 15598028 (-0.04%)
  instructions in affected programs: 220863 -> 214097 (-3.06%)
  helped: 351
  HURT: 0

The FPS of SynMark2 CSDof improves by 5.09% ±0.36% (n=10) on my SKL laptop with this change. According to Eero, this improves performance of the same test by 9% on BYT and by 7-8% on BXT J4205 and on SKL GT2 desktop.

Reviewed-by: Kenneth Graunke <[email protected]>
Tested-By: Eero Tamminen <[email protected]>
author: Francisco Jerez <[email protected]> 2017-12-12 12:05:04 -0800
committer: Francisco Jerez <[email protected]> 2018-03-02 11:28:56 -0800
commit: c063e88909e630bb4605037eb0fc072f40f8c2a2 (patch)
tree: b67886f38f7584466647081a28209d65a772378b /src/intel/compiler/brw_fs.cpp
parent: e7c9adca5726a8c96de20ae7c5f21a30061db392 (diff)
1 files changed, 42 insertions, 1 deletions
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index c255a3b23b5..b1e1d98f6e6 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -4460,6 +4460,8 @@ static void
 lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
                            const fs_reg &sample_mask)
 {
+   const gen_device_info *devinfo = bld.shader->devinfo;
+
    /* Get the logical send arguments. */
    const fs_reg &addr = inst->src[0];
    const fs_reg &src = inst->src[1];
@@ -4470,7 +4472,20 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
    /* Calculate the total number of components of the payload. */
    const unsigned addr_sz = inst->components_read(0);
    const unsigned src_sz = inst->components_read(1);
-   const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
+   /* From the BDW PRM Volume 7, page 147:
+    *
+    *  "For the Data Cache Data Port*, the header must be present for the
+    *   following message types: [...] Typed read/write/atomics"
+    *
+    * Earlier generations have a similar wording.  Because of this restriction
+    * we don't attempt to implement sample masks via predication for such
+    * messages prior to Gen9, since we have to provide a header anyway.  On
+    * Gen11+ the header has been removed so we can only use predication.
+    */
+   const unsigned header_sz = devinfo->gen < 9 &&
+                              (op == SHADER_OPCODE_TYPED_SURFACE_READ ||
+                               op == SHADER_OPCODE_TYPED_SURFACE_WRITE ||
+                               op == SHADER_OPCODE_TYPED_ATOMIC) ? 1 : 0;
    const unsigned sz = header_sz + addr_sz + src_sz;
 
    /* Allocate space for the payload. */
@@ -4490,6 +4505,32 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
 
    bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
 
+   /* Predicate the instruction on the sample mask if no header is
+    * provided.
+    */
+   if (!header_sz && sample_mask.file != BAD_FILE &&
+       sample_mask.file != IMM) {
+      const fs_builder ubld = bld.group(1, 0).exec_all();
+      if (inst->predicate) {
+         assert(inst->predicate == BRW_PREDICATE_NORMAL);
+         assert(!inst->predicate_inverse);
+         assert(inst->flag_subreg < 2);
+         /* Combine the sample mask with the existing predicate by using a
+          * vertical predication mode.
+          */
+         inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
+         ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg + 2),
+                         sample_mask.type),
+                  sample_mask);
+      } else {
+         inst->flag_subreg = 2;
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         inst->predicate_inverse = false;
+         ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type),
+                  sample_mask);
+      }
+   }
+
    /* Update the original instruction. */
    inst->opcode = op;
    inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
author	Francisco Jerez <[email protected]>	2017-12-12 12:05:04 -0800
committer	Francisco Jerez <[email protected]>	2018-03-02 11:28:56 -0800
commit	c063e88909e630bb4605037eb0fc072f40f8c2a2 (patch)
tree	b67886f38f7584466647081a28209d65a772378b /src/intel/compiler/brw_fs.cpp
parent	e7c9adca5726a8c96de20ae7c5f21a30061db392 (diff)