From b56fa830c6095f8226456b2aeb62f2dfad804be5 Mon Sep 17 00:00:00 2001
From: Francisco Jerez <currojerez@riseup.net>
Date: Thu, 8 Dec 2016 19:18:00 -0800
Subject: i965/fs: Fetch one cacheline of pull constants at a time.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Asking the DC for less than one cacheline (4 owords) of data for
uniform pull constants is suboptimal because the DC cannot request
less than that from L3, resulting in wasted bandwidth and unnecessary
message dispatch overhead, and exacerbating the IVB L3 serialization
bug.  The following table summarizes the overall framerate improvement
(with statistical significance of 5% and sample size ~10) from the
whole series up to this patch for several benchmarks and hardware
generations:

                         | SKL           | BDW          | HSW
SynMark2 OglShMapPcf     | 24.63% ±0.45% | 4.01% ±0.70% | 10.31% ±0.38%
GfxBench4 gl_manhattan31 |  5.93% ±0.35% | 3.92% ±0.31% |  6.62% ±0.22%
GfxBench4 gl_4           |  2.52% ±0.44% | 1.23% ±0.10% |      N/A
Unigine Valley           |  0.83% ±0.17% | 0.23% ±0.05% |  0.74% ±0.45%

Note that there are two versions of the Manhattan demo shipped with
GfxBench4: one is the original gl_manhattan demo, which doesn't use
UBOs and so is unaffected by this patch, and the other is the
gl_manhattan31 demo based on GL 4.3/GLES 3.1, which this patch
benefits as shown above.

I haven't observed any statistically significant regressions in the
benchmarks I have at hand.  Note that the comparatively huge
improvement on SKL in the OglShMapPcf test case is due to the combined
effect of this patch and the register pressure benefit on SKL+ of
"i965/fs: Switch to the constant cache for uniform pull constants.",
part of the same series.

Going up to 8 oword blocks would improve performance of pull constants
even more, but at the cost of some additional bandwidth and register
pressure, so it would have to be done on-demand based on the number of
constants actually used by the shader.

v2: Fix for Gen4 and 5.
v3: Non-trivial rebase.  Rework to allow the visitor to specify
    arbitrary pull constant block sizes.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp     | 21 +++++++++------------
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 16 +++++++++-------
 2 files changed, 18 insertions(+), 19 deletions(-)

(limited to 'src/mesa')

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 977fd8c35f5..671b44bd50d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2111,25 +2111,22 @@ fs_visitor::lower_constant_loads()
          if (pull_index == -1)
 	    continue;
 
-         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
-         fs_reg dst;
-
-         if (type_sz(inst->src[i].type) <= 4)
-            dst = vgrf(glsl_type::float_type);
-         else
-            dst = vgrf(glsl_type::double_type);
-
          assert(inst->src[i].stride == 0);
 
-         const fs_builder ubld = ibld.exec_all().group(4, 0);
-         struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
+         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+         const unsigned base = pull_index * 4;
+
          ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                   dst, brw_imm_ud(index), offset);
+                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
 
          /* Rewrite the instruction to use the temporary VGRF. */
          inst->src[i].file = VGRF;
          inst->src[i].nr = dst.nr;
-         inst->src[i].offset = (pull_index & 3) * 4 + inst->src[i].offset % 4;
+         inst->src[i].offset = (base & (block_sz - 1)) +
+                               inst->src[i].offset % 4;
 
          brw_mark_surface_used(prog_data, index);
       }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 7df74232457..9f2729a9b6e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -4059,21 +4059,23 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
           * and we have to split it if necessary.
           */
          const unsigned type_size = type_sz(dest.type);
-         const fs_builder ubld = bld.exec_all().group(4, 0);
-         const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_F);
+         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+         const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
+         const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
 
          for (unsigned c = 0; c < instr->num_components;) {
             const unsigned base = const_offset->u32[0] + c * type_size;
-
-            /* Number of usable components in the next 16B-aligned load */
+            /* Number of usable components in the next block-aligned load. */
             const unsigned count = MIN2(instr->num_components - c,
-                                        (16 - base % 16) / type_size);
+                                        (block_sz - base % block_sz) / type_size);
 
             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                      packed_consts, surf_index, brw_imm_ud(base & ~15));
+                      packed_consts, surf_index,
+                      brw_imm_ud(base & ~(block_sz - 1)));
 
             const fs_reg consts =
-               retype(byte_offset(packed_consts, base & 15), dest.type);
+               retype(byte_offset(packed_consts, base & (block_sz - 1)),
+                      dest.type);
 
             for (unsigned d = 0; d < count; d++)
                bld.MOV(offset(dest, bld, c + d), component(consts, d));
-- 
cgit v1.2.3