path: root/src/intel/compiler
author     Jason Ekstrand <[email protected]>    2020-04-03 20:20:53 -0500
committer  Marge Bot <[email protected]>          2020-04-17 14:48:06 +0000
commit     d0d039a4d3f49f3cc89e167b46edae54cf32a6be (patch)
tree       5292eaecd4fd82cc7428ec6b8a5f87b4215c78af /src/intel/compiler
parent     eb5a10ff63f74f9e052ecc6c7399df8e0d193345 (diff)
anv: Emit pushed UBO bounds checking code in the back-end compiler
This commit fixes performance regressions introduced by e03f9652801ad7 in
which we started bounds checking our push constants. This added a LOT of
shader code to shaders which use the robustBufferAccess feature and led to
substantial spilling. The checking we just added to the FS back-end is far
more efficient for two reasons:

 1. It can be done at whole-register granularity rather than per-scalar, so
    we emit one SIMD8 SEL per 32B GRF rather than one SIMD16 SEL (executed
    as two SELs) for each component loaded.

 2. Because we do it with NoMask instructions, we can do it on whole pushed
    GRFs without splatting them out to SIMD8 or SIMD16 values. This means
    that robust buffer access no longer explodes our register pressure for
    no good reason.

As a tiny side-benefit, we can now use AND instead of SEL, which means no
need for the flag register and better scheduling.

Vulkan pipeline database results on ICL:

    Instructions in all programs: 293586059 -> 238009118 (-18.9%)
    SENDs in all programs:        13568515 -> 13568515 (+0.0%)
    Loops in all programs:        149720 -> 149720 (+0.0%)
    Cycles in all programs:       88499234498 -> 84348917496 (-4.7%)
    Spills in all programs:       1229018 -> 184339 (-85.0%)
    Fills in all programs:        1348397 -> 246061 (-81.8%)

This also improves the performance of a few apps:

 - Shadow of the Tomb Raider: +4%
 - Witcher 3: +3.5%
 - UE4 Shooter demo: +2%

Reviewed-by: Caio Marcelo de Oliveira Filho <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4447>
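
To make the cost argument concrete, here is a rough scalar model of the new
zeroing scheme (illustration only; the names and the plain C++ loop are not
Mesa API, nor the actual EU instructions the back-end emits): each pushed
32-byte GRF that was flagged for checking is kept or zeroed wholesale by
ANDing it with an all-ones/all-zeros word derived from a 64-bit run-time
validity mask, instead of a bounds-checking SEL per loaded component.

    #include <array>
    #include <cstdint>

    /* One 32-byte push register (GRF) = 8 dwords. */
    using GRF = std::array<uint32_t, 8>;

    /* zero_push_reg: compile-time bits saying which push regs need checking.
     * runtime_valid: run-time bits saying which push regs are in bounds. */
    static void zero_out_of_bounds_push_regs(GRF *push_regs, unsigned nr_push_regs,
                                             uint64_t zero_push_reg,
                                             uint64_t runtime_valid)
    {
       for (unsigned i = 0; i < nr_push_regs; i++) {
          /* Zero register i only if it was flagged for checking and the
           * run-time mask says it is out of bounds. */
          bool zero = (zero_push_reg & ~runtime_valid & (1ull << i)) != 0;
          uint32_t keep_mask = zero ? 0u : ~0u;
          for (uint32_t &dword : push_regs[i])
             dword &= keep_mask;   /* the real thing is one NoMask AND per GRF */
       }
    }
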
Diffstat (limited to 'src/intel/compiler')
-rw-r--r--   src/intel/compiler/brw_compiler.h   13
-rw-r--r--   src/intel/compiler/brw_fs.cpp       43
2 files changed, 56 insertions, 0 deletions
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 62c7e85e55f..ab39af22684 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -657,6 +657,19 @@ struct brw_stage_prog_data {
   GLuint nr_params;  /**< number of float params/constants */
   GLuint nr_pull_params;

+   /* zero_push_reg is a bitfield which indicates what push registers (if any)
+    * should be zeroed by SW at the start of the shader. The corresponding
+    * push_reg_mask_param specifies the param index (in 32-bit units) where
+    * the actual runtime 64-bit mask will be pushed. The shader will zero
+    * push reg i if
+    *
+    *    reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
+    *
+    * If this field is set, brw_compiler::compact_params must be false.
+    */
+   uint64_t zero_push_reg;
+   unsigned push_reg_mask_param;
+
   unsigned curb_read_length;
   unsigned total_scratch;
   unsigned total_shared;
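
For illustration, the condition documented in the new zero_push_reg comment
can be read as a tiny helper (hypothetical, not part of Mesa; runtime_mask
stands in for the pushed 64-bit value *push_reg_mask_param):

    #include <cstdint>

    /* Push register i must be zeroed only when it is actually read
     * (reg_used), was flagged for checking at compile time (zero_push_reg),
     * and the run-time mask says it is out of bounds (~runtime_mask). */
    static inline bool must_zero_push_reg(uint64_t reg_used,
                                          uint64_t zero_push_reg,
                                          uint64_t runtime_mask,
                                          unsigned i)
    {
       return (reg_used & zero_push_reg & ~runtime_mask & (1ull << i)) != 0;
    }
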
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 4e13dcca54a..b578d82a252 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1604,6 +1604,8 @@ fs_visitor::assign_curb_setup()
   prog_data->curb_read_length = uniform_push_length + ubo_push_length;

+   uint64_t used = 0;
+
   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
@@ -1625,6 +1627,9 @@ fs_visitor::assign_curb_setup()
            constant_nr = 0;
         }

+         assert(constant_nr / 8 < 64);
+         used |= BITFIELD64_BIT(constant_nr / 8);
+
         struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                               constant_nr / 8,
                                               constant_nr % 8);
@@ -1639,6 +1644,44 @@ fs_visitor::assign_curb_setup()
      }
   }

+   uint64_t want_zero = used & stage_prog_data->zero_push_reg;
+   if (want_zero) {
+      assert(!compiler->compact_params);
+      fs_builder ubld = bld.exec_all().group(8, 0).at(
+         cfg->first_block(), cfg->first_block()->start());
+
+      /* push_reg_mask_param is in 32-bit units */
+      unsigned mask_param = stage_prog_data->push_reg_mask_param;
+      struct brw_reg mask = brw_vec1_grf(payload.num_regs + mask_param / 8,
+                                         mask_param % 8);
+
+      fs_reg b32;
+      for (unsigned i = 0; i < 64; i++) {
+         if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
+            fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
+            ubld.SHL(horiz_offset(shifted, 8),
+                     byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
+                     brw_imm_v(0x01234567));
+            ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
+
+            fs_builder ubld16 = ubld.group(16, 0);
+            b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
+            ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
+         }
+
+         if (want_zero & BITFIELD64_BIT(i)) {
+            assert(i < prog_data->curb_read_length);
+            struct brw_reg push_reg =
+               retype(brw_vec8_grf(payload.num_regs + i, 0),
+                      BRW_REGISTER_TYPE_D);
+
+            ubld.AND(push_reg, push_reg, component(b32, i % 16));
+         }
+      }
+
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+   }
+
   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
}
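
The SHL/SHL/ASR sequence in the hunk above expands 16 bits of the run-time
mask into sixteen all-ones/all-zeros dwords at a time: the two shifts leave
SIMD channel c holding mask_word << (15 - c), so bit c of the mask lands in
the 16-bit sign bit, and the arithmetic shift right by 15 smears that bit
across the lane. A scalar sketch of the net effect (hypothetical helper, not
Mesa code):

    #include <cstdint>

    /* Expand one 16-bit slice of the run-time mask into sixteen
     * 0x00000000 / 0xffffffff dwords, i.e. what ends up in b32 before the
     * per-register AND. */
    static void expand_mask16(uint16_t mask_word, int32_t b32[16])
    {
       for (unsigned c = 0; c < 16; c++) {
          /* Shift bit c of the mask into the sign bit of a 16-bit lane
           * (what the two SHLs achieve per SIMD channel). */
          int16_t lane = (int16_t)(uint16_t)(mask_word << (15 - c));
          /* Arithmetic shift right by 15 replicates that bit: 0 or -1. */
          b32[c] = lane >> 15;
       }
    }
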