author     Jason Ekstrand <[email protected]>   2020-03-27 00:30:25 -0500
committer  Marge Bot <[email protected]>        2020-03-30 15:46:19 +0000
commit     c217ee8d35fcac8ab11e7b5bfd0e053e1fed7df0 (patch)
tree       4624ec2aef44b2839143d9d9ec633a5448959b84
parent     d2dfcee7f7ebf87dae9570f1c7476eacb6240f83 (diff)
nir: Insert b2b1s around booleans in nir_lower_io
By inserting a b2b1 around the load_ubo, load_input, etc. intrinsics
generated by nir_lower_io, we can ensure that the intrinsic has the
correct destination bit size.  Not having the right size can mess up
passes which try to optimize access.  In particular, it was causing
brw_nir_analyze_ubo_ranges to ignore load_ubo of booleans, which meant
that boolean uniforms weren't getting pushed as push constants.

I don't think this is an actual functional bug anywhere, hence no CC to
stable, but it may improve perf somewhere.

Shader-db results on ICL with iris:

    total instructions in shared programs: 16076707 -> 16075246 (<.01%)
    instructions in affected programs: 129034 -> 127573 (-1.13%)
    helped: 487
    HURT: 0
    helped stats (abs) min: 3 max: 3 x̄: 3.00 x̃: 3
    helped stats (rel) min: 0.45% max: 3.00% x̄: 1.33% x̃: 1.36%
    95% mean confidence interval for instructions value: -3.00 -3.00
    95% mean confidence interval for instructions %-change: -1.37% -1.29%
    Instructions are helped.

    total cycles in shared programs: 338015639 -> 337983311 (<.01%)
    cycles in affected programs: 971986 -> 939658 (-3.33%)
    helped: 362
    HURT: 110
    helped stats (abs) min: 1 max: 1664 x̄: 97.37 x̃: 43
    helped stats (rel) min: 0.03% max: 36.22% x̄: 5.58% x̃: 2.60%
    HURT stats (abs) min: 1 max: 554 x̄: 26.55 x̃: 18
    HURT stats (rel) min: 0.03% max: 10.99% x̄: 1.04% x̃: 0.96%
    95% mean confidence interval for cycles value: -79.97 -57.01
    95% mean confidence interval for cycles %-change: -4.60% -3.47%
    Cycles are helped.

    total sends in shared programs: 815037 -> 814550 (-0.06%)
    sends in affected programs: 5701 -> 5214 (-8.54%)
    helped: 487
    HURT: 0

    LOST:   2
    GAINED: 0

The two lost programs were SIMD16 shaders in CS:GO.  However, CS:GO was
also one of the most helped titles: the change shaves sends off of 134
of its programs.  This seems to reduce GPU core clocks by about 4% on
the first 1000 frames of the PTS benchmark.

Reviewed-by: Kenneth Graunke <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4338>
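As a rough illustration of the conversion pattern described above (a sketch
only, not the pass itself: the helper names below are invented, while
nir_b2b1()/nir_b2b32() and the nir_builder types are the real NIR API):

    #include <assert.h>
    #include "nir.h"
    #include "nir_builder.h"

    /* Hypothetical helper: given the value just produced by a 32-bit I/O
     * intrinsic (load_ubo, load_input, ...), hand the rest of the shader
     * the 1-bit boolean it expects.  The intrinsic itself keeps its 32-bit
     * destination, which is what passes such as brw_nir_analyze_ubo_ranges
     * want to see. */
    static nir_ssa_def *
    boolean_from_32bit_load(nir_builder *b, nir_ssa_def *loaded32)
    {
       assert(loaded32->bit_size == 32);
       return nir_b2b1(b, loaded32);
    }

    /* Hypothetical helper for the store side: widen the 1-bit boolean to
     * the 32-bit form the I/O intrinsic actually writes. */
    static nir_ssa_def *
    boolean_to_32bit_store_src(nir_builder *b, nir_ssa_def *value1)
    {
       assert(value1->bit_size == 1);
       return nir_b2b32(b, value1);
    }

In the patch below, the same two conversions are emitted directly around
emit_load()/emit_store() inside nir_lower_io.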
-rw-r--r--  src/compiler/nir/nir_lower_io.c  15
1 files changed, 15 insertions, 0 deletions
diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c
index 92d2a1f8ba0..c127802a59d 100644
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@@ -352,6 +352,13 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state,
       }
 
       return nir_vec(b, comp64, intrin->dest.ssa.num_components);
+   } else if (intrin->dest.ssa.bit_size == 1) {
+      /* Booleans are 32-bit */
+      assert(glsl_type_is_boolean(type));
+      return nir_b2b1(&state->builder,
+                      emit_load(state, vertex_index, var, offset, component,
+                                intrin->dest.ssa.num_components, 32,
+                                nir_type_bool32));
    } else {
       return emit_load(state, vertex_index, var, offset, component,
                        intrin->dest.ssa.num_components,
@@ -445,6 +452,14 @@ lower_store(nir_intrinsic_instr *intrin, struct lower_io_state *state,
          write_mask >>= num_comps;
          offset = nir_iadd_imm(b, offset, slot_size);
       }
+   } else if (intrin->dest.ssa.bit_size == 1) {
+      /* Booleans are 32-bit */
+      assert(glsl_type_is_boolean(type));
+      nir_ssa_def *b32_val = nir_b2b32(&state->builder, intrin->src[1].ssa);
+      emit_store(state, b32_val, vertex_index, var, offset,
+                 component, intrin->num_components,
+                 nir_intrinsic_write_mask(intrin),
+                 nir_type_bool32);
    } else {
       emit_store(state, intrin->src[1].ssa, vertex_index, var, offset,
                  component, intrin->num_components,