aboutsummaryrefslogtreecommitdiffstats
path: root/src/intel/compiler/brw_fs_visitor.cpp
diff options
context:
space:
mode:
authorFrancisco Jerez <[email protected]>2020-01-03 17:08:51 -0800
committerFrancisco Jerez <[email protected]>2020-01-17 13:23:12 -0800
commitb54b67e067da6ed22a7b8112cb6f8bed0e188272 (patch)
treed3beec1fd9d86df7308b11f29cba2522fc5b0fb5 /src/intel/compiler/brw_fs_visitor.cpp
parent79bd252d6e7650f2081d116a51b4baf634338648 (diff)
intel/fs: Switch to standard vector layout for barycentrics at optimization time.
This involves permuting the registers of barycentric vectors to have the standard X[0-n] Y[0-n] layout at NIR translation time. Barycentrics are converted to the format expected by the PLN instruction in the lower_barycentrics() pass run after the optimization loop. Main reason is correctness of SIMD32 fragment shaders. The shuffle_from_pln_layout() and shuffle_to_pln_layout() helpers used during NIR translation are busted for SIMD32. This leads to serious corruption at present with INTEL_DEBUG=do32, especially on Gen11+ where these helpers are hit more frequently due to the lack of a hardware PLN instruction. Of course one could have chosen to fix those helpers instead, but there is another far more subtle issue that was reported during review of the SIMD32 fragment shader codegen changes: The SIMD splitting pass currently handles SIMD32 barycentric vectors as if they had the standard X[0-n] Y[0-n] layout, even though they are interleaved for the PLN instruction, which causes incorrect execution masks to be applied to the MOVs unzipping barycentric vectors in cases where a LINTERP instruction occurs under non-uniform control flow. I'm not aware of any conformance regressions due to the latter issue at present, but for our peace of mind let's move the conversion to the PLN layout into the lower_barycentrics() pass run after lower_simd_width(). This leads to the following shader-db improvements (including SIMD32 shaders) in combination with the previous back-end preparation changes -- Without them (especially the copy propagation changes) this would lead to a massive number of regressions. On ICL: total instructions in shared programs: 20662316 -> 20466903 (-0.95%) instructions in affected programs: 10538474 -> 10343061 (-1.85%) helped: 68775 HURT: 6 total spills in shared programs: 8938 -> 8748 (-2.13%) spills in affected programs: 376 -> 186 (-50.53%) helped: 9 HURT: 5 total fills in shared programs: 8965 -> 8663 (-3.37%) fills in affected programs: 965 -> 663 (-31.30%) helped: 9 HURT: 6 LOST: 146 GAINED: 43 On SKL: total instructions in shared programs: 18725867 -> 18614912 (-0.59%) instructions in affected programs: 3876590 -> 3765635 (-2.86%) helped: 27492 HURT: 2 LOST: 191 GAINED: 417 On SNB: total instructions in shared programs: 14573613 -> 13980646 (-4.07%) instructions in affected programs: 5199074 -> 4606107 (-11.41%) helped: 29998 HURT: 0 LOST: 21 GAINED: 30 Results are somewhat less impressive but still significant without SIMD32 fragment shaders enabled. On ICL: total instructions in shared programs: 16148728 -> 16061659 (-0.54%) instructions in affected programs: 6114788 -> 6027719 (-1.42%) helped: 42046 HURT: 6 total spills in shared programs: 8218 -> 8028 (-2.31%) spills in affected programs: 376 -> 186 (-50.53%) helped: 9 HURT: 5 total fills in shared programs: 8953 -> 8651 (-3.37%) fills in affected programs: 965 -> 663 (-31.30%) helped: 9 HURT: 6 LOST: 0 GAINED: 3 On SKL: total instructions in shared programs: 14927994 -> 14926738 (-0.01%) instructions in affected programs: 168850 -> 167594 (-0.74%) helped: 711 HURT: 2 On SNB: total instructions in shared programs: 10770538 -> 10734403 (-0.34%) instructions in affected programs: 2702172 -> 2666037 (-1.34%) helped: 17818 HURT: 0 All of the hurt shaders are either spilling slightly more or emitting additional NOP instructions due to the SIMD16 POW workaround for Gen8-9 combined with differences in scheduling. Reviewed-by: Kenneth Graunke <[email protected]>
Diffstat (limited to 'src/intel/compiler/brw_fs_visitor.cpp')
-rw-r--r--src/intel/compiler/brw_fs_visitor.cpp15
1 files changed, 7 insertions, 8 deletions
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
index 476a9c64a5b..81d0e466cc7 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -176,11 +176,11 @@ fs_visitor::emit_interpolation_setup_gen4()
const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
- if (devinfo->has_pln && dispatch_width == 16) {
- for (unsigned i = 0; i < 2; i++) {
- abld.half(i).ADD(half(offset(delta_xy, abld, i), 0),
+ if (devinfo->has_pln) {
+ for (unsigned i = 0; i < dispatch_width / 8; i++) {
+ abld.half(i).ADD(half(offset(delta_xy, abld, 0), i),
half(this->pixel_x, i), xstart);
- abld.half(i).ADD(half(offset(delta_xy, abld, i), 1),
+ abld.half(i).ADD(half(offset(delta_xy, abld, 1), i),
half(this->pixel_y, i), ystart);
}
} else {
@@ -358,11 +358,10 @@ fs_visitor::emit_interpolation_setup_gen6()
for (unsigned c = 0; c < 2; c++) {
for (unsigned q = 0; q < dispatch_width / 8; q++) {
- const unsigned idx = c + (q & 2) + (q & 1) * dispatch_width / 8;
set_predicate(BRW_PREDICATE_NORMAL,
- bld.half(q).SEL(horiz_offset(delta_xy[i], idx * 8),
- horiz_offset(centroid_delta_xy, idx * 8),
- horiz_offset(pixel_delta_xy, idx * 8)));
+ bld.half(q).SEL(half(offset(delta_xy[i], bld, c), q),
+ half(offset(centroid_delta_xy, bld, c), q),
+ half(offset(pixel_delta_xy, bld, c), q)));
}
}
}