author    Jason Ekstrand <[email protected]>    2019-06-04 11:39:25 -0500
committer Jason Ekstrand <[email protected]>    2019-09-20 18:02:15 +0000
commit    03255da225a85315241ea514e5f9e06090b16abe (patch)
tree      c3a6a757b4e0e9eabef27ba74454fab8dc1d64d4 /src/intel/compiler
parent    651725f7a154e9b33093a2d725a0c2581fbd5480 (diff)
intel/fs: Do 8-bit subgroup scan operations in 16 bits
Reviewed-by: Paulo Zanoni <[email protected]>
Diffstat (limited to 'src/intel/compiler')
-rw-r--r--  src/intel/compiler/brw_fs_nir.cpp | 42
1 file changed, 39 insertions, 3 deletions
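
The patch below widens 8-bit scan sources to 16 bits before emitting the scan
and truncates back to 8 bits afterwards. As a sanity check on the claim in the
new comment that "the results are the same once we truncate to 8 bits at the
end", here is a minimal standalone C++ sketch (illustrative only, not Mesa
code): integer addition is associative modulo 2^8, so the low 8 bits of the
16-bit partial sums are exactly the native 8-bit partial sums.

    // Standalone sketch: an inclusive iadd scan computed in 16 bits and
    // truncated to 8 bits matches a native 8-bit scan that wraps mod 256.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
        std::vector<uint8_t> src = {200, 100, 37, 255, 8, 91, 163, 42};

        // Native 8-bit inclusive scan (wraps mod 256).
        std::vector<uint8_t> scan8(src.size());
        uint8_t acc8 = 0;
        for (size_t i = 0; i < src.size(); i++)
            scan8[i] = acc8 = uint8_t(acc8 + src[i]);

        // The same scan carried out in 16 bits, truncated at the end.
        std::vector<uint8_t> scan16(src.size());
        uint16_t acc16 = 0;
        for (size_t i = 0; i < src.size(); i++) {
            acc16 = uint16_t(acc16 + src[i]);
            scan16[i] = uint8_t(acc16);   // truncate to 8 bits
        }

        assert(scan8 == scan16);
        return 0;
    }

The same argument holds for the non-wrapping reduction ops (min, max, and, or,
xor), since promoting an 8-bit value to 16 bits never changes which operand
such an op selects or which low bits it produces.
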
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index abce29c92f5..f9e81e14f3a 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4925,10 +4925,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
opcode brw_op = brw_op_for_nir_reduction_op(redop);
brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+ /* There are a couple of register region issues that make things
+ * complicated for 8-bit types:
+ *
+ * 1. Only raw moves are allowed to write to a packed 8-bit
+ * destination.
+ * 2. If we use a strided destination, the efficient way to do scan
+ * operations ends up using strides that are too big to encode in
+ * an instruction.
+ *
+ * To get around these issues, we just do all 8-bit scan operations in
+ * 16 bits. It's actually fewer instructions than what we'd have to do
+ * if we were trying to do it in native 8-bit types and the results are
+ * the same once we truncate to 8 bits at the end.
+ */
+ brw_reg_type scan_type = src.type;
+ if (type_sz(scan_type) == 1)
+ scan_type = brw_reg_type_from_bit_size(16, src.type);
+
/* Set up a register for all of our scratching around and initialize it
* to the reduction operation's identity value.
*/
- fs_reg scan = bld.vgrf(src.type);
+ fs_reg scan = bld.vgrf(scan_type);
bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
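
For context on why strides come up at all: emit_scan builds the inclusive scan
out of a logarithmic sequence of strided register-region operations. The exact
instruction sequence it emits is hardware-specific; the sketch below (again
illustrative C++, not Mesa code) only models the underlying stride-doubling
(Hillis-Steele) pattern on a plain array. With 8-bit elements, the region
strides this pattern wants quickly exceed what the instruction encoding
allows, which is issue 2 in the comment above.

    // Illustrative model of a stride-doubling inclusive scan over one
    // SIMD-width's worth of channels.
    #include <cstdint>
    #include <cstdio>

    static void inclusive_scan_add(uint16_t *chan, unsigned width) {
        for (unsigned stride = 1; stride < width; stride *= 2) {
            // Walk backwards so each step reads this pass's inputs.
            for (unsigned i = width; i-- > stride;)
                chan[i] = uint16_t(chan[i] + chan[i - stride]);
        }
    }

    int main() {
        uint16_t chan[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        inclusive_scan_add(chan, 8);
        for (unsigned i = 0; i < 8; i++)
            printf("%u ", chan[i]);   // prints: 1 3 6 10 15 21 28 36
        printf("\n");
        return 0;
    }
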
@@ -4971,10 +4989,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
opcode brw_op = brw_op_for_nir_reduction_op(redop);
brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+ /* There are a couple of register region issues that make things
+ * complicated for 8-bit types:
+ *
+ * 1. Only raw moves are allowed to write to a packed 8-bit
+ * destination.
+ * 2. If we use a strided destination, the efficient way to do scan
+ * operations ends up using strides that are too big to encode in
+ * an instruction.
+ *
+ * To get around these issues, we just do all 8-bit scan operations in
+ * 16 bits. It's actually fewer instructions than what we'd have to do
+ * if we were trying to do it in native 8-bit types and the results are
+ * the same once we truncate to 8 bits at the end.
+ */
+ brw_reg_type scan_type = src.type;
+ if (type_sz(scan_type) == 1)
+ scan_type = brw_reg_type_from_bit_size(16, src.type);
+
/* Set up a register for all of our scratching around and initialize it
* to the reduction operation's identity value.
*/
- fs_reg scan = bld.vgrf(src.type);
+ fs_reg scan = bld.vgrf(scan_type);
const fs_builder allbld = bld.exec_all();
allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
@@ -4983,7 +5019,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
* shift of the contents before we can begin. To make things worse,
* we can't do this with a normal stride; we have to use indirects.
*/
- fs_reg shifted = bld.vgrf(src.type);
+ fs_reg shifted = bld.vgrf(scan_type);
fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
brw_imm_w(-1));
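
The exclusive-scan path above first shifts every channel's value from
invocation i-1 into invocation i (the ADD of -1 computes the indirect index),
seeds channel 0 with the identity, and then runs the same inclusive scan. A
hypothetical standalone model of that lowering, in the same illustrative C++
style as the sketches above:

    // Model of the exclusive-scan lowering: shift right by one via an
    // index of invocation - 1, fill channel 0 with the identity, then the
    // inclusive scan of the shifted data is the exclusive scan of the
    // original data.
    #include <cassert>
    #include <cstdint>

    int main() {
        const unsigned width = 8;
        uint16_t src[width] = {1, 2, 3, 4, 5, 6, 7, 8};
        const uint16_t identity = 0;   // identity for iadd

        // Indirect-style shift: channel i reads channel (i - 1).
        uint16_t shifted[width];
        for (unsigned i = 0; i < width; i++) {
            int idx = int(i) - 1;      // models ADD idx, invocation, -1
            shifted[i] = idx < 0 ? identity : src[idx];
        }

        // Inclusive scan of the shifted values...
        for (unsigned stride = 1; stride < width; stride *= 2)
            for (unsigned i = width; i-- > stride;)
                shifted[i] = uint16_t(shifted[i] + shifted[i - stride]);

        // ...equals the exclusive scan of the original values.
        uint16_t acc = identity;
        for (unsigned i = 0; i < width; i++) {
            assert(shifted[i] == acc);
            acc = uint16_t(acc + src[i]);
        }
        return 0;
    }
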