author     Jason Ekstrand <[email protected]>    2017-08-31 22:12:48 -0700
committer  Jason Ekstrand <[email protected]>    2018-03-07 12:13:47 -0800
commit     2292b20b2969c9e3e0494ccc55c6216f330762ae (patch)
tree       5024e1852feb8a26ca8cadd4b62bfa12aadf41f3 /src/intel
parent     4150920b95b8a7db84cd4607ede09f42b85530bb (diff)
intel/fs: Implement reduce and scan operations
Acked-by: Lionel Landwerlin <[email protected]>
Reviewed-by: Iago Toral Quiroga <[email protected]>
Diffstat (limited to 'src/intel')
-rw-r--r--   src/intel/compiler/brw_fs_nir.cpp   154
-rw-r--r--   src/intel/compiler/brw_reg.h          8
2 files changed, 162 insertions, 0 deletions
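
For context, the intrinsics lowered by this patch follow the usual NIR/SPIR-V subgroup semantics: reduce gives every invocation the combination of all invocations' values (optionally per cluster), inclusive scan gives each invocation the combination of the values at its index and below, and exclusive scan is the same result shifted down by one lane with the identity in lane 0. A minimal scalar sketch of those semantics for an iadd reduction, kept outside the patch and using a hypothetical values[] array of per-invocation inputs:

#include <cstddef>
#include <cstdint>
#include <vector>

/* Scalar reference model only (not backend code): what reduce, inclusive
 * scan, and exclusive scan compute for an iadd reduction over a subgroup.
 * 'values' holds each invocation's source value; 0 is the iadd identity.
 * The output vectors are assumed to be sized like 'values'.
 */
static void
subgroup_iadd_reference(const std::vector<uint32_t> &values,
                        std::vector<uint32_t> &reduce,
                        std::vector<uint32_t> &inclusive,
                        std::vector<uint32_t> &exclusive)
{
   uint32_t total = 0;
   for (uint32_t v : values)
      total += v;

   uint32_t running = 0; /* identity */
   for (size_t i = 0; i < values.size(); i++) {
      exclusive[i] = running;   /* combination of invocations 0..i-1 */
      running += values[i];
      inclusive[i] = running;   /* combination of invocations 0..i   */
      reduce[i] = total;        /* every invocation gets the full result */
   }
}
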
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index d2d32f95930..e25fd38af91 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3722,6 +3722,71 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
}
}
+static fs_reg
+brw_nir_reduction_op_identity(const fs_builder &bld,
+                              nir_op op, brw_reg_type type)
+{
+   nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
+   switch (type_sz(type)) {
+   case 2:
+      assert(type != BRW_REGISTER_TYPE_HF);
+      return retype(brw_imm_uw(value.u16[0]), type);
+   case 4:
+      return retype(brw_imm_ud(value.u32[0]), type);
+   case 8:
+      if (type == BRW_REGISTER_TYPE_DF)
+         return setup_imm_df(bld, value.f64[0]);
+      else
+         return retype(brw_imm_u64(value.u64[0]), type);
+   default:
+      unreachable("Invalid type size");
+   }
+}
+
+static opcode
+brw_op_for_nir_reduction_op(nir_op op)
+{
+   switch (op) {
+   case nir_op_iadd: return BRW_OPCODE_ADD;
+   case nir_op_fadd: return BRW_OPCODE_ADD;
+   case nir_op_imul: return BRW_OPCODE_MUL;
+   case nir_op_fmul: return BRW_OPCODE_MUL;
+   case nir_op_imin: return BRW_OPCODE_SEL;
+   case nir_op_umin: return BRW_OPCODE_SEL;
+   case nir_op_fmin: return BRW_OPCODE_SEL;
+   case nir_op_imax: return BRW_OPCODE_SEL;
+   case nir_op_umax: return BRW_OPCODE_SEL;
+   case nir_op_fmax: return BRW_OPCODE_SEL;
+   case nir_op_iand: return BRW_OPCODE_AND;
+   case nir_op_ior:  return BRW_OPCODE_OR;
+   case nir_op_ixor: return BRW_OPCODE_XOR;
+   default:
+      unreachable("Invalid reduction operation");
+   }
+}
+
+static brw_conditional_mod
+brw_cond_mod_for_nir_reduction_op(nir_op op)
+{
+   switch (op) {
+   case nir_op_iadd: return BRW_CONDITIONAL_NONE;
+   case nir_op_fadd: return BRW_CONDITIONAL_NONE;
+   case nir_op_imul: return BRW_CONDITIONAL_NONE;
+   case nir_op_fmul: return BRW_CONDITIONAL_NONE;
+   case nir_op_imin: return BRW_CONDITIONAL_L;
+   case nir_op_umin: return BRW_CONDITIONAL_L;
+   case nir_op_fmin: return BRW_CONDITIONAL_L;
+   case nir_op_imax: return BRW_CONDITIONAL_GE;
+   case nir_op_umax: return BRW_CONDITIONAL_GE;
+   case nir_op_fmax: return BRW_CONDITIONAL_GE;
+   case nir_op_iand: return BRW_CONDITIONAL_NONE;
+   case nir_op_ior:  return BRW_CONDITIONAL_NONE;
+   case nir_op_ixor: return BRW_CONDITIONAL_NONE;
+   default:
+      unreachable("Invalid reduction operation");
+   }
+}
+
void
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
{
@@ -4523,6 +4588,95 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break;
}
+   case nir_intrinsic_reduce: {
+      fs_reg src = get_nir_src(instr->src[0]);
+      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+      unsigned cluster_size = nir_intrinsic_cluster_size(instr);
+      if (cluster_size == 0 || cluster_size > dispatch_width)
+         cluster_size = dispatch_width;
+
+      /* Figure out the source type */
+      src.type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+                        nir_src_bit_size(instr->src[0])));
+
+      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
+      opcode brw_op = brw_op_for_nir_reduction_op(redop);
+      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+
+      /* Set up a register for all of our scratching around and initialize it
+       * to reduction operation's identity value.
+       */
+      fs_reg scan = bld.vgrf(src.type);
+      bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
+
+      bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
+
+      dest.type = src.type;
+      if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
+         /* In this case, CLUSTER_BROADCAST instruction isn't needed because
+          * the distance between clusters is at least 2 GRFs. In this case,
+          * we don't need the weird striding of the CLUSTER_BROADCAST
+          * instruction and can just do regular MOVs.
+          */
+         assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
+         const unsigned groups =
+            (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
+         const unsigned group_size = dispatch_width / groups;
+         for (unsigned i = 0; i < groups; i++) {
+            const unsigned cluster = (i * group_size) / cluster_size;
+            const unsigned comp = cluster * cluster_size + (cluster_size - 1);
+            bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
+                                         component(scan, comp));
+         }
+      } else {
+         bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
+                  brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
+      }
+      break;
+   }
+
+   case nir_intrinsic_inclusive_scan:
+   case nir_intrinsic_exclusive_scan: {
+      fs_reg src = get_nir_src(instr->src[0]);
+      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+
+      /* Figure out the source type */
+      src.type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+                        nir_src_bit_size(instr->src[0])));
+
+      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
+      opcode brw_op = brw_op_for_nir_reduction_op(redop);
+      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+
+      /* Set up a register for all of our scratching around and initialize it
+       * to reduction operation's identity value.
+       */
+      fs_reg scan = bld.vgrf(src.type);
+      const fs_builder allbld = bld.exec_all();
+      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
+
+      if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
+         /* Exclusive scan is a bit harder because we have to do an annoying
+          * shift of the contents before we can begin. To make things worse,
+          * we can't do this with a normal stride; we have to use indirects.
+          */
+         fs_reg shifted = bld.vgrf(src.type);
+         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+         allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+                    brw_imm_w(-1));
+         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
+         allbld.group(1, 0).MOV(component(shifted, 0), identity);
+         scan = shifted;
+      }
+
+      bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
+
+      bld.MOV(retype(dest, src.type), scan);
+      break;
+   }
+
default:
unreachable("unknown intrinsic");
}
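
Two details of the lowering above, restated as scalar sketches with hypothetical lanes[] and scan[] arrays: the exclusive-scan path first moves every channel's value up by one channel and writes the identity into channel 0 before running the ordinary inclusive scan, and the reduce path hands each invocation the inclusive-scan value of the last channel in its cluster, which is what the grouped MOVs or CLUSTER_BROADCAST read out.

#include <cstddef>
#include <cstdint>

/* Shift performed before an exclusive scan: channel i takes the value of
 * channel i - 1 and channel 0 takes the identity (done in the backend with
 * SHADER_OPCODE_SHUFFLE plus a single-channel MOV). Assumes width >= 1.
 */
static void
shift_for_exclusive_scan(uint32_t *lanes, size_t width, uint32_t identity)
{
   for (size_t i = width; i-- > 1;)
      lanes[i] = lanes[i - 1];
   lanes[0] = identity;
}

/* After the clustered inclusive scan, the reduce result for invocation i is
 * the scan value of the last channel of i's cluster.
 */
static uint32_t
clustered_reduce_result(const uint32_t *scan, size_t i, size_t cluster_size)
{
   size_t cluster = i / cluster_size;
   return scan[cluster * cluster_size + (cluster_size - 1)];
}
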
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
index c41408104fa..7ad144bdfd5 100644
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@@ -590,6 +590,14 @@ brw_imm_df(double df)
}
static inline struct brw_reg
+brw_imm_u64(uint64_t u64)
+{
+   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UQ);
+   imm.u64 = u64;
+   return imm;
+}
+
+static inline struct brw_reg
brw_imm_f(float f)
{
struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F);
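
The brw_imm_u64() constructor added here follows the existing brw_imm_*() pattern and is what lets the 8-byte branch of brw_nir_reduction_op_identity() build 64-bit integer identity immediates. As a reminder of which values those are, a small sketch that restates the standard algebraic identities (keyed on a local enum for illustration rather than on nir_op; nir_alu_binop_identity remains the authoritative source):

#include <cstdint>
#include <limits>

/* Algebraic identity values for 64-bit integer reductions; illustration
 * only, restating the algebra rather than quoting the NIR helper.
 */
enum class reduce_op { iadd, imul, umin, umax, imin, imax, iand, ior, ixor };

static uint64_t
identity_u64(reduce_op op)
{
   switch (op) {
   case reduce_op::iadd: return 0;                                    /* x + 0 == x  */
   case reduce_op::imul: return 1;                                    /* x * 1 == x  */
   case reduce_op::umin: return std::numeric_limits<uint64_t>::max(); /* min(x, ~0)  */
   case reduce_op::umax: return 0;                                    /* max(x, 0)   */
   case reduce_op::imin: return (uint64_t)std::numeric_limits<int64_t>::max();
   case reduce_op::imax: return (uint64_t)std::numeric_limits<int64_t>::min();
   case reduce_op::iand: return ~UINT64_C(0);                         /* x & ~0 == x */
   case reduce_op::ior:  return 0;                                    /* x | 0 == x  */
   case reduce_op::ixor: return 0;                                    /* x ^ 0 == x  */
   }
   return 0;
}
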