author     Jason Ekstrand <[email protected]>     2017-08-31 22:12:48 -0700
committer  Jason Ekstrand <[email protected]>     2018-03-07 12:13:47 -0800
commit     2292b20b2969c9e3e0494ccc55c6216f330762ae (patch)
tree       5024e1852feb8a26ca8cadd4b62bfa12aadf41f3
parent     4150920b95b8a7db84cd4607ede09f42b85530bb (diff)
intel/fs: Implement reduce and scan operations
Acked-by: Lionel Landwerlin <[email protected]>
Reviewed-by: Iago Toral Quiroga <[email protected]>
-rw-r--r--  src/intel/compiler/brw_fs_nir.cpp  154
-rw-r--r--  src/intel/compiler/brw_reg.h         8
2 files changed, 162 insertions, 0 deletions
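
For reference (not part of the patch itself), the intrinsics being lowered below have simple per-channel semantics: nir_intrinsic_reduce combines all values within each cluster of cluster_size channels, while the inclusive and exclusive scans accumulate values across the subgroup up to (and, for the inclusive variant, including) the current channel. A minimal scalar sketch of those intended results, assuming an iadd reduction with identity 0 and using hypothetical names:

#include <stdint.h>

/* Illustrative reference model only; assumes width is a multiple of
 * cluster_size and uses iadd (identity 0) as the reduction operation.
 */
static void
subgroup_reference(const uint32_t *src, uint32_t *reduce,
                   uint32_t *incl, uint32_t *excl,
                   unsigned width, unsigned cluster_size)
{
   for (unsigned chan = 0; chan < width; chan++) {
      const unsigned cluster_first = (chan / cluster_size) * cluster_size;

      /* nir_intrinsic_reduce: combine every channel in this channel's cluster */
      uint32_t r = 0;
      for (unsigned i = cluster_first; i < cluster_first + cluster_size; i++)
         r += src[i];
      reduce[chan] = r;

      /* Scans accumulate across the whole subgroup up to this channel */
      uint32_t s = 0;
      for (unsigned i = 0; i < chan; i++)
         s += src[i];
      excl[chan] = s;                /* nir_intrinsic_exclusive_scan */
      incl[chan] = s + src[chan];    /* nir_intrinsic_inclusive_scan */
   }
}
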
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index d2d32f95930..e25fd38af91 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3722,6 +3722,71 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
    }
 }
 
+static fs_reg
+brw_nir_reduction_op_identity(const fs_builder &bld,
+                              nir_op op, brw_reg_type type)
+{
+   nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
+   switch (type_sz(type)) {
+   case 2:
+      assert(type != BRW_REGISTER_TYPE_HF);
+      return retype(brw_imm_uw(value.u16[0]), type);
+   case 4:
+      return retype(brw_imm_ud(value.u32[0]), type);
+   case 8:
+      if (type == BRW_REGISTER_TYPE_DF)
+         return setup_imm_df(bld, value.f64[0]);
+      else
+         return retype(brw_imm_u64(value.u64[0]), type);
+   default:
+      unreachable("Invalid type size");
+   }
+}
+
+static opcode
+brw_op_for_nir_reduction_op(nir_op op)
+{
+   switch (op) {
+   case nir_op_iadd: return BRW_OPCODE_ADD;
+   case nir_op_fadd: return BRW_OPCODE_ADD;
+   case nir_op_imul: return BRW_OPCODE_MUL;
+   case nir_op_fmul: return BRW_OPCODE_MUL;
+   case nir_op_imin: return BRW_OPCODE_SEL;
+   case nir_op_umin: return BRW_OPCODE_SEL;
+   case nir_op_fmin: return BRW_OPCODE_SEL;
+   case nir_op_imax: return BRW_OPCODE_SEL;
+   case nir_op_umax: return BRW_OPCODE_SEL;
+   case nir_op_fmax: return BRW_OPCODE_SEL;
+   case nir_op_iand: return BRW_OPCODE_AND;
+   case nir_op_ior:  return BRW_OPCODE_OR;
+   case nir_op_ixor: return BRW_OPCODE_XOR;
+   default:
+      unreachable("Invalid reduction operation");
+   }
+}
+
+static brw_conditional_mod
+brw_cond_mod_for_nir_reduction_op(nir_op op)
+{
+   switch (op) {
+   case nir_op_iadd: return BRW_CONDITIONAL_NONE;
+   case nir_op_fadd: return BRW_CONDITIONAL_NONE;
+   case nir_op_imul: return BRW_CONDITIONAL_NONE;
+   case nir_op_fmul: return BRW_CONDITIONAL_NONE;
+   case nir_op_imin: return BRW_CONDITIONAL_L;
+   case nir_op_umin: return BRW_CONDITIONAL_L;
+   case nir_op_fmin: return BRW_CONDITIONAL_L;
+   case nir_op_imax: return BRW_CONDITIONAL_GE;
+   case nir_op_umax: return BRW_CONDITIONAL_GE;
+   case nir_op_fmax: return BRW_CONDITIONAL_GE;
+   case nir_op_iand: return BRW_CONDITIONAL_NONE;
+   case nir_op_ior:  return BRW_CONDITIONAL_NONE;
+   case nir_op_ixor: return BRW_CONDITIONAL_NONE;
+   default:
+      unreachable("Invalid reduction operation");
+   }
+}
+
 void
 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
 {
@@ -4523,6 +4588,95 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_reduce: {
+      fs_reg src = get_nir_src(instr->src[0]);
+      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+      unsigned cluster_size = nir_intrinsic_cluster_size(instr);
+      if (cluster_size == 0 || cluster_size > dispatch_width)
+         cluster_size = dispatch_width;
+
+      /* Figure out the source type */
+      src.type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+                        nir_src_bit_size(instr->src[0])));
+
+      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
+      opcode brw_op = brw_op_for_nir_reduction_op(redop);
+      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+
+      /* Set up a register for all of our scratching around and initialize it
+       * to reduction operation's identity value.
+       */
+      fs_reg scan = bld.vgrf(src.type);
+      bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
+
+      bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
+
+      dest.type = src.type;
+      if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
+         /* In this case, CLUSTER_BROADCAST instruction isn't needed because
+          * the distance between clusters is at least 2 GRFs.  In this case,
+          * we don't need the weird striding of the CLUSTER_BROADCAST
+          * instruction and can just do regular MOVs.
+          */
+         assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
+         const unsigned groups =
+            (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
+         const unsigned group_size = dispatch_width / groups;
+         for (unsigned i = 0; i < groups; i++) {
+            const unsigned cluster = (i * group_size) / cluster_size;
+            const unsigned comp = cluster * cluster_size + (cluster_size - 1);
+            bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
+                                         component(scan, comp));
+         }
+      } else {
+         bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
+                  brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
+      }
+      break;
+   }
+
+   case nir_intrinsic_inclusive_scan:
+   case nir_intrinsic_exclusive_scan: {
+      fs_reg src = get_nir_src(instr->src[0]);
+      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
+
+      /* Figure out the source type */
+      src.type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
+                        nir_src_bit_size(instr->src[0])));
+
+      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
+      opcode brw_op = brw_op_for_nir_reduction_op(redop);
+      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
+
+      /* Set up a register for all of our scratching around and initialize it
+       * to reduction operation's identity value.
+       */
+      fs_reg scan = bld.vgrf(src.type);
+      const fs_builder allbld = bld.exec_all();
+      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
+
+      if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
+         /* Exclusive scan is a bit harder because we have to do an annoying
+          * shift of the contents before we can begin.  To make things worse,
+          * we can't do this with a normal stride; we have to use indirects.
+          */
+         fs_reg shifted = bld.vgrf(src.type);
+         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
+         allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
+                    brw_imm_w(-1));
+         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
+         allbld.group(1, 0).MOV(component(shifted, 0), identity);
+         scan = shifted;
+      }
+
+      bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
+
+      bld.MOV(retype(dest, src.type), scan);
+      break;
+   }
+
    default:
       unreachable("unknown intrinsic");
    }
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
index c41408104fa..7ad144bdfd5 100644
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@@ -590,6 +590,14 @@ brw_imm_df(double df)
 }
 
 static inline struct brw_reg
+brw_imm_u64(uint64_t u64)
+{
+   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UQ);
+   imm.u64 = u64;
+   return imm;
+}
+
+static inline struct brw_reg
 brw_imm_f(float f)
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F);
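
A note on the exclusive-scan path in the hunk above: the SHADER_OPCODE_SHUFFLE with idx = subgroup_invocation - 1, followed by seeding channel 0 with the identity, amounts to shifting every channel's value one slot to the right; running the ordinary inclusive scan over the shifted data then yields the exclusive result. A rough scalar sketch of that pre-shift, with hypothetical names (not backend code):

#include <stdint.h>

/* Rough model of the exclusive-scan pre-shift: every channel reads its left
 * neighbour and channel 0 is seeded with the reduction identity.
 */
static void
exclusive_prescan_shift(const uint32_t *scan, uint32_t *shifted,
                        unsigned width, uint32_t identity)
{
   for (unsigned chan = 0; chan < width; chan++)
      shifted[chan] = (chan == 0) ? identity : scan[chan - 1];
}
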