From 90c9f29518d32a29725b114f3b16ad8c62a812ff Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 29 Aug 2017 09:21:32 -0700 Subject: i965/fs: Add support for nir_intrinsic_shuffle Reviewed-by: Iago Toral Quiroga --- src/intel/compiler/brw_eu_defines.h | 9 +++ src/intel/compiler/brw_fs.cpp | 21 +++++++ src/intel/compiler/brw_fs.h | 5 ++ src/intel/compiler/brw_fs_generator.cpp | 104 ++++++++++++++++++++++++++++++++ src/intel/compiler/brw_fs_nir.cpp | 8 +++ src/intel/compiler/brw_nir.c | 1 + src/intel/compiler/brw_shader.cpp | 2 + 7 files changed, 150 insertions(+) (limited to 'src') diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 30e2e8f0708..3449c73d771 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -451,6 +451,15 @@ enum opcode { */ SHADER_OPCODE_BROADCAST, + /* Pick the channel from its first source register given by the index + * specified as second source. + * + * This is similar to the BROADCAST instruction except that it takes a + * dynamic index and potentially puts a different value in each output + * channel. + */ + SHADER_OPCODE_SHUFFLE, + SHADER_OPCODE_GET_BUFFER_SIZE, VEC4_OPCODE_MOV_BYTES, diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index fc4cfbfb0be..47f1f6e9c9f 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -310,6 +310,13 @@ fs_inst::has_source_and_destination_hazard() const case FS_OPCODE_PACK_HALF_2x16_SPLIT: /* Multiple partial writes to the destination */ return true; + case SHADER_OPCODE_SHUFFLE: + /* This instruction returns an arbitrary channel from the source and + * gets split into smaller instructions in the generator. It's possible + * that one of the instructions will read from a channel corresponding + * to an earlier instruction. 
+ */ + return true; default: /* The SIMD16 compressed instruction * @@ -2531,6 +2538,20 @@ fs_visitor::opt_algebraic() } break; + case SHADER_OPCODE_SHUFFLE: + if (is_uniform(inst->src[0])) { + inst->opcode = BRW_OPCODE_MOV; + inst->sources = 1; + progress = true; + } else if (inst->src[1].file == IMM) { + inst->opcode = BRW_OPCODE_MOV; + inst->src[0] = component(inst->src[0], + inst->src[1].ud); + inst->sources = 1; + progress = true; + } + break; + default: break; } diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index b0799a0f5e2..1b7df844696 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -471,6 +471,11 @@ private: struct brw_reg reg, struct brw_reg indirect_byte_offset); + void generate_shuffle(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg idx); + bool patch_discard_jumps_to_fb_writes(); const struct brw_compiler *compiler; diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index df34099713b..9b8f8ce683e 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -540,6 +540,106 @@ fs_generator::generate_mov_indirect(fs_inst *inst, } } +void +fs_generator::generate_shuffle(fs_inst *inst, + struct brw_reg dst, + struct brw_reg src, + struct brw_reg idx) +{ + /* Ivy bridge has some strange behavior that makes this a real pain to + * implement for 64-bit values so we just don't bother. + */ + assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4); + + /* Because we're using the address register, we're limited to 8-wide + * execution on gen7. On gen8, we're limited to 16-wide by the address + * register file and 8-wide for 64-bit types. We could try and make this + * instruction splittable higher up in the compiler but that gets weird + * because it reads all of the channels regardless of execution size. It's + * easier just to split it here. 
+ */ + const unsigned lower_width = + (devinfo->gen <= 7 || type_sz(src.type) > 4) ? + 8 : MIN2(16, inst->exec_size); + + brw_set_default_exec_size(p, cvt(lower_width) - 1); + for (unsigned group = 0; group < inst->exec_size; group += lower_width) { + brw_set_default_group(p, group); + + if ((src.vstride == 0 && src.hstride == 0) || + idx.file == BRW_IMMEDIATE_VALUE) { + /* Trivial, the source is already uniform or the index is a constant. + * We will typically not get here if the optimizer is doing its job, + * but asserting would be mean. + */ + const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0; + brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0)); + } else { + /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ + struct brw_reg addr = vec8(brw_address_reg(0)); + + struct brw_reg group_idx = suboffset(idx, group); + + if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) { + /* Things get grumpy if the register is too wide. */ + group_idx.width--; + group_idx.vstride--; + } + + assert(type_sz(group_idx.type) <= 4); + if (type_sz(group_idx.type) == 4) { + /* The destination stride of an instruction (in bytes) must be + * greater than or equal to the size of the rest of the + * instruction. Since the address register is of type UW, we + * can't use a D-type instruction. In order to get around this, + * we retype to UW and use a stride. + */ + group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W); + } + + /* Take into account the component size and horizontal stride.
*/ + assert(src.vstride == src.hstride + src.width); + brw_SHL(p, addr, group_idx, + brw_imm_uw(_mesa_logbase2(type_sz(src.type)) + + src.hstride - 1)); + + /* Add on the register start offset */ + brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr)); + + if (type_sz(src.type) > 4 && + ((devinfo->gen == 7 && !devinfo->is_haswell) || + devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) { + /* IVB has an issue (which we found empirically) where it reads + * two address register components per channel for indirectly + * addressed 64-bit sources. + * + * From the Cherryview PRM Vol 7. "Register Region Restrictions": + * + * "When source or destination datatype is 64b or operation is + * integer DWord multiply, indirect addressing must not be + * used." + * + * To work around both of these, we do two integer MOVs instead of + * one 64-bit MOV. Because no double value should ever cross a + * register boundary, it's safe to use the immediate offset in the + * indirect here to handle adding 4 bytes to the offset and avoid + * the extra ADD to the register file.
+ */ + struct brw_reg gdst = suboffset(dst, group); + struct brw_reg dst_d = retype(spread(gdst, 2), + BRW_REGISTER_TYPE_D); + brw_MOV(p, dst_d, + retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D)); + brw_MOV(p, byte_offset(dst_d, 4), + retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D)); + } else { + brw_MOV(p, suboffset(dst, group), + retype(brw_VxH_indirect(0, 0), src.type)); + } + } + } +} + void fs_generator::generate_urb_read(fs_inst *inst, struct brw_reg dst, @@ -2189,6 +2289,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) brw_broadcast(p, dst, src[0], src[1]); break; + case SHADER_OPCODE_SHUFFLE: + generate_shuffle(inst, dst, src[0], src[1]); + break; + case FS_OPCODE_SET_SAMPLE_ID: generate_set_sample_id(inst, dst, src[0], src[1]); break; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 71e871c500a..d2d32f95930 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4507,6 +4507,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_shuffle: { + const fs_reg value = get_nir_src(instr->src[0]); + const fs_reg index = get_nir_src(instr->src[1]); + + bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index); + break; + } + case nir_intrinsic_first_invocation: { fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp); diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index cbfafd4db02..dbad4a14b17 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -656,6 +656,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir) .lower_to_scalar = true, .lower_subgroup_masks = true, .lower_vote_trivial = !is_scalar, + .lower_shuffle = true, }; OPT(nir_lower_subgroups, &subgroups_options); diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 
abfad4e54c3..b1227e17e2c 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -330,6 +330,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op) return "find_live_channel"; case SHADER_OPCODE_BROADCAST: return "broadcast"; + case SHADER_OPCODE_SHUFFLE: + return "shuffle"; case SHADER_OPCODE_GET_BUFFER_SIZE: return "get_buffer_size"; -- cgit v1.2.3