summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJason Ekstrand <[email protected]>2017-08-29 09:21:32 -0700
committerJason Ekstrand <[email protected]>2018-03-07 12:13:47 -0800
commit90c9f29518d32a29725b114f3b16ad8c62a812ff (patch)
tree236302586f2998c6788f92dd903b5341f3d61ae8
parent8256ee3fa363064ac3bd824d436aced81c61d23f (diff)
i965/fs: Add support for nir_intrinsic_shuffle
Reviewed-by: Iago Toral Quiroga <[email protected]>
-rw-r--r--src/intel/compiler/brw_eu_defines.h9
-rw-r--r--src/intel/compiler/brw_fs.cpp21
-rw-r--r--src/intel/compiler/brw_fs.h5
-rw-r--r--src/intel/compiler/brw_fs_generator.cpp104
-rw-r--r--src/intel/compiler/brw_fs_nir.cpp8
-rw-r--r--src/intel/compiler/brw_nir.c1
-rw-r--r--src/intel/compiler/brw_shader.cpp2
7 files changed, 150 insertions, 0 deletions
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 30e2e8f0708..3449c73d771 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -451,6 +451,15 @@ enum opcode {
*/
SHADER_OPCODE_BROADCAST,
+ /* Pick the channel from its first source register given by the index
+ * specified as second source.
+ *
+ * This is similar to the BROADCAST instruction except that it takes a
+ * dynamic index and potentially puts a different value in each output
+ * channel.
+ */
+ SHADER_OPCODE_SHUFFLE,
+
SHADER_OPCODE_GET_BUFFER_SIZE,
VEC4_OPCODE_MOV_BYTES,
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index fc4cfbfb0be..47f1f6e9c9f 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -310,6 +310,13 @@ fs_inst::has_source_and_destination_hazard() const
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
/* Multiple partial writes to the destination */
return true;
+ case SHADER_OPCODE_SHUFFLE:
+ /* This instruction returns an arbitrary channel from the source and
+ * gets split into smaller instructions in the generator. It's possible
+ * that one of the instructions will read from a channel corresponding
+ * to an earlier instruction.
+ */
+ return true;
default:
/* The SIMD16 compressed instruction
*
@@ -2531,6 +2538,20 @@ fs_visitor::opt_algebraic()
}
break;
+ case SHADER_OPCODE_SHUFFLE:
+ if (is_uniform(inst->src[0])) {
+ inst->opcode = BRW_OPCODE_MOV;
+ inst->sources = 1;
+ progress = true;
+ } else if (inst->src[1].file == IMM) {
+ inst->opcode = BRW_OPCODE_MOV;
+ inst->src[0] = component(inst->src[0],
+ inst->src[1].ud);
+ inst->sources = 1;
+ progress = true;
+ }
+ break;
+
default:
break;
}
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index b0799a0f5e2..1b7df844696 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -471,6 +471,11 @@ private:
struct brw_reg reg,
struct brw_reg indirect_byte_offset);
+ void generate_shuffle(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src,
+ struct brw_reg idx);
+
bool patch_discard_jumps_to_fb_writes();
const struct brw_compiler *compiler;
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index df34099713b..9b8f8ce683e 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -541,6 +541,106 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
}
void
+fs_generator::generate_shuffle(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src,
+ struct brw_reg idx)
+{
+ /* Ivy Bridge has some strange behavior that makes this a real pain to
+ * implement for 64-bit values so we just don't bother.
+ */
+ assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4);
+
+ /* Because we're using the address register, we're limited to 8-wide
+ * execution on gen7. On gen8+, we're limited to 16-wide by the address
+ * register file and 8-wide for 64-bit types. We could try and make this
+ * instruction splittable higher up in the compiler but that gets weird
+ * because it reads all of the channels regardless of execution size. It's
+ * easier just to split it here.
+ */
+ const unsigned lower_width =
+ (devinfo->gen <= 7 || type_sz(src.type) > 4) ?
+ 8 : MIN2(16, inst->exec_size);
+
+ brw_set_default_exec_size(p, cvt(lower_width) - 1);
+ for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
+ brw_set_default_group(p, group);
+
+ if ((src.vstride == 0 && src.hstride == 0) ||
+ idx.file == BRW_IMMEDIATE_VALUE) {
+ /* Trivial, the source is already uniform or the index is a constant.
+ * We will typically not get here if the optimizer is doing its job,
+ * but asserting would be mean.
+ */
+ const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
+ brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0));
+ } else {
+ /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
+ struct brw_reg addr = vec8(brw_address_reg(0));
+
+ struct brw_reg group_idx = suboffset(idx, group);
+
+ if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
+ /* Things get grumpy if the register is too wide. */
+ group_idx.width--;
+ group_idx.vstride--;
+ }
+
+ assert(type_sz(group_idx.type) <= 4);
+ if (type_sz(group_idx.type) == 4) {
+ /* The destination stride of an instruction (in bytes) must be
+ * greater than or equal to the size of the rest of the
+ * instruction. Since the address register is of type UW, we
+ * can't use a D-type instruction. In order to get around this,
+ * we retype to W and use a stride.
+ */
+ group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
+ }
+
+ /* Take into account the component size and horizontal stride. */
+ assert(src.vstride == src.hstride + src.width);
+ brw_SHL(p, addr, group_idx,
+ brw_imm_uw(_mesa_logbase2(type_sz(src.type)) +
+ src.hstride - 1));
+
+ /* Add on the register start offset */
+ brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
+
+ if (type_sz(src.type) > 4 &&
+ ((devinfo->gen == 7 && !devinfo->is_haswell) ||
+ devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+ /* IVB has an issue (which we found empirically) where it reads
+ * two address register components per channel for indirectly
+ * addressed 64-bit sources.
+ *
+ * From the Cherryview PRM Vol 7. "Register Region Restrictions":
+ *
+ * "When source or destination datatype is 64b or operation is
+ * integer DWord multiply, indirect addressing must not be
+ * used."
+ *
+ * To work around both of these, we do two integer MOVs instead of
+ * one 64-bit MOV. Because no double value should ever cross a
+ * register boundary, it's safe to use the immediate offset in the
+ * indirect here to handle adding 4 bytes to the offset and avoid
+ * the extra ADD to the register file.
+ */
+ struct brw_reg gdst = suboffset(dst, group);
+ struct brw_reg dst_d = retype(spread(gdst, 2),
+ BRW_REGISTER_TYPE_D);
+ brw_MOV(p, dst_d,
+ retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
+ brw_MOV(p, byte_offset(dst_d, 4),
+ retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
+ } else {
+ brw_MOV(p, suboffset(dst, group),
+ retype(brw_VxH_indirect(0, 0), src.type));
+ }
+ }
+ }
+}
+
+void
fs_generator::generate_urb_read(fs_inst *inst,
struct brw_reg dst,
struct brw_reg header)
@@ -2189,6 +2289,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
brw_broadcast(p, dst, src[0], src[1]);
break;
+ case SHADER_OPCODE_SHUFFLE:
+ generate_shuffle(inst, dst, src[0], src[1]);
+ break;
+
case FS_OPCODE_SET_SAMPLE_ID:
generate_set_sample_id(inst, dst, src[0], src[1]);
break;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 71e871c500a..d2d32f95930 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4507,6 +4507,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
break;
}
+ case nir_intrinsic_shuffle: {
+ const fs_reg value = get_nir_src(instr->src[0]);
+ const fs_reg index = get_nir_src(instr->src[1]);
+
+ bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
+ break;
+ }
+
case nir_intrinsic_first_invocation: {
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index cbfafd4db02..dbad4a14b17 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -656,6 +656,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
.lower_to_scalar = true,
.lower_subgroup_masks = true,
.lower_vote_trivial = !is_scalar,
+ .lower_shuffle = true,
};
OPT(nir_lower_subgroups, &subgroups_options);
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index abfad4e54c3..b1227e17e2c 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -330,6 +330,8 @@ brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
return "find_live_channel";
case SHADER_OPCODE_BROADCAST:
return "broadcast";
+ case SHADER_OPCODE_SHUFFLE:
+ return "shuffle";
case SHADER_OPCODE_GET_BUFFER_SIZE:
return "get_buffer_size";