summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/mesa/drivers/dri/i965/brw_defines.h 2
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs.cpp 39
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs.h 7
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_emit.cpp 54
4 files changed, 50 insertions, 52 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index d9b7f9aeb15..6414e69892d 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -727,7 +727,7 @@ enum opcode {
FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
FS_OPCODE_DISCARD_JUMP,
- FS_OPCODE_SET_GLOBAL_OFFSET,
+ FS_OPCODE_SET_SIMD4X2_OFFSET,
FS_OPCODE_PACK_HALF_2x16_SPLIT,
FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index b97a19e0510..5380abfe2f4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2461,6 +2461,11 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
* scheduling full flexibility, while the conversion to native instructions
* allows the post-register-allocation scheduler the best information
* possible.
+ *
+ * Note that execution masking for setting up pull constant loads is special:
+ * the channels that need to be written are unrelated to the current execution
+ * mask, since a later instruction will use one of the result channels as a
+ * source operand for all 8 or 16 of its channels.
*/
void
fs_visitor::lower_uniform_pull_constant_loads()
@@ -2477,26 +2482,24 @@ fs_visitor::lower_uniform_pull_constant_loads()
const_offset_reg.type == BRW_REGISTER_TYPE_UD);
const_offset_reg.imm.u /= 16;
fs_reg payload = fs_reg(this, glsl_type::uint_type);
- struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
- BRW_REGISTER_TYPE_UD);
-
- fs_inst *setup1 = MOV(payload, fs_reg(g0));
- setup1->force_writemask_all = true;
- /* We don't need the second half of this vgrf to be filled with g1
- * in the 16-wide case, but if we use force_uncompressed then live
- * variable analysis won't consider this a def!
+
+ /* This is actually going to be a MOV, but since only the first dword
+ * is accessed, we have a special opcode to do just that one. Note
+ * that this needs to be an operation that will be considered a def
+ * by live variable analysis, or register allocation will explode.
*/
+ fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
+ payload, const_offset_reg);
+ setup->force_writemask_all = true;
- fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
- payload, payload,
- const_offset_reg);
+ setup->ir = inst->ir;
+ setup->annotation = inst->annotation;
+ inst->insert_before(setup);
- setup1->ir = inst->ir;
- setup1->annotation = inst->annotation;
- inst->insert_before(setup1);
- setup2->ir = inst->ir;
- setup2->annotation = inst->annotation;
- inst->insert_before(setup2);
+ /* Similarly, this will only populate the first 4 channels of the
+ * result register (since we only use smear values from 0-3), but we
+ * don't tell the optimizer.
+ */
inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
inst->src[1] = payload;
@@ -2533,7 +2536,7 @@ fs_visitor::dump_instruction(fs_inst *inst)
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
printf("uniform_pull_const_gen7");
break;
- case FS_OPCODE_SET_GLOBAL_OFFSET:
+ case FS_OPCODE_SET_SIMD4X2_OFFSET:
printf("set_global_offset");
break;
default:
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index f7ccc7909e2..febd56bfe2e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -546,10 +546,9 @@ private:
struct brw_reg index,
struct brw_reg offset);
void generate_mov_dispatch_to_flags(fs_inst *inst);
- void generate_set_global_offset(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg src,
- struct brw_reg offset);
+ void generate_set_simd4x2_offset(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg offset);
void generate_discard_jump(fs_inst *inst);
void generate_pack_half_2x16_split(fs_inst *inst,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 2391ad12026..712fef6e093 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -647,6 +647,8 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
uint32_t surf_index = index.dw1.ud;
assert(offset.file == BRW_GENERAL_REGISTER_FILE);
+ /* Reference just the dword we need, to avoid angering validate_reg(). */
+ offset = brw_vec1_grf(offset.nr, 0);
brw_push_insn_state(p);
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
@@ -654,20 +656,22 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
brw_pop_insn_state(p);
+ /* We use the SIMD4x2 mode because we want to end up with 4 components in
+ * the destination loaded consecutively from the same offset (which appears
+ * in the first component, and the rest are ignored).
+ */
+ dst.width = BRW_WIDTH_4;
brw_set_dest(p, send, dst);
brw_set_src0(p, send, offset);
-
- uint32_t msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
- uint32_t msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ;
- bool header_present = true;
- brw_set_dp_read_message(p, send,
+ brw_set_sampler_message(p, send,
surf_index,
- msg_control,
- msg_type,
- BRW_DATAPORT_READ_TARGET_DATA_CACHE,
- 1,
- header_present,
- 1);
+ 0, /* LD message ignores sampler unit */
+ GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+ 1, /* rlen */
+ 1, /* mlen */
+ false, /* no header */
+ BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+ 0);
}
void
@@ -858,31 +862,23 @@ brw_reg_from_fs_reg(fs_reg *reg)
}
/**
- * Sets the second dword of a vgrf for gen7+ message setup.
+ * Sets the first dword of a vgrf for gen7+ simd4x2 uniform pull constant
+ * sampler LD messages.
*
- * For setting up gen7 messages in VGRFs, we need to be able to set the second
- * dword for some payloads where in the MRF world we'd have just used
- * brw_message_reg(). We don't want to bake it into the send message's code
- * generation because that means we don't get a chance to schedule the
- * instructions.
+ * We don't want to bake it into the send message's code generation because
+ * that means we don't get a chance to schedule the instructions.
*/
void
-fs_generator::generate_set_global_offset(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg src,
- struct brw_reg value)
+fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg value)
{
- /* We use a matching src and dst to get the information on how this
- * instruction works exposed to various optimization passes that would
- * otherwise treat it as completely overwriting the dst.
- */
- assert(src.file == dst.file && src.nr == dst.nr);
assert(value.file == BRW_IMMEDIATE_VALUE);
brw_push_insn_state(p);
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
brw_set_mask_control(p, BRW_MASK_DISABLE);
- brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 2), value.type), value);
+ brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
brw_pop_insn_state(p);
}
@@ -1298,8 +1294,8 @@ fs_generator::generate_code(exec_list *instructions)
brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
break;
- case FS_OPCODE_SET_GLOBAL_OFFSET:
- generate_set_global_offset(inst, dst, src[0], src[1]);
+ case FS_OPCODE_SET_SIMD4X2_OFFSET:
+ generate_set_simd4x2_offset(inst, dst, src[0]);
break;
case FS_OPCODE_PACK_HALF_2x16_SPLIT: