summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.cpp225
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.h4
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_emit.cpp42
3 files changed, 229 insertions, 42 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8dab4317c10..c1ccd92c2da 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -258,6 +258,26 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
return instructions;
}
+/**
+ * A helper for MOV generation for fixing up broken hardware SEND dependency
+ * handling.
+ */
+fs_inst *
+fs_visitor::DEP_RESOLVE_MOV(int grf)
+{
+ fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
+
+ inst->ir = NULL;
+ inst->annotation = "send dependency resolve";
+
+ /* The caller always wants uncompressed to emit the minimal extra
+ * dependencies, and to avoid having to deal with aligning its regs to 2.
+ */
+ inst->force_uncompressed = true;
+
+ return inst;
+}
+
bool
fs_inst::equals(fs_inst *inst)
{
@@ -2228,6 +2248,205 @@ fs_visitor::remove_duplicate_mrf_writes()
return progress;
}
+static void
+clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
+ int first_grf, int grf_len)
+{
+ bool inst_16wide = (dispatch_width > 8 &&
+ !inst->force_uncompressed &&
+ !inst->force_sechalf);
+
+ /* Clear the flag for registers that actually got read (as expected). */
+ for (int i = 0; i < 3; i++) {
+ int grf;
+ if (inst->src[i].file == GRF) {
+ grf = inst->src[i].reg;
+ } else if (inst->src[i].file == FIXED_HW_REG &&
+ inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+ grf = inst->src[i].fixed_hw_reg.nr;
+ } else {
+ continue;
+ }
+
+ if (grf >= first_grf &&
+ grf < first_grf + grf_len) {
+ deps[grf - first_grf] = false;
+ if (inst_16wide)
+ deps[grf - first_grf + 1] = false;
+ }
+ }
+}
+
+/**
+ * Implements this workaround for the original 965:
+ *
+ * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
+ * check for post destination dependencies on this instruction, software
+ * must ensure that there is no destination hazard for the case of ‘write
+ * followed by a posted write’ shown in the following example.
+ *
+ * 1. mov r3 0
+ * 2. send r3.xy <rest of send instruction>
+ * 3. mov r2 r3
+ *
+ * Due to no post-destination dependency check on the ‘send’, the above
+ * code sequence could have two instructions (1 and 2) in flight at the
+ * same time that both consider ‘r3’ as the target of their final writes.
+ */
+void
+fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
+{
+ int write_len = inst->regs_written() * dispatch_width / 8;
+ int first_write_grf = inst->dst.reg;
+ bool needs_dep[BRW_MAX_MRF];
+ assert(write_len < (int)sizeof(needs_dep) - 1);
+
+ memset(needs_dep, false, sizeof(needs_dep));
+ memset(needs_dep, true, write_len);
+
+ clear_deps_for_inst_src(inst, dispatch_width,
+ needs_dep, first_write_grf, write_len);
+
+ /* Walk backwards looking for writes to registers we're writing which
+ * aren't read since being written. If we hit the start of the program,
+ * we assume that there are no outstanding dependencies on entry to the
+ * program.
+ */
+ for (fs_inst *scan_inst = (fs_inst *)inst->prev;
+ scan_inst != NULL;
+ scan_inst = (fs_inst *)scan_inst->prev) {
+
+ /* If we hit control flow, assume that there *are* outstanding
+ * dependencies, and force their cleanup before our instruction.
+ */
+ if (scan_inst->is_control_flow()) {
+ for (int i = 0; i < write_len; i++) {
+ if (needs_dep[i]) {
+ inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
+ }
+ }
+ }
+
+ bool scan_inst_16wide = (dispatch_width > 8 &&
+ !scan_inst->force_uncompressed &&
+ !scan_inst->force_sechalf);
+
+ /* We insert our reads as late as possible on the assumption that any
+ * instruction but a MOV that might have left us an outstanding
+ * dependency has more latency than a MOV.
+ */
+ if (scan_inst->dst.file == GRF &&
+ scan_inst->dst.reg >= first_write_grf &&
+ scan_inst->dst.reg < first_write_grf + write_len &&
+ needs_dep[scan_inst->dst.reg - first_write_grf]) {
+ inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
+ needs_dep[scan_inst->dst.reg - first_write_grf] = false;
+ if (scan_inst_16wide)
+ needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
+ }
+
+ /* Clear the flag for registers that actually got read (as expected). */
+ clear_deps_for_inst_src(scan_inst, dispatch_width,
+ needs_dep, first_write_grf, write_len);
+
+ /* Continue the loop only if we haven't resolved all the dependencies */
+ int i;
+ for (i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ break;
+ }
+ if (i == write_len)
+ return;
+ }
+}
+
+/**
+ * Implements this workaround for the original 965:
+ *
+ * "[DevBW, DevCL] Errata: A destination register from a send can not be
+ * used as a destination register until after it has been sourced by an
+ * instruction with a different destination register.
+ */
+void
+fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
+{
+ int write_len = inst->regs_written() * dispatch_width / 8;
+ int first_write_grf = inst->dst.reg;
+ bool needs_dep[BRW_MAX_MRF];
+ assert(write_len < (int)sizeof(needs_dep) - 1);
+
+ memset(needs_dep, false, sizeof(needs_dep));
+ memset(needs_dep, true, write_len);
+ /* Walk forwards looking for writes to registers we're writing which aren't
+ * read before being written.
+ */
+ for (fs_inst *scan_inst = (fs_inst *)inst->next;
+ !scan_inst->is_tail_sentinel();
+ scan_inst = (fs_inst *)scan_inst->next) {
+ /* If we hit control flow, force resolve all remaining dependencies. */
+ if (scan_inst->is_control_flow()) {
+ for (int i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
+ }
+ }
+
+ /* Clear the flag for registers that actually got read (as expected). */
+ clear_deps_for_inst_src(scan_inst, dispatch_width,
+ needs_dep, first_write_grf, write_len);
+
+ /* We insert our reads as late as possible since they're reading the
+ * result of a SEND, which has massive latency.
+ */
+ if (scan_inst->dst.file == GRF &&
+ scan_inst->dst.reg >= first_write_grf &&
+ scan_inst->dst.reg < first_write_grf + write_len &&
+ needs_dep[scan_inst->dst.reg - first_write_grf]) {
+ scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
+ needs_dep[scan_inst->dst.reg - first_write_grf] = false;
+ }
+
+ /* Continue the loop only if we haven't resolved all the dependencies */
+ int i;
+ for (i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ break;
+ }
+ if (i == write_len)
+ return;
+ }
+
+ /* If we hit the end of the program, resolve all remaining dependencies out
+ * of paranoia.
+ */
+ fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
+ assert(last_inst->eot);
+ for (int i = 0; i < write_len; i++) {
+ if (needs_dep[i])
+ last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
+ }
+}
+
+void
+fs_visitor::insert_gen4_send_dependency_workarounds()
+{
+ if (intel->gen != 4 || intel->is_g4x)
+ return;
+
+ /* Note that we're done with register allocation, so GRF fs_regs always
+ * have a .reg_offset of 0.
+ */
+
+ foreach_list_safe(node, &this->instructions) {
+ fs_inst *inst = (fs_inst *)node;
+
+ if (inst->mlen != 0 && inst->dst.file == GRF) {
+ insert_gen4_pre_send_dependency_workarounds(inst);
+ insert_gen4_post_send_dependency_workarounds(inst);
+ }
+ }
+}
+
void
fs_visitor::dump_instruction(fs_inst *inst)
{
@@ -2522,6 +2741,12 @@ fs_visitor::run()
assert(force_uncompressed_stack == 0);
assert(force_sechalf_stack == 0);
+ /* This must come after all optimization and register allocation, since
+ * it inserts dead code that happens to have side effects, and it does
+ * so based on the actual physical registers in use.
+ */
+ insert_gen4_send_dependency_workarounds();
+
if (failed)
return false;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 88fecb90494..d5ebd515cbb 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -285,6 +285,7 @@ public:
fs_inst *IF(fs_reg src0, fs_reg src1, uint32_t condition);
fs_inst *CMP(fs_reg dst, fs_reg src0, fs_reg src1,
uint32_t condition);
+ fs_inst *DEP_RESOLVE_MOV(int grf);
int type_size(const struct glsl_type *type);
fs_inst *get_instruction_generating_reg(fs_inst *start,
@@ -329,6 +330,9 @@ public:
bool remove_duplicate_mrf_writes();
bool virtual_grf_interferes(int a, int b);
void schedule_instructions(bool post_reg_alloc);
+ void insert_gen4_send_dependency_workarounds();
+ void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
+ void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
void fail(const char *msg, ...);
void push_force_uncompressed();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 62e57c98188..3d1f3b356a8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -604,29 +604,8 @@ fs_generator::generate_unspill(fs_inst *inst, struct brw_reg dst)
{
assert(inst->mlen != 0);
- /* Clear any post destination dependencies that would be ignored by
- * the block read. See the B-Spec for pre-gen5 send instruction.
- *
- * This could use a better solution, since texture sampling and
- * math reads could potentially run into it as well -- anywhere
- * that we have a SEND with a destination that is a register that
- * was written but not read within the last N instructions (what's
- * N? unsure). This is rare because of dead code elimination, but
- * not impossible.
- */
- if (intel->gen == 4 && !intel->is_g4x)
- brw_MOV(p, brw_null_reg(), dst);
-
brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
inst->offset);
-
- if (intel->gen == 4 && !intel->is_g4x) {
- /* gen4 errata: destination from a send can't be used as a
- * destination until it's been read. Just read it so we don't
- * have to worry.
- */
- brw_MOV(p, brw_null_reg(), dst);
- }
}
void
@@ -637,19 +616,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
{
assert(inst->mlen != 0);
- /* Clear any post destination dependencies that would be ignored by
- * the block read. See the B-Spec for pre-gen5 send instruction.
- *
- * This could use a better solution, since texture sampling and
- * math reads could potentially run into it as well -- anywhere
- * that we have a SEND with a destination that is a register that
- * was written but not read within the last N instructions (what's
- * N? unsure). This is rare because of dead code elimination, but
- * not impossible.
- */
- if (intel->gen == 4 && !intel->is_g4x)
- brw_MOV(p, brw_null_reg(), dst);
-
assert(index.file == BRW_IMMEDIATE_VALUE &&
index.type == BRW_REGISTER_TYPE_UD);
uint32_t surf_index = index.dw1.ud;
@@ -660,14 +626,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
read_offset, surf_index);
-
- if (intel->gen == 4 && !intel->is_g4x) {
- /* gen4 errata: destination from a send can't be used as a
- * destination until it's been read. Just read it so we don't
- * have to worry.
- */
- brw_MOV(p, brw_null_reg(), dst);
- }
}
void