diff options
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 225 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.h | 4 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 42 |
3 files changed, 229 insertions, 42 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 8dab4317c10..c1ccd92c2da 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -258,6 +258,26 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index, return instructions; } +/** + * A helper for MOV generation for fixing up broken hardware SEND dependency + * handling. + */ +fs_inst * +fs_visitor::DEP_RESOLVE_MOV(int grf) +{ + fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F)); + + inst->ir = NULL; + inst->annotation = "send dependency resolve"; + + /* The caller always wants uncompressed to emit the minimal extra + * dependencies, and to avoid having to deal with aligning its regs to 2. + */ + inst->force_uncompressed = true; + + return inst; +} + bool fs_inst::equals(fs_inst *inst) { @@ -2228,6 +2248,205 @@ fs_visitor::remove_duplicate_mrf_writes() return progress; } +static void +clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps, + int first_grf, int grf_len) +{ + bool inst_16wide = (dispatch_width > 8 && + !inst->force_uncompressed && + !inst->force_sechalf); + + /* Clear the flag for registers that actually got read (as expected). */ + for (int i = 0; i < 3; i++) { + int grf; + if (inst->src[i].file == GRF) { + grf = inst->src[i].reg; + } else if (inst->src[i].file == FIXED_HW_REG && + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { + grf = inst->src[i].fixed_hw_reg.nr; + } else { + continue; + } + + if (grf >= first_grf && + grf < first_grf + grf_len) { + deps[grf - first_grf] = false; + if (inst_16wide) + deps[grf - first_grf + 1] = false; + } + } +} + +/** + * Implements this workaround for the original 965: + * + * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not + * check for post destination dependencies on this instruction, software + * must ensure that there is no destination hazard for the case of ‘write + * followed by a posted write’ shown in the following example. + * + * 1. mov r3 0 + * 2. send r3.xy <rest of send instruction> + * 3. mov r2 r3 + * + * Due to no post-destination dependency check on the ‘send’, the above + * code sequence could have two instructions (1 and 2) in flight at the + * same time that both consider ‘r3’ as the target of their final writes. + */ +void +fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst) +{ + int write_len = inst->regs_written() * dispatch_width / 8; + int first_write_grf = inst->dst.reg; + bool needs_dep[BRW_MAX_MRF]; + assert(write_len < (int)sizeof(needs_dep) - 1); + + memset(needs_dep, false, sizeof(needs_dep)); + memset(needs_dep, true, write_len); + + clear_deps_for_inst_src(inst, dispatch_width, + needs_dep, first_write_grf, write_len); + + /* Walk backwards looking for writes to registers we're writing which + * aren't read since being written. If we hit the start of the program, + * we assume that there are no outstanding dependencies on entry to the + * program. + */ + for (fs_inst *scan_inst = (fs_inst *)inst->prev; + scan_inst != NULL; + scan_inst = (fs_inst *)scan_inst->prev) { + + /* If we hit control flow, assume that there *are* outstanding + * dependencies, and force their cleanup before our instruction. + */ + if (scan_inst->is_control_flow()) { + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) { + inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i)); + } + } + } + + bool scan_inst_16wide = (dispatch_width > 8 && + !scan_inst->force_uncompressed && + !scan_inst->force_sechalf); + + /* We insert our reads as late as possible on the assumption that any + * instruction but a MOV that might have left us an outstanding + * dependency has more latency than a MOV. + */ + if (scan_inst->dst.file == GRF && + scan_inst->dst.reg >= first_write_grf && + scan_inst->dst.reg < first_write_grf + write_len && + needs_dep[scan_inst->dst.reg - first_write_grf]) { + inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg)); + needs_dep[scan_inst->dst.reg - first_write_grf] = false; + if (scan_inst_16wide) + needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false; + } + + /* Clear the flag for registers that actually got read (as expected). */ + clear_deps_for_inst_src(scan_inst, dispatch_width, + needs_dep, first_write_grf, write_len); + + /* Continue the loop only if we haven't resolved all the dependencies */ + int i; + for (i = 0; i < write_len; i++) { + if (needs_dep[i]) + break; + } + if (i == write_len) + return; + } +} + +/** + * Implements this workaround for the original 965: + * + * "[DevBW, DevCL] Errata: A destination register from a send can not be + * used as a destination register until after it has been sourced by an + * instruction with a different destination register. + */ +void +fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst) +{ + int write_len = inst->regs_written() * dispatch_width / 8; + int first_write_grf = inst->dst.reg; + bool needs_dep[BRW_MAX_MRF]; + assert(write_len < (int)sizeof(needs_dep) - 1); + + memset(needs_dep, false, sizeof(needs_dep)); + memset(needs_dep, true, write_len); + /* Walk forwards looking for writes to registers we're writing which aren't + * read before being written. + */ + for (fs_inst *scan_inst = (fs_inst *)inst->next; + !scan_inst->is_tail_sentinel(); + scan_inst = (fs_inst *)scan_inst->next) { + /* If we hit control flow, force resolve all remaining dependencies. */ + if (scan_inst->is_control_flow()) { + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) + scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i)); + } + } + + /* Clear the flag for registers that actually got read (as expected). */ + clear_deps_for_inst_src(scan_inst, dispatch_width, + needs_dep, first_write_grf, write_len); + + /* We insert our reads as late as possible since they're reading the + * result of a SEND, which has massive latency. + */ + if (scan_inst->dst.file == GRF && + scan_inst->dst.reg >= first_write_grf && + scan_inst->dst.reg < first_write_grf + write_len && + needs_dep[scan_inst->dst.reg - first_write_grf]) { + scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg)); + needs_dep[scan_inst->dst.reg - first_write_grf] = false; + } + + /* Continue the loop only if we haven't resolved all the dependencies */ + int i; + for (i = 0; i < write_len; i++) { + if (needs_dep[i]) + break; + } + if (i == write_len) + return; + } + + /* If we hit the end of the program, resolve all remaining dependencies out + * of paranoia. + */ + fs_inst *last_inst = (fs_inst *)this->instructions.get_tail(); + assert(last_inst->eot); + for (int i = 0; i < write_len; i++) { + if (needs_dep[i]) + last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i)); + } +} + +void +fs_visitor::insert_gen4_send_dependency_workarounds() +{ + if (intel->gen != 4 || intel->is_g4x) + return; + + /* Note that we're done with register allocation, so GRF fs_regs always + * have a .reg_offset of 0. + */ + + foreach_list_safe(node, &this->instructions) { + fs_inst *inst = (fs_inst *)node; + + if (inst->mlen != 0 && inst->dst.file == GRF) { + insert_gen4_pre_send_dependency_workarounds(inst); + insert_gen4_post_send_dependency_workarounds(inst); + } + } +} + void fs_visitor::dump_instruction(fs_inst *inst) { @@ -2522,6 +2741,12 @@ fs_visitor::run() assert(force_uncompressed_stack == 0); assert(force_sechalf_stack == 0); + /* This must come after all optimization and register allocation, since + * it inserts dead code that happens to have side effects, and it does + * so based on the actual physical registers in use. + */ + insert_gen4_send_dependency_workarounds(); + if (failed) return false; diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 88fecb90494..d5ebd515cbb 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -285,6 +285,7 @@ public: fs_inst *IF(fs_reg src0, fs_reg src1, uint32_t condition); fs_inst *CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition); + fs_inst *DEP_RESOLVE_MOV(int grf); int type_size(const struct glsl_type *type); fs_inst *get_instruction_generating_reg(fs_inst *start, @@ -329,6 +330,9 @@ public: bool remove_duplicate_mrf_writes(); bool virtual_grf_interferes(int a, int b); void schedule_instructions(bool post_reg_alloc); + void insert_gen4_send_dependency_workarounds(); + void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst); + void insert_gen4_post_send_dependency_workarounds(fs_inst *inst); void fail(const char *msg, ...); void push_force_uncompressed(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp index 62e57c98188..3d1f3b356a8 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp @@ -604,29 +604,8 @@ fs_generator::generate_unspill(fs_inst *inst, struct brw_reg dst) { assert(inst->mlen != 0); - /* Clear any post destination dependencies that would be ignored by - * the block read. See the B-Spec for pre-gen5 send instruction. - * - * This could use a better solution, since texture sampling and - * math reads could potentially run into it as well -- anywhere - * that we have a SEND with a destination that is a register that - * was written but not read within the last N instructions (what's - * N? unsure). This is rare because of dead code elimination, but - * not impossible. - */ - if (intel->gen == 4 && !intel->is_g4x) - brw_MOV(p, brw_null_reg(), dst); - brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1, inst->offset); - - if (intel->gen == 4 && !intel->is_g4x) { - /* gen4 errata: destination from a send can't be used as a - * destination until it's been read. Just read it so we don't - * have to worry. - */ - brw_MOV(p, brw_null_reg(), dst); - } } void @@ -637,19 +616,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, { assert(inst->mlen != 0); - /* Clear any post destination dependencies that would be ignored by - * the block read. See the B-Spec for pre-gen5 send instruction. - * - * This could use a better solution, since texture sampling and - * math reads could potentially run into it as well -- anywhere - * that we have a SEND with a destination that is a register that - * was written but not read within the last N instructions (what's - * N? unsure). This is rare because of dead code elimination, but - * not impossible. - */ - if (intel->gen == 4 && !intel->is_g4x) - brw_MOV(p, brw_null_reg(), dst); - assert(index.file == BRW_IMMEDIATE_VALUE && index.type == BRW_REGISTER_TYPE_UD); uint32_t surf_index = index.dw1.ud; @@ -660,14 +626,6 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), read_offset, surf_index); - - if (intel->gen == 4 && !intel->is_g4x) { - /* gen4 errata: destination from a send can't be used as a - * destination until it's been read. Just read it so we don't - * have to worry. - */ - brw_MOV(p, brw_null_reg(), dst); - } } void |