diff options
author | Eric Anholt <[email protected]> | 2010-10-13 20:17:15 -0700 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2010-10-14 10:42:55 -0700 |
commit | 4f88550ba0e1ad07e39903f268975921c0101e85 (patch) | |
tree | f8537e189a0786975a1de725a27861852b59d023 /src | |
parent | b8613d70da34217b98edb9ac9e0a4c9a6598d0b3 (diff) |
i965: Add a pass to the FS to split virtual GRFs to float channels.
Improves nexuiz performance 0.91% (+/- 0.54%, n=8)
Diffstat (limited to 'src')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 93 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.h | 25 |
2 files changed, 116 insertions, 2 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index e2c7dbde6a5..41081c3b634 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2111,6 +2111,7 @@ static void assign_reg(int *reg_hw_locations, fs_reg *reg) { if (reg->file == GRF && reg->reg != 0) { + assert(reg->reg_offset >= 0); reg->hw_reg = reg_hw_locations[reg->reg] + reg->reg_offset; reg->reg = 0; } @@ -2302,7 +2303,7 @@ fs_visitor::assign_regs() } } - assert(hw_reg != -1); + assert(hw_reg >= 0); hw_reg_mapping[i] = this->first_non_payload_grf + hw_reg; last_grf = MAX2(last_grf, hw_reg_mapping[i] + this->virtual_grf_sizes[i] - 1); @@ -2322,6 +2323,92 @@ fs_visitor::assign_regs() talloc_free(regs); } +/** + * Split large virtual GRFs into separate components if we can. + * + * This is mostly duplicated with what brw_fs_vector_splitting does, + * but that's really conservative because it's afraid of doing + * splitting that doesn't result in real progress after the rest of + * the optimization phases, which would cause infinite looping in + * optimization. We can do it once here, safely. This also has the + * opportunity to split interpolated values, or maybe even uniforms, + * which we don't have at the IR level. + * + * We want to split, because virtual GRFs are what we register + * allocate and spill (due to contiguousness requirements for some + * instructions), and they're what we naturally generate in the + * codegen process, but most virtual GRFs don't actually need to be + * contiguous sets of GRFs. If we split, we'll end up with reduced + * live intervals and better dead code elimination and coalescing. + */ +void +fs_visitor::split_virtual_grfs() +{ + int num_vars = this->virtual_grf_next; + bool split_grf[num_vars]; + int new_virtual_grf[num_vars]; + + /* Try to split any virtual GRF whose size is not exactly 1 (the test below is != 1, not > 0). 
*/ + for (int i = 0; i < num_vars; i++) { + if (this->virtual_grf_sizes[i] != 1) + split_grf[i] = true; + else + split_grf[i] = false; + } + + if (brw->has_pln) { + /* PLN opcodes rely on the delta_xy being contiguous. */ + split_grf[this->delta_x.reg] = false; + } + + foreach_iter(exec_list_iterator, iter, this->instructions) { + fs_inst *inst = (fs_inst *)iter.get(); + + /* Texturing produces 4 contiguous registers, so no splitting. */ + if ((inst->opcode == FS_OPCODE_TEX || + inst->opcode == FS_OPCODE_TXB || + inst->opcode == FS_OPCODE_TXL) && + inst->dst.file == GRF) { + split_grf[inst->dst.reg] = false; + } + } + + /* Allocate new space for split regs. Note that the virtual + * numbers will be contiguous. Offset 0 keeps the original register number; offset N >= 1 becomes new_virtual_grf[i] + N - 1. + */ + for (int i = 0; i < num_vars; i++) { + if (split_grf[i]) { + new_virtual_grf[i] = virtual_grf_alloc(1); + for (int j = 2; j < this->virtual_grf_sizes[i]; j++) { + int reg = virtual_grf_alloc(1); + assert(reg == new_virtual_grf[i] + j - 1); + } + this->virtual_grf_sizes[i] = 1; + } + } + + foreach_iter(exec_list_iterator, iter, this->instructions) { + fs_inst *inst = (fs_inst *)iter.get(); + + if (inst->dst.file == GRF && + split_grf[inst->dst.reg] && + inst->dst.reg_offset != 0) { + inst->dst.reg = (new_virtual_grf[inst->dst.reg] + + inst->dst.reg_offset - 1); + inst->dst.reg_offset = 0; + } + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == GRF && + split_grf[inst->src[i].reg] && + inst->src[i].reg_offset != 0) { + inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + + inst->src[i].reg_offset - 1); + inst->src[i].reg_offset = 0; + } + } + } +} + void fs_visitor::calculate_live_intervals() { @@ -3054,13 +3141,15 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) } v.emit_fb_writes(); + + v.split_virtual_grfs(); + v.assign_curb_setup(); v.assign_urb_setup(); bool progress; do { progress = false; - v.calculate_live_intervals(); progress = v.propagate_constants() || progress; progress = v.register_coalesce() || progress; diff 
--git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 929ac682b08..d0e84da1aad 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -213,6 +213,9 @@ public: init(); this->opcode = opcode; this->dst = dst; + + if (dst.file == GRF) + assert(dst.reg_offset >= 0); } fs_inst(int opcode, fs_reg dst, fs_reg src0) @@ -221,6 +224,11 @@ public: this->opcode = opcode; this->dst = dst; this->src[0] = src0; + + if (dst.file == GRF) + assert(dst.reg_offset >= 0); + if (src[0].file == GRF) + assert(src[0].reg_offset >= 0); } fs_inst(int opcode, fs_reg dst, fs_reg src0, fs_reg src1) @@ -230,6 +238,13 @@ public: this->dst = dst; this->src[0] = src0; this->src[1] = src1; + + if (dst.file == GRF) + assert(dst.reg_offset >= 0); + if (src[0].file == GRF) + assert(src[0].reg_offset >= 0); + if (src[1].file == GRF) + assert(src[1].reg_offset >= 0); } fs_inst(int opcode, fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) @@ -240,6 +255,15 @@ public: this->src[0] = src0; this->src[1] = src1; this->src[2] = src2; + + if (dst.file == GRF) + assert(dst.reg_offset >= 0); + if (src[0].file == GRF) + assert(src[0].reg_offset >= 0); + if (src[1].file == GRF) + assert(src[1].reg_offset >= 0); + if (src[2].file == GRF) + assert(src[2].reg_offset >= 0); } int opcode; /* BRW_OPCODE_* or FS_OPCODE_* */ @@ -336,6 +360,7 @@ public: void assign_urb_setup(); void assign_regs(); void assign_regs_trivial(); + void split_virtual_grfs(); void calculate_live_intervals(); bool propagate_constants(); bool register_coalesce(); |