diff options
author | Eric Anholt <[email protected]> | 2013-02-15 19:26:48 -0800 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2013-02-19 10:33:32 -0800 |
commit | aebd3f46e305829ebfcc817cafa8592edc2f80ab (patch) | |
tree | 4f302b1e1cb26b1c663c243e4f34be495964b590 /src | |
parent | 49bdebad3857bb9ebac53f593d08f0057f5a20d3 (diff) |
i965/fs: Delay setup of uniform loads until after pre-regalloc scheduling.
This should fix the register allocation explosion on the GLES 3.0 test
on gen6. It also gives us an instruction that will fit our CSE handling.
Reviewed-by: Kenneth Graunke <[email protected]>
NOTE: This is a candidate for the 9.1 branch.
Diffstat (limited to 'src')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 64 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.h | 1 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 28 |
3 files changed, 66 insertions, 27 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 35cdc6a02e4..f3232b29271 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1710,8 +1710,6 @@ fs_visitor::setup_pull_constants()
                                     dst, index, offset);
             pull->ir = inst->ir;
             pull->annotation = inst->annotation;
-            pull->base_mrf = 14;
-            pull->mlen = 1;
 
             inst->insert_before(pull);
 
@@ -2447,6 +2445,66 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
    }
 }
 
+/**
+ * Turns the generic expression-style uniform pull constant load instruction
+ * into a hardware-specific series of instructions for loading a pull
+ * constant.
+ *
+ * The expression style allows the CSE pass before this to optimize out
+ * repeated loads from the same offset, and gives the pre-register-allocation
+ * scheduling full flexibility, while the conversion to native instructions
+ * allows the post-register-allocation scheduler the best information
+ * possible.
+ */
+void
+fs_visitor::lower_uniform_pull_constant_loads()
+{
+   foreach_list(node, &this->instructions) {
+      fs_inst *inst = (fs_inst *)node;
+
+      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
+         continue;
+
+      if (intel->gen >= 7) {
+         fs_reg const_offset_reg = inst->src[1];
+         assert(const_offset_reg.file == IMM &&
+                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
+         const_offset_reg.imm.u /= 16;
+         fs_reg payload = fs_reg(this, glsl_type::uint_type);
+         struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
+                                    BRW_REGISTER_TYPE_UD);
+
+         fs_inst *setup1 = MOV(payload, fs_reg(g0));
+         setup1->force_writemask_all = true;
+         /* We don't need the second half of this vgrf to be filled with g1
+          * in the 16-wide case, but if we use force_uncompressed then live
+          * variable analysis won't consider this a def!
+          */
+
+         fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
+                                                payload, payload,
+                                                const_offset_reg);
+
+         setup1->ir = inst->ir;
+         setup1->annotation = inst->annotation;
+         inst->insert_before(setup1);
+         setup2->ir = inst->ir;
+         setup2->annotation = inst->annotation;
+         inst->insert_before(setup2);
+         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
+         inst->src[1] = payload;
+      } else {
+         /* Before register allocation, we didn't tell the scheduler about the
+          * MRF we use. We know it's safe to use this MRF because nothing
+          * else does except for register spill/unspill, which generates and
+          * uses its MRF within a single IR instruction.
+          */
+         inst->base_mrf = 14;
+         inst->mlen = 1;
+      }
+   }
+}
+
 void
 fs_visitor::dump_instruction(fs_inst *inst)
 {
@@ -2748,6 +2806,8 @@ fs_visitor::run()
 
       schedule_instructions(false);
 
+      lower_uniform_pull_constant_loads();
+
       assign_curb_setup();
       assign_urb_setup();
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index d5ebd515cbb..d1bb111bf5f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -334,6 +334,7 @@ public:
    void insert_gen4_pre_send_dependency_workarounds(fs_inst *inst);
    void insert_gen4_post_send_dependency_workarounds(fs_inst *inst);
    void fail(const char *msg, ...);
+   void lower_uniform_pull_constant_loads();
 
    void push_force_uncompressed();
    void pop_force_uncompressed();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index d4f6fc9ca7e..573921cf8cc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -597,31 +597,9 @@ fs_visitor::visit(ir_expression *ir)
          fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
          packed_consts.type = result.type;
 
-         if (intel->gen >= 7) {
-            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] / 16);
-            fs_reg payload = fs_reg(this, glsl_type::uint_type);
-            struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
-                                       BRW_REGISTER_TYPE_UD);
-            fs_inst *setup = emit(MOV(payload, fs_reg(g0)));
-            setup->force_writemask_all = true;
-            /* We don't need the second half of this vgrf to be filled with g1
-             * in the 16-wide case, but if we use force_uncompressed then live
-             * variable analysis won't consider this a def!
-             */
-
-            emit(FS_OPCODE_SET_GLOBAL_OFFSET, payload,
-                 payload, const_offset_reg);
-            emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, packed_consts,
-                 surf_index, payload);
-         } else {
-            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0]);
-            fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                                         packed_consts,
-                                         surf_index,
-                                         const_offset_reg));
-            pull->base_mrf = 14;
-            pull->mlen = 1;
-         }
+         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
+         emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                      packed_consts, surf_index, const_offset_reg));
 
          packed_consts.smear = const_offset->value.u[0] % 16 / 4;
          for (int i = 0; i < ir->type->vector_elements; i++) {