diff options
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 90 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.h | 2 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 11 |
3 files changed, 103 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index f04fb59f69f..7cc88eade45 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2555,6 +2555,94 @@ fs_visitor::opt_algebraic()
    return progress;
 }

+/**
+ * Fold the final render target write into the texturing SEND that precedes it.
+ *
+ * CHV and GEN9+ can mark a texturing SEND with EOT to send its results straight
+ * to the framebuffer, bypassing the EU.  Returns true if the IR was changed.
+ * NOTE(review): only instruction adjacency is checked here, not that the
+ * FB_WRITE payload is really the texture result -- confirm that is guaranteed.
+ */
+bool
+fs_visitor::opt_sampler_eot()
+{
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; /* assumes a WM key */
+
+   if (brw->gen < 9 && !brw->is_cherryview)
+      return false;
+
+   /* FINISHME: It should be possible to implement this optimization when there
+    * are multiple drawbuffers.
+    */
+   if (key->nr_color_regions != 1)
+      return false;
+
+   /* The last instruction of the last block must be the EOT FB_WRITE. */
+   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+   assert(fb_write->eot);
+   assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
+
+   fs_inst *tex_inst = (fs_inst *) fb_write->prev;
+
+   /* No previous instruction, or it isn't a texturing one; nothing to do. */
+   if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
+      return false;
+
+   /* Without a header we must rewrite the tex payload's LOAD_PAYLOAD, which is
+    * very likely the previous instruction.  Bail (header or not) if it isn't.
+    */
+   fs_inst *load_payload = (fs_inst *) tex_inst->prev;
+   if (load_payload->is_head_sentinel() ||
+       load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+      return false;
+
+   assert(!tex_inst->eot); /* We can't get here twice */
+   assert((tex_inst->offset & (0xff << 24)) == 0); /* top byte must be free */
+
+   tex_inst->offset |= fb_write->target << 24; /* stash target in top byte */
+   tex_inst->eot = true;
+   fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
+
+   /* If a header is present, marking the eot is sufficient. Otherwise, we need
+    * to create a new LOAD_PAYLOAD command with the same sources and a space
+    * saved for the header. Using a new destination register not only makes sure
+    * we have enough space, but it will make sure the dead code eliminator kills
+    * the instruction that this will replace.
+    */
+   if (tex_inst->header_present)
+      return true;
+
+   fs_reg send_header = vgrf(load_payload->sources + 1);
+   fs_reg *new_sources =
+      ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
+
+   new_sources[0] = fs_reg(); /* slot reserved for the header */
+   for (int i = 0; i < load_payload->sources; i++)
+      new_sources[i+1] = load_payload->src[i];
+
+   /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
+    * requires a lot of information about the sources to appropriately figure
+    * out the number of registers needed to be used. Given this stage in our
+    * optimization, we may not have the appropriate GRFs required by
+    * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
+    * manually emit the instruction.
+    */
+   fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
+                                                    load_payload->exec_size,
+                                                    send_header,
+                                                    new_sources,
+                                                    load_payload->sources + 1);
+
+   new_load_payload->regs_written = load_payload->regs_written + 1;
+   tex_inst->mlen++; /* account for the added header */
+   tex_inst->header_present = true;
+   tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
+   tex_inst->src[0] = send_header; /* read the new, header-first payload */
+   tex_inst->dst = reg_null_ud;
+
+   return true;
+}
+
 bool
 fs_visitor::opt_register_renaming()
 {
@@ -3761,6 +3849,8 @@ fs_visitor::optimize()

    pass_num = 0;

+   OPT(opt_sampler_eot);
+
    if (OPT(lower_load_payload)) {
       split_virtual_grfs();
       OPT(register_coalesce);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index cfdbf555d62..32063f01b8c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -231,6 +231,8 @@ public:
    bool compute_to_mrf();
    bool dead_code_eliminate();
    bool remove_duplicate_mrf_writes();
+
+   bool opt_sampler_eot();
    bool virtual_grf_interferes(int a, int b);
    void schedule_instructions(instruction_scheduler_mode mode);
    void insert_gen4_send_dependency_workarounds();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 7c000206d09..b06a947519c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -517,6 +517,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
    int rlen = 4;
    uint32_t simd_mode;
    uint32_t return_format;
+   bool is_combined_send = inst->eot; /* EOT on a tex: see opt_sampler_eot() */

    switch (dst.type) {
    case BRW_REGISTER_TYPE_D:
@@ -688,6 +689,11 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
       dst = vec16(dst);
    }

+   if (is_combined_send) {
+      assert(brw->gen >= 9 || brw->is_cherryview);
+      rlen = 0; /* presumably the response length; TODO confirm */
+   }
+
    assert(brw->gen < 7 || !inst->header_present ||
           src.file == BRW_GENERAL_REGISTER_FILE);

@@ -793,6 +799,11 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
        * so has already done marking.
        */
    }
+
+   if (is_combined_send) {
+      brw_inst_set_eot(brw, brw_last_inst, true);
+      brw_inst_set_opcode(brw, brw_last_inst, BRW_OPCODE_SENDC);
+   }
 }


|