diff options
author | Eric Anholt <[email protected]> | 2012-10-30 11:09:59 -0700 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2012-11-08 14:50:32 -0800 |
commit | 177c82555b24a80c15c34315ff17437cc39d1ba5 (patch) | |
tree | 39ecf2ef08bbe0725b9a7089a58f81c80f82bcbf /src/mesa/drivers | |
parent | 9785ae0973cc206afc36dbc7d5b9553f92d06b47 (diff) |
i965/fs: Add support for global copy propagation.
It is common for complicated shaders, particularly code-generated ones, to
have a big array of uniforms or attributes, and a prologue in the shader that
dereferences from the big array to more informatively-named local variables.
Then there will be some small control flow operation (like a ? : statement),
and then use of those informatively-named variables. We were emitting extra
MOVs in these cases, because copy propagation couldn't reach across control
flow.
Instead, implement dataflow analysis on the output of the first copy
propagation pass and re-run it to propagate those extra MOVs out.
On one future Steam release, reduces VS+FS instruction count from 42837 to
41437. No statistically significant performance difference (n=48), though, at
least at the low resolution I'm running it at.
shader-db results:
total instructions in shared programs: 722170 -> 702545 (-2.72%)
instructions in affected programs: 260618 -> 240993 (-7.53%)
Some shaders do get hurt by up to 2 instructions, because a choice to copy
propagate instead of coalesce or something like that results in a dead write
sticking around. Given that we already have instances of those instructions
in the affected programs (particularly unigine), we should just improve dead
code elimination to fix the problem.
Diffstat (limited to 'src/mesa/drivers')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.h | 3 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp | 216 |
2 files changed, 199 insertions, 20 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 13662bb8836..48451767c32 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -249,7 +249,8 @@ public: bool opt_copy_propagate(); bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry); bool try_constant_propagate(fs_inst *inst, acp_entry *entry); - bool opt_copy_propagate_local(void *mem_ctx, bblock_t *block); + bool opt_copy_propagate_local(void *mem_ctx, bblock_t *block, + exec_list *acp); bool register_coalesce(); bool register_coalesce_2(); bool compute_to_mrf(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index cc488a1ec70..dec3dcaef5b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -21,6 +21,19 @@ * IN THE SOFTWARE. */ +/** @file brw_fs_copy_propagation.cpp + * + * Support for global copy propagation in two passes: A local pass that does + * intra-block copy (and constant) propagation, and a global pass that uses + * dataflow analysis on the copies available at the end of each block to re-do + * local copy propagation with more copies available. + * + * See Muchnik's Advanced Compiler Design and Implementation, section + * 12.5 (p356). + */ + +#define ACP_HASH_SIZE 16 + #include "brw_fs.h" #include "brw_cfg.h" @@ -29,6 +42,158 @@ struct acp_entry : public exec_node { fs_reg dst; fs_reg src; }; + +struct block_data { + /** + * Which entries in the fs_copy_prop_dataflow acp table are live at the + * start of this block. This is the useful output of the analysis, since + * it lets us plug those into the local copy propagation on the second + * pass. + */ + bool *livein; + + /** + * Which entries in the fs_copy_prop_dataflow acp table are live at the end + * of this block. This is done in initial setup from the per-block acps + * returned by the first local copy prop pass. + */ + bool *liveout; + + /** + * Which entries in the fs_copy_prop_dataflow acp table are killed over the + * course of this block. + */ + bool *kill; +}; + +class fs_copy_prop_dataflow +{ +public: + fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, + exec_list out_acp[][ACP_HASH_SIZE]); + + void setup_kills(); + void run(); + + void *mem_ctx; + cfg_t *cfg; + + acp_entry **acp; + int num_acp; + + struct block_data *bd; +}; +} /* anonymous namespace */ + +fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, + exec_list out_acp[][ACP_HASH_SIZE]) + : mem_ctx(mem_ctx), cfg(cfg) +{ + bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks); + + num_acp = 0; + for (int b = 0; b < cfg->num_blocks; b++) { + for (int i = 0; i < ACP_HASH_SIZE; i++) { + foreach_list(entry_node, &out_acp[b][i]) { + num_acp++; + } + } + } + + acp = rzalloc_array(mem_ctx, struct acp_entry *, num_acp); + + int next_acp = 0; + for (int b = 0; b < cfg->num_blocks; b++) { + bd[b].livein = rzalloc_array(bd, bool, num_acp); + bd[b].liveout = rzalloc_array(bd, bool, num_acp); + bd[b].kill = rzalloc_array(bd, bool, num_acp); + + for (int i = 0; i < ACP_HASH_SIZE; i++) { + foreach_list(entry_node, &out_acp[b][i]) { + acp_entry *entry = (acp_entry *)entry_node; + + acp[next_acp] = entry; + bd[b].liveout[next_acp] = true; + next_acp++; + } + } + } + + assert(next_acp == num_acp); + + setup_kills(); + run(); +} + +/** + * Walk the set of instructions in the block, marking which entries in the acp + * are killed by the block. + */ +void +fs_copy_prop_dataflow::setup_kills() +{ + for (int b = 0; b < cfg->num_blocks; b++) { + bblock_t *block = cfg->blocks[b]; + + for (fs_inst *inst = (fs_inst *)block->start; + inst != block->end->next; + inst = (fs_inst *)inst->next) { + if (inst->dst.file != GRF) + continue; + + for (int i = 0; i < num_acp; i++) { + if (inst->overwrites_reg(acp[i]->dst) || + inst->overwrites_reg(acp[i]->src)) { + bd[b].kill[i] = true; + } + } + } + } +} + +/** + * Walk the set of instructions in the block, marking which entries in the acp + * are killed by the block. + */ +void +fs_copy_prop_dataflow::run() +{ + bool cont = true; + + while (cont) { + cont = false; + + for (int b = 0; b < cfg->num_blocks; b++) { + for (int i = 0; i < num_acp; i++) { + if (!bd[b].liveout[i]) { + /* Update liveout */ + if (bd[b].livein[i] && !bd[b].kill[i]) { + bd[b].liveout[i] = true; + cont = true; + } + } + + if (!bd[b].livein[i]) { + /* Update livein: if it's live at the end of all parents, it's + * live at our start. + */ + bool add = true; + foreach_list(block_node, &cfg->blocks[b]->parents) { + bblock_link *link = (bblock_link *)block_node; + bblock_t *block = link->block; + if (!bd[block->block_num].liveout[i]) { + add = false; + break; + } + } + if (add) { + bd[b].livein[i] = true; + cont = true; + } + } + } + } + } } bool @@ -183,25 +348,14 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) return progress; } - -/** @file brw_fs_copy_propagation.cpp - * - * Support for local copy propagation by walking the list of instructions - * and maintaining the ACP table of available copies for propagation. - * - * See Muchnik's Advanced Compiler Design and Implementation, section - * 12.5 (p356). - */ - /* Walks a basic block and does copy propagation on it using the acp * list. */ bool -fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block) +fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block, + exec_list *acp) { bool progress = false; - int acp_count = 16; - exec_list acp[acp_count]; for (fs_inst *inst = (fs_inst *)block->start; inst != block->end->next; @@ -212,7 +366,7 @@ fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block) if (inst->src[i].file != GRF) continue; - foreach_list(entry_node, &acp[inst->src[i].reg % acp_count]) { + foreach_list(entry_node, &acp[inst->src[i].reg % ACP_HASH_SIZE]) { acp_entry *entry = (acp_entry *)entry_node; if (try_constant_propagate(inst, entry)) @@ -225,7 +379,7 @@ fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block) /* kill the destination from the ACP */ if (inst->dst.file == GRF) { - foreach_list_safe(entry_node, &acp[inst->dst.reg % acp_count]) { + foreach_list_safe(entry_node, &acp[inst->dst.reg % ACP_HASH_SIZE]) { acp_entry *entry = (acp_entry *)entry_node; if (inst->overwrites_reg(entry->dst)) { @@ -236,7 +390,7 @@ fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block) /* Oops, we only have the chaining hash based on the destination, not * the source, so walk across the entire table. */ - for (int i = 0; i < acp_count; i++) { + for (int i = 0; i < ACP_HASH_SIZE; i++) { foreach_list_safe(entry_node, &acp[i]) { acp_entry *entry = (acp_entry *)entry_node; if (inst->overwrites_reg(entry->src)) @@ -263,7 +417,7 @@ fs_visitor::opt_copy_propagate_local(void *mem_ctx, bblock_t *block) acp_entry *entry = ralloc(mem_ctx, acp_entry); entry->dst = inst->dst; entry->src = inst->src[0]; - acp[entry->dst.reg % acp_count].push_tail(entry); + acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry); } } @@ -275,13 +429,37 @@ fs_visitor::opt_copy_propagate() { bool progress = false; void *mem_ctx = ralloc_context(this->mem_ctx); - cfg_t cfg(this); + exec_list out_acp[cfg.num_blocks][ACP_HASH_SIZE]; + /* First, walk through each block doing local copy propagation and getting + * the set of copies available at the end of the block. + */ for (int b = 0; b < cfg.num_blocks; b++) { bblock_t *block = cfg.blocks[b]; - progress = opt_copy_propagate_local(mem_ctx, block) || progress; + progress = opt_copy_propagate_local(mem_ctx, block, + out_acp[b]) || progress; + } + + /* Do dataflow analysis for those available copies. */ + fs_copy_prop_dataflow dataflow(mem_ctx, &cfg, out_acp); + + /* Next, re-run local copy propagation, this time with the set of copies + * provided by the dataflow analysis available at the start of a block. + */ + for (int b = 0; b < cfg.num_blocks; b++) { + bblock_t *block = cfg.blocks[b]; + exec_list in_acp[ACP_HASH_SIZE]; + + for (int i = 0; i < dataflow.num_acp; i++) { + if (dataflow.bd[b].livein[i]) { + struct acp_entry *entry = dataflow.acp[i]; + in_acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry); + } + } + + progress = opt_copy_propagate_local(mem_ctx, block, in_acp) || progress; } ralloc_free(mem_ctx); |