From 11674dad8acef294bc920e7f02ef45185420fbce Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Mon, 18 Dec 2017 15:22:04 -0800 Subject: intel/fs: Optimize and simplify the copy propagation dataflow logic. Previously the dataflow propagation algorithm would calculate the ACP live-in and -out sets in a two-pass fixed-point algorithm. The first pass would update the live-out sets of all basic blocks of the program based on their live-in sets, while the second pass would update the live-in sets based on the live-out sets. This is incredibly inefficient in the typical case where the CFG of the program is approximately acyclic, because it can take up to 2*n passes for an ACP entry introduced at the top of the program to reach the bottom (where n is the number of basic blocks in the program), until which point the algorithm won't be able to reach a fixed point. The same effect can be achieved in a single pass by computing the live-in and -out sets in lock-step, because that makes sure that processing of any basic block will pick up the updated live-out sets of the lexically preceding blocks. This gives the dataflow propagation algorithm effectively O(n) run-time instead of O(n^2) in the acyclic case. The time spent in dataflow propagation is reduced by 30x in the GLES31.functional.ssbo.layout.random.all_shared_buffer.5 dEQP test-case on my CHV system (the improvement is likely to be of the same order of magnitude on other platforms). This more than reverses an apparent run-time regression in this test-case from my previous copy-propagation undefined-value handling patch, which was ultimately caused by the additional work introduced in that commit to account for undefined values being multiplied by a huge quadratic factor. According to Chad this test was failing on CHV due to a 30s time-out imposed by the Android CTS (this was the case regardless of my undefined-value handling patch, even though my patch substantially exacerbated the issue). On my CHV system this patch reduces the overall run-time of the test by approximately 12x, getting us to around 13s, well below the time-out. v2: Initialize live-out set to the universal set to avoid rather pessimistic dataflow estimation in shaders with cycles (Addresses performance regression reported by Eero in GpuTest Piano). Performance numbers given above still apply. No shader-db changes with respect to master. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104271 Reported-by: Chad Versace Reviewed-by: Ian Romanick --- src/intel/compiler/brw_fs_copy_propagation.cpp | 35 ++++++++------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'src/intel') diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp index af5635eacef..92cc0a8de58 100644 --- a/src/intel/compiler/brw_fs_copy_propagation.cpp +++ b/src/intel/compiler/brw_fs_copy_propagation.cpp @@ -186,8 +186,7 @@ fs_copy_prop_dataflow::setup_initial_values() /* Populate the initial values for the livein and liveout sets. For the * block at the start of the program, livein = 0 and liveout = copy. - * For the others, set liveout to 0 (the empty set) and livein to ~0 - * (the universal set). + * For the others, set liveout and livein to ~0 (the universal set). */ foreach_block (block, cfg) { if (block->parents.is_empty()) { @@ -197,7 +196,7 @@ fs_copy_prop_dataflow::setup_initial_values() } } else { for (int i = 0; i < bitset_words; i++) { - bd[block->num].liveout[i] = 0u; + bd[block->num].liveout[i] = ~0u; bd[block->num].livein[i] = ~0u; } } @@ -228,34 +227,17 @@ fs_copy_prop_dataflow::run() do { progress = false; - /* Update liveout for all blocks. */ foreach_block (block, cfg) { if (block->parents.is_empty()) continue; for (int i = 0; i < bitset_words; i++) { const BITSET_WORD old_liveout = bd[block->num].liveout[i]; - - bd[block->num].liveout[i] = - bd[block->num].copy[i] | (bd[block->num].livein[i] & - ~bd[block->num].kill[i]); - - if (old_liveout != bd[block->num].liveout[i]) - progress = true; - } - } - - /* Update livein for all blocks. If a copy is live out of all parent - * blocks, it's live coming in to this block. - */ - foreach_block (block, cfg) { - if (block->parents.is_empty()) - continue; - - for (int i = 0; i < bitset_words; i++) { - const BITSET_WORD old_livein = bd[block->num].livein[i]; BITSET_WORD livein_from_any_block = 0; + /* Update livein for this block. If a copy is live out of all + * parent blocks, it's live coming in to this block. + */ bd[block->num].livein[i] = ~0u; foreach_list_typed(bblock_link, parent_link, link, &block->parents) { bblock_t *parent = parent_link->block; @@ -278,7 +260,12 @@ fs_copy_prop_dataflow::run() */ bd[block->num].livein[i] &= livein_from_any_block; - if (old_livein != bd[block->num].livein[i]) + /* Update liveout for this block. */ + bd[block->num].liveout[i] = + bd[block->num].copy[i] | (bd[block->num].livein[i] & + ~bd[block->num].kill[i]); + + if (old_liveout != bd[block->num].liveout[i]) progress = true; } } -- cgit v1.2.3