aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIan Romanick <[email protected]>2018-05-22 18:19:16 -0700
committerIan Romanick <[email protected]>2019-03-28 15:35:53 -0700
commit2cf59861a8128a91bfdd6fe62bf69cb4593373e3 (patch)
tree33daec3329c0ccb028e108649ffbdc6dc5881879
parentc6ee46a7532291fc8583400e174e77b1833daf23 (diff)
nir: Add partial redundancy elimination for compares
This pass attempts to dectect code sequences like if (x < y) { z = y - x; ... } and replace them with sequences like t = x - y; if (t < 0) { z = -t; ... } On architectures where the subtract can generate the flags used by the if-statement, this saves an instruction. It's also possible that moving an instruction out of the if-statement will allow nir_opt_peephole_select to convert the whole thing to a bcsel. Currently only floating point compares and adds are supported. Adding support for integer will be a challenge due to integer overflow. There are a couple possible solutions, but they may not apply to all architectures. v2: Fix a typo in the commit message and a couple typos in comments. Fix possible NULL pointer deref from result of push_block(). Add missing (-A + B) case. Suggested by Caio. v3: Fix is_not_const_zero to work correctly with types other than nir_type_float32. Suggested by Ken. v4: Add some comments explaining how this works. Suggested by Ken. Reviewed-by: Kenneth Graunke <[email protected]>
-rw-r--r--src/compiler/Makefile.sources1
-rw-r--r--src/compiler/nir/meson.build1
-rw-r--r--src/compiler/nir/nir.h2
-rw-r--r--src/compiler/nir/nir_opt_comparison_pre.c383
-rw-r--r--src/compiler/nir/nir_search_helpers.h27
5 files changed, 414 insertions, 0 deletions
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index e542d86a37a..5fddb6d4db2 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -278,6 +278,7 @@ NIR_FILES = \
nir/nir_move_vec_src_uses_to_dest.c \
nir/nir_normalize_cubemap_coords.c \
nir/nir_opt_combine_stores.c \
+ nir/nir_opt_comparison_pre.c \
nir/nir_opt_conditional_discard.c \
nir/nir_opt_constant_folding.c \
nir/nir_opt_copy_prop_vars.c \
diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index 0900f56648e..c65f2ff62ff 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -160,6 +160,7 @@ files_libnir = files(
'nir_move_vec_src_uses_to_dest.c',
'nir_normalize_cubemap_coords.c',
'nir_opt_combine_stores.c',
+ 'nir_opt_comparison_pre.c',
'nir_opt_conditional_discard.c',
'nir_opt_constant_folding.c',
'nir_opt_copy_prop_vars.c',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 3ddf97bb12c..806e47dd7bb 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3401,6 +3401,8 @@ bool nir_lower_phis_to_regs_block(nir_block *block);
bool nir_lower_ssa_defs_to_regs_block(nir_block *block);
bool nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl);
+bool nir_opt_comparison_pre(nir_shader *shader);
+
bool nir_opt_algebraic(nir_shader *shader);
bool nir_opt_algebraic_before_ffma(nir_shader *shader);
bool nir_opt_algebraic_late(nir_shader *shader);
diff --git a/src/compiler/nir/nir_opt_comparison_pre.c b/src/compiler/nir/nir_opt_comparison_pre.c
new file mode 100644
index 00000000000..ab31a2bf554
--- /dev/null
+++ b/src/compiler/nir/nir_opt_comparison_pre.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir_instr_set.h"
+#include "nir_search_helpers.h"
+#include "nir_builder.h"
+#include "util/u_vector.h"
+
+/* Partial redundancy elimination of compares
+ *
+ * Seaches for comparisons of the form 'a cmp b' that dominate arithmetic
+ * instructions like 'b - a'. The comparison is replaced by the arithmetic
+ * instruction, and the result is compared with zero. For example,
+ *
+ * vec1 32 ssa_111 = flt 0.37, ssa_110.w
+ * if ssa_111 {
+ * block block_1:
+ * vec1 32 ssa_112 = fadd ssa_110.w, -0.37
+ * ...
+ *
+ * becomes
+ *
+ * vec1 32 ssa_111 = fadd ssa_110.w, -0.37
+ * vec1 32 ssa_112 = flt 0.0, ssa_111
+ * if ssa_112 {
+ * block block_1:
+ * ...
+ */
+
+struct block_queue {
+ /**
+ * Stack of blocks from the current location in the CFG to the entry point
+ * of the function.
+ *
+ * This is sort of a poor man's dominator tree.
+ */
+ struct exec_list blocks;
+
+ /** List of freed block_instructions structures that can be reused. */
+ struct exec_list reusable_blocks;
+};
+
+struct block_instructions {
+ struct exec_node node;
+
+ /**
+ * Set of comparison instructions from the block that are candidates for
+ * being replaced by add instructions.
+ */
+ struct u_vector instructions;
+};
+
+static void
+block_queue_init(struct block_queue *bq)
+{
+ exec_list_make_empty(&bq->blocks);
+ exec_list_make_empty(&bq->reusable_blocks);
+}
+
+static void
+block_queue_finish(struct block_queue *bq)
+{
+ struct block_instructions *n;
+
+ while ((n = (struct block_instructions *) exec_list_pop_head(&bq->blocks)) != NULL) {
+ u_vector_finish(&n->instructions);
+ free(n);
+ }
+
+ while ((n = (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks)) != NULL) {
+ free(n);
+ }
+}
+
+static struct block_instructions *
+push_block(struct block_queue *bq)
+{
+ struct block_instructions *bi =
+ (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks);
+
+ if (bi == NULL) {
+ bi = calloc(1, sizeof(struct block_instructions));
+
+ if (bi == NULL)
+ return NULL;
+ }
+
+ if (!u_vector_init(&bi->instructions,
+ sizeof(struct nir_alu_instr *),
+ 8 * sizeof(struct nir_alu_instr *)))
+ return NULL;
+
+ exec_list_push_tail(&bq->blocks, &bi->node);
+
+ return bi;
+}
+
+static void
+pop_block(struct block_queue *bq, struct block_instructions *bi)
+{
+ u_vector_finish(&bi->instructions);
+ exec_node_remove(&bi->node);
+ exec_list_push_head(&bq->reusable_blocks, &bi->node);
+}
+
+static void
+add_instruction_for_block(struct block_instructions *bi,
+ struct nir_alu_instr *alu)
+{
+ struct nir_alu_instr **data =
+ u_vector_add(&bi->instructions);
+
+ *data = alu;
+}
+
+static void
+rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp,
+ nir_alu_instr *orig_add, bool zero_on_left)
+{
+ void *const mem_ctx = ralloc_parent(orig_cmp);
+
+ bld->cursor = nir_before_instr(&orig_cmp->instr);
+
+ /* This is somewhat tricky. The compare instruction may be something like
+ * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
+ * b). This is problematic because the SSA value for the fneg(a) may not
+ * exist yet at the compare instruction.
+ *
+ * We fabricate the operands of the new add. This is done using
+ * information provided by zero_on_left. If zero_on_left is true, we know
+ * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)). If the
+ * original compare instruction was (fcmp, a, b), x = b and y = -a. If
+ * zero_on_left is false, the resulting compare instruction is (fcmp,
+ * (fadd, x, y), 0.0) and x = a and y = -b.
+ */
+ nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0);
+ nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1);
+
+ nir_ssa_def *const fadd = zero_on_left
+ ? nir_fadd(bld, b, nir_fneg(bld, a))
+ : nir_fadd(bld, a, nir_fneg(bld, b));
+
+ nir_ssa_def *const zero =
+ nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size);
+
+ nir_ssa_def *const cmp = zero_on_left
+ ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL)
+ : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL);
+
+ /* Generating extra moves of the results is the easy way to make sure the
+ * writemasks match the original instructions. Later optimization passes
+ * will clean these up. This is similar to nir_replace_instr (in
+ * nir_search.c).
+ */
+ nir_alu_instr *mov_add = nir_alu_instr_create(mem_ctx, nir_op_imov);
+ mov_add->dest.write_mask = orig_add->dest.write_mask;
+ nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest,
+ orig_add->dest.dest.ssa.num_components,
+ orig_add->dest.dest.ssa.bit_size, NULL);
+ mov_add->src[0].src = nir_src_for_ssa(fadd);
+
+ nir_builder_instr_insert(bld, &mov_add->instr);
+
+ nir_alu_instr *mov_cmp = nir_alu_instr_create(mem_ctx, nir_op_imov);
+ mov_cmp->dest.write_mask = orig_cmp->dest.write_mask;
+ nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest,
+ orig_cmp->dest.dest.ssa.num_components,
+ orig_cmp->dest.dest.ssa.bit_size, NULL);
+ mov_cmp->src[0].src = nir_src_for_ssa(cmp);
+
+ nir_builder_instr_insert(bld, &mov_cmp->instr);
+
+ nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa,
+ nir_src_for_ssa(&mov_cmp->dest.dest.ssa));
+ nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa,
+ nir_src_for_ssa(&mov_add->dest.dest.ssa));
+
+ /* We know these have no more uses because we just rewrote them all, so we
+ * can remove them.
+ */
+ nir_instr_remove(&orig_cmp->instr);
+ nir_instr_remove(&orig_add->instr);
+}
+
+static bool
+comparison_pre_block(nir_block *block, struct block_queue *bq, nir_builder *bld)
+{
+ bool progress = false;
+
+ struct block_instructions *bi = push_block(bq);
+ if (bi == NULL)
+ return false;
+
+ /* Starting with the current block, examine each instruction. If the
+ * instruction is a comparison that matches the '±a cmp ±b' pattern, add it
+ * to the block_instructions::instructions set. If the instruction is an
+ * add instruction, walk up the block queue looking at the stored
+ * instructions. If a matching comparison is found, move the addition and
+ * replace the comparison with a different comparison based on the result
+ * of the addition. All of the blocks in the queue are guaranteed to be
+ * dominators of the current block.
+ *
+ * After processing the current block, recurse into the blocks dominated by
+ * the current block.
+ */
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_alu)
+ continue;
+
+ struct nir_alu_instr *const alu = nir_instr_as_alu(instr);
+
+ if (alu->dest.dest.ssa.num_components != 1)
+ continue;
+
+ if (alu->dest.saturate)
+ continue;
+
+ static const uint8_t swizzle[4] = { 0, 0, 0, 0 };
+
+ switch (alu->op) {
+ case nir_op_fadd: {
+ /* If the instruction is fadd, check it against comparison
+ * instructions that dominate it.
+ */
+ struct block_instructions *b =
+ (struct block_instructions *) exec_list_get_head_raw(&bq->blocks);
+
+ while (b->node.next != NULL) {
+ nir_alu_instr **a;
+ bool rewrote_compare = false;
+
+ u_vector_foreach(a, &b->instructions) {
+ nir_alu_instr *const cmp = *a;
+
+ if (cmp == NULL)
+ continue;
+
+ /* The operands of both instructions are, with some liberty,
+ * commutative. Check all four permutations. The third and
+ * fourth permutations are negations of the first two.
+ */
+ if ((nir_alu_srcs_equal(cmp, alu, 0, 0) &&
+ nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) ||
+ (nir_alu_srcs_equal(cmp, alu, 0, 1) &&
+ nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) {
+ /* These are the cases where (A cmp B) matches either (A +
+ * -B) or (-B + A)
+ *
+ * A cmp B <=> A + -B cmp 0
+ */
+ rewrite_compare_instruction(bld, cmp, alu, false);
+
+ *a = NULL;
+ rewrote_compare = true;
+ break;
+ } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) &&
+ nir_alu_srcs_negative_equal(cmp, alu, 0, 1)) ||
+ (nir_alu_srcs_equal(cmp, alu, 1, 1) &&
+ nir_alu_srcs_negative_equal(cmp, alu, 0, 0))) {
+ /* This is the case where (A cmp B) matches (B + -A) or (-A
+ * + B).
+ *
+ * A cmp B <=> 0 cmp B + -A
+ */
+ rewrite_compare_instruction(bld, cmp, alu, true);
+
+ *a = NULL;
+ rewrote_compare = true;
+ break;
+ }
+ }
+
+ /* Bail after a compare in the most dominating block is found.
+ * This is necessary because 'alu' has been removed from the
+ * instruction stream. Should there be a matching compare in
+ * another block, calling rewrite_compare_instruction again will
+ * try to operate on a node that is not in the list as if it were
+ * in the list.
+ *
+ * FINISHME: There may be opportunity for additional optimization
+ * here. I discovered this problem due to a shader in Guacamelee.
+ * It may be possible to rewrite the matching compares that are
+ * encountered later to reuse the result from the compare that was
+ * first rewritten. It's also possible that this is just taken
+ * care of by calling the optimization pass repeatedly.
+ */
+ if (rewrote_compare) {
+ progress = true;
+ break;
+ }
+
+ b = (struct block_instructions *) b->node.next;
+ }
+
+ break;
+ }
+
+ case nir_op_flt:
+ case nir_op_fge:
+ case nir_op_fne:
+ case nir_op_feq:
+ /* If the instruction is a comparison that is used by an if-statement
+ * and neither operand is immediate value 0, add it to the set.
+ */
+ if (is_used_by_if(alu) &&
+ is_not_const_zero(alu, 0, 1, swizzle) &&
+ is_not_const_zero(alu, 1, 1, swizzle))
+ add_instruction_for_block(bi, alu);
+
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ for (unsigned i = 0; i < block->num_dom_children; i++) {
+ nir_block *child = block->dom_children[i];
+
+ if (comparison_pre_block(child, bq, bld))
+ progress = true;
+ }
+
+ pop_block(bq, bi);
+
+ return progress;
+}
+
+static bool
+nir_opt_comparison_pre_impl(nir_function_impl *impl)
+{
+ struct block_queue bq;
+ nir_builder bld;
+
+ block_queue_init(&bq);
+ nir_builder_init(&bld, impl);
+
+ nir_metadata_require(impl, nir_metadata_dominance);
+
+ const bool progress =
+ comparison_pre_block(nir_start_block(impl), &bq, &bld);
+
+ block_queue_finish(&bq);
+
+ if (progress)
+ nir_metadata_preserve(impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+
+ return progress;
+}
+
+bool
+nir_opt_comparison_pre(nir_shader *shader)
+{
+ bool progress = false;
+
+ nir_foreach_function(function, shader) {
+ if (function->impl)
+ progress |= nir_opt_comparison_pre_impl(function->impl);
+ }
+
+ return progress;
+}
diff --git a/src/compiler/nir/nir_search_helpers.h b/src/compiler/nir/nir_search_helpers.h
index 456de81e175..1624508993d 100644
--- a/src/compiler/nir/nir_search_helpers.h
+++ b/src/compiler/nir/nir_search_helpers.h
@@ -110,6 +110,33 @@ is_zero_to_one(nir_alu_instr *instr, unsigned src, unsigned num_components,
}
static inline bool
+is_not_const_zero(nir_alu_instr *instr, unsigned src, unsigned num_components,
+ const uint8_t *swizzle)
+{
+ if (nir_src_as_const_value(instr->src[src].src) == NULL)
+ return true;
+
+ for (unsigned i = 0; i < num_components; i++) {
+ switch (nir_op_infos[instr->op].input_types[src]) {
+ case nir_type_float:
+ if (nir_src_comp_as_float(instr->src[src].src, swizzle[i]) == 0.0)
+ return false;
+ break;
+ case nir_type_bool:
+ case nir_type_int:
+ case nir_type_uint:
+ if (nir_src_comp_as_uint(instr->src[src].src, swizzle[i]) == 0)
+ return false;
+ break;
+ default:
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static inline bool
is_not_const(nir_alu_instr *instr, unsigned src, UNUSED unsigned num_components,
UNUSED const uint8_t *swizzle)
{