summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorIan Romanick <[email protected]>2015-01-23 17:31:12 -0800
committerIan Romanick <[email protected]>2015-03-19 10:21:08 -0700
commitb616164c95cb495ce43f6b61dc805ed911a85e89 (patch)
treea8c91e6551b6d01b0f714d69a8d400b01111999a /src
parent036e347f3c129bb547137aed955e75062fca09b8 (diff)
i965/fs: Emit better b2f of an expression on GEN4 and GEN5
On platforms that do not natively generate 0u and ~0u for Boolean results, b2f expressions that look like f = b2f(expr cmp 0) will generate better code by pretending the expression is f = ir_triop_sel(0.0, 1.0, expr cmp 0) This is because the last instruction of "expr" can generate the condition code for the "cmp 0". This avoids having to do the "-(b & 1)" trick to generate 0u or ~0u for the Boolean result. This means code like mov(16) g16<1>F 1F mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F (+f0) sel(16) m6<1>F g16<8,8,1>F 0F will be generated instead of mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F and(16) g4<1>D g2<8,8,1>D 1D and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD v2: When the comparison is either == 0.0 or != 0.0 use the knowledge that the true (or false) case already results in zero would allow better code generation by possibly avoiding a load-immediate instruction. v3: Apply the optimization even when neither comparitor is zero. Shader-db results: GM45 (0x2A42): total instructions in shared programs: 3551002 -> 3550829 (-0.00%) instructions in affected programs: 33269 -> 33096 (-0.52%) helped: 121 Iron Lake (0x0046): total instructions in shared programs: 4993327 -> 4993146 (-0.00%) instructions in affected programs: 34199 -> 34018 (-0.53%) helped: 129 No change on other platforms. Signed-off-by: Ian Romanick <[email protected]> Reviewed-by: Tapani Palli <[email protected]>
Diffstat (limited to 'src')
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.h2
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_visitor.cpp101
2 files changed, 99 insertions, 4 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 77165297d0a..23e71353992 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -307,6 +307,7 @@ public:
const fs_reg &a);
void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
const fs_reg &src0, const fs_reg &src1);
+ bool try_emit_b2f_of_comparison(ir_expression *ir);
bool try_emit_saturate(ir_expression *ir);
bool try_emit_line(ir_expression *ir);
bool try_emit_mad(ir_expression *ir);
@@ -317,6 +318,7 @@ public:
bool opt_saturate_propagation();
bool opt_cmod_propagation();
void emit_bool_to_cond_code(ir_rvalue *condition);
+ void emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3]);
void emit_if_gen6(ir_if *ir);
void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
uint32_t spill_offset, int count);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 6d56115a443..0d5252ab08e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -475,6 +475,87 @@ fs_visitor::try_emit_mad(ir_expression *ir)
return true;
}
+bool
+fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
+{
+ /* On platforms that do not natively generate 0u and ~0u for Boolean
+ * results, b2f expressions that look like
+ *
+ * f = b2f(expr cmp 0)
+ *
+ * will generate better code by pretending the expression is
+ *
+ * f = ir_triop_csel(0.0, 1.0, expr cmp 0)
+ *
+ * This is because the last instruction of "expr" can generate the
+ * condition code for the "cmp 0". This avoids having to do the "-(b & 1)"
+ * trick to generate 0u or ~0u for the Boolean result. This means code like
+ *
+ * mov(16) g16<1>F 1F
+ * mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F
+ * (+f0) sel(16) m6<1>F g16<8,8,1>F 0F
+ *
+ * will be generated instead of
+ *
+ * mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F
+ * cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F
+ * and(16) g4<1>D g2<8,8,1>D 1D
+ * and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD
+ *
+ * When the comparison is either == 0.0 or != 0.0 using the knowledge that
+ * the true (or false) case already results in zero would allow better code
+ * generation by possibly avoiding a load-immediate instruction.
+ */
+ ir_expression *cmp = ir->operands[0]->as_expression();
+ if (cmp == NULL)
+ return false;
+
+ if (cmp->operation == ir_binop_equal || cmp->operation == ir_binop_nequal) {
+ for (unsigned i = 0; i < 2; i++) {
+ ir_constant *c = cmp->operands[i]->as_constant();
+ if (c == NULL || !c->is_zero())
+ continue;
+
+ ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
+ if (expr != NULL) {
+ fs_reg op[2];
+
+ for (unsigned j = 0; j < 2; j++) {
+ cmp->operands[j]->accept(this);
+ op[j] = this->result;
+
+ resolve_ud_negate(&op[j]);
+ }
+
+ emit_bool_to_cond_code_of_reg(cmp, op);
+
+ /* In this case we know when the condition is true, op[i ^ 1]
+ * contains zero. Invert the predicate, use op[i ^ 1] as src0,
+ * and immediate 1.0f as src1.
+ */
+ this->result = vgrf(ir->type);
+ op[i ^ 1].type = BRW_REGISTER_TYPE_F;
+
+ fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+ inst->predicate_inverse = cmp->operation == ir_binop_equal;
+ return true;
+ }
+ }
+ }
+
+ emit_bool_to_cond_code(cmp);
+
+ fs_reg temp = vgrf(ir->type);
+ emit(MOV(temp, fs_reg(1.0f)));
+
+ this->result = vgrf(ir->type);
+ fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
+ inst->predicate = BRW_PREDICATE_NORMAL;
+
+ return true;
+}
+
static int
pack_pixel_offset(float x)
{
@@ -639,6 +720,11 @@ fs_visitor::visit(ir_expression *ir)
inst->predicate = BRW_PREDICATE_NORMAL;
return;
+ case ir_unop_b2f:
+ if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir))
+ return;
+ break;
+
case ir_unop_interpolate_at_centroid:
case ir_binop_interpolate_at_offset:
case ir_binop_interpolate_at_sample:
@@ -2508,7 +2594,6 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
}
fs_reg op[3];
- fs_inst *inst;
assert(expr->get_num_operands() <= 3);
for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
@@ -2520,6 +2605,14 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
resolve_ud_negate(&op[i]);
}
+ emit_bool_to_cond_code_of_reg(expr, op);
+}
+
+void
+fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
+{
+ fs_inst *inst;
+
switch (expr->operation) {
case ir_unop_logic_not:
inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
@@ -2528,7 +2621,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
case ir_binop_logic_xor:
if (brw->gen <= 5) {
- fs_reg temp = vgrf(ir->type);
+ fs_reg temp = vgrf(expr->type);
emit(XOR(temp, op[0], op[1]));
inst = emit(AND(reg_null_d, temp, fs_reg(1)));
} else {
@@ -2539,7 +2632,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
case ir_binop_logic_or:
if (brw->gen <= 5) {
- fs_reg temp = vgrf(ir->type);
+ fs_reg temp = vgrf(expr->type);
emit(OR(temp, op[0], op[1]));
inst = emit(AND(reg_null_d, temp, fs_reg(1)));
} else {
@@ -2550,7 +2643,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
case ir_binop_logic_and:
if (brw->gen <= 5) {
- fs_reg temp = vgrf(ir->type);
+ fs_reg temp = vgrf(expr->type);
emit(AND(temp, op[0], op[1]));
inst = emit(AND(reg_null_d, temp, fs_reg(1)));
} else {