diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/gallium/drivers/r600/r600_isa.h | 19 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_pipe.c | 1 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_pipe.h | 1 | ||||
-rw-r--r-- | src/gallium/drivers/r600/sb/sb_bc.h | 1 | ||||
-rw-r--r-- | src/gallium/drivers/r600/sb/sb_bc_parser.cpp | 2 | ||||
-rw-r--r-- | src/gallium/drivers/r600/sb/sb_context.cpp | 1 | ||||
-rw-r--r-- | src/gallium/drivers/r600/sb/sb_core.cpp | 1 | ||||
-rw-r--r-- | src/gallium/drivers/r600/sb/sb_expr.cpp | 448 | ||||
-rw-r--r-- | src/gallium/drivers/r600/sb/sb_expr.h | 4 | ||||
-rw-r--r-- | src/gallium/drivers/r600/sb/sb_shader.cpp | 2 | ||||
-rw-r--r-- | src/gallium/drivers/r600/sb/sb_shader.h | 2 |
11 files changed, 435 insertions, 47 deletions
diff --git a/src/gallium/drivers/r600/r600_isa.h b/src/gallium/drivers/r600/r600_isa.h index 8cccc9d0d70..c6bb869fd10 100644 --- a/src/gallium/drivers/r600/r600_isa.h +++ b/src/gallium/drivers/r600/r600_isa.h @@ -84,7 +84,8 @@ enum alu_op_flags * includes MULADDs (considering the MUL part on src0 and src1 only) */ AF_M_COMM = (1 << 23), - /* associative operation ((a op b) op c) == (a op (b op c)) */ + /* associative operation ((a op b) op c) == (a op (b op c)), + * includes MULADDs (considering the MUL part on src0 and src1 only) */ AF_M_ASSOC = (1 << 24), AF_PRED_PUSH = (1 << 25), @@ -373,11 +374,11 @@ static const struct alu_op_info alu_op_table[] = { {"SAD_ACCUM_HI_UINT", 3, { -1, 0x0F },{ 0, 0, AF_V, AF_V}, AF_UINT_DST }, {"MULADD_UINT24", 3, { -1, 0x10 },{ 0, 0, AF_V, AF_V}, AF_UINT_DST | AF_24 }, {"LDS_IDX_OP", 3, { -1, 0x11 },{ 0, 0, AF_V, AF_V}, 0 }, - {"MULADD", 3, { 0x10, 0x14 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_M_COMM }, - {"MULADD_M2", 3, { 0x11, 0x15 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_M_COMM }, - {"MULADD_M4", 3, { 0x12, 0x16 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_M_COMM }, - {"MULADD_D2", 3, { 0x13, 0x17 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_M_COMM }, - {"MULADD_IEEE", 3, { 0x14, 0x18 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_M_COMM | AF_IEEE }, + {"MULADD", 3, { 0x10, 0x14 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_M_COMM | AF_M_ASSOC }, + {"MULADD_M2", 3, { 0x11, 0x15 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_M_COMM | AF_M_ASSOC }, + {"MULADD_M4", 3, { 0x12, 0x16 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_M_COMM | AF_M_ASSOC }, + {"MULADD_D2", 3, { 0x13, 0x17 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_M_COMM | AF_M_ASSOC }, + {"MULADD_IEEE", 3, { 0x14, 0x18 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_M_COMM | AF_M_ASSOC | AF_IEEE }, {"CNDE", 3, { 0x18, 0x19 },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_CMOV | AF_CC_E }, {"CNDGT", 3, { 0x19, 0x1A },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_CMOV | AF_CC_GT }, {"CNDGE", 3, { 0x1A, 0x1B },{ AF_VS, AF_VS, AF_VS, AF_VS}, AF_CMOV | AF_CC_GE }, @@ -397,9 +398,9 @@ static const struct alu_op_info alu_op_table[] = { {"MUL_LIT_M2", 3, { 0x0D, -1 },{ AF_VS, AF_VS, 0, 0}, 0 }, {"MUL_LIT_M4", 3, { 0x0E, -1 },{ AF_VS, AF_VS, 0, 0}, 0 }, {"MUL_LIT_D2", 3, { 0x0F, -1 },{ AF_VS, AF_VS, 0, 0}, 0 }, - {"MULADD_IEEE_M2", 3, { 0x15, -1 },{ AF_VS, AF_VS, 0, 0}, AF_IEEE }, - {"MULADD_IEEE_M4", 3, { 0x16, -1 },{ AF_VS, AF_VS, 0, 0}, AF_IEEE }, - {"MULADD_IEEE_D2", 3, { 0x17, -1 },{ AF_VS, AF_VS, 0, 0}, AF_IEEE }, + {"MULADD_IEEE_M2", 3, { 0x15, -1 },{ AF_VS, AF_VS, 0, 0}, AF_M_COMM | AF_M_ASSOC | AF_IEEE }, + {"MULADD_IEEE_M4", 3, { 0x16, -1 },{ AF_VS, AF_VS, 0, 0}, AF_M_COMM | AF_M_ASSOC | AF_IEEE }, + {"MULADD_IEEE_D2", 3, { 0x17, -1 },{ AF_VS, AF_VS, 0, 0}, AF_M_COMM | AF_M_ASSOC | AF_IEEE }, {"LDS_ADD", 2, { -1, 0x0011 },{ 0, 0, AF_V, AF_V}, AF_LDS }, {"LDS_SUB", 2, { -1, 0x0111 },{ 0, 0, AF_V, AF_V}, AF_LDS }, diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index a4e88ce0d7a..49abf50487e 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -74,6 +74,7 @@ static const struct debug_named_value debug_options[] = { { "sbdump", DBG_SB_DUMP, "Print IR dumps after some optimization passes" }, { "sbnofallback", DBG_SB_NO_FALLBACK, "Abort on errors instead of fallback" }, { "sbdisasm", DBG_SB_DISASM, "Use sb disassembler for shader dumps" }, + { "sbsafemath", DBG_SB_SAFEMATH, "Disable unsafe math optimizations" }, DEBUG_NAMED_VALUE_END /* must be last */ }; diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 3fad311da2e..349a6cb6cfb 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -256,6 +256,7 @@ typedef boolean (*r600g_dma_blit_t)(struct pipe_context *ctx, #define DBG_SB_DUMP (1 << 25) #define DBG_SB_NO_FALLBACK (1 << 26) #define DBG_SB_DISASM (1 << 27) +#define DBG_SB_SAFEMATH (1 << 28) struct r600_tiling_info { unsigned num_channels; diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h index 25255a71d32..224301fc6a6 100644 --- a/src/gallium/drivers/r600/sb/sb_bc.h +++ b/src/gallium/drivers/r600/sb/sb_bc.h @@ -621,6 +621,7 @@ public: static unsigned dry_run; static unsigned no_fallback; + static unsigned safe_math; static unsigned dskip_start; static unsigned dskip_end; diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp index c2443b8ef6c..a7e712816b1 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp @@ -70,6 +70,8 @@ int bc_parser::decode() { } sh = new shader(ctx, t, bc->debug_id); + sh->safe_math = sb_context::safe_math || (t == TARGET_COMPUTE); + int r = decode_shader(); delete dec; diff --git a/src/gallium/drivers/r600/sb/sb_context.cpp b/src/gallium/drivers/r600/sb/sb_context.cpp index 9723a841c99..78b26d5cae8 100644 --- a/src/gallium/drivers/r600/sb/sb_context.cpp +++ b/src/gallium/drivers/r600/sb/sb_context.cpp @@ -34,6 +34,7 @@ unsigned sb_context::dump_pass = 0; unsigned sb_context::dump_stat = 0; unsigned sb_context::dry_run = 0; unsigned sb_context::no_fallback = 0; +unsigned sb_context::safe_math = 0; unsigned sb_context::dskip_start = 0; unsigned sb_context::dskip_end = 0; diff --git a/src/gallium/drivers/r600/sb/sb_core.cpp b/src/gallium/drivers/r600/sb/sb_core.cpp index 5b917ac6e75..d907508eb2a 100644 --- a/src/gallium/drivers/r600/sb/sb_core.cpp +++ b/src/gallium/drivers/r600/sb/sb_core.cpp @@ -63,6 +63,7 @@ sb_context *r600_sb_context_create(struct r600_context *rctx) { sb_context::dump_stat = df & DBG_SB_STAT; sb_context::dry_run = df & DBG_SB_DRY_RUN; sb_context::no_fallback = df & DBG_SB_NO_FALLBACK; + sb_context::safe_math = df & DBG_SB_SAFEMATH; sb_context::dskip_start = debug_get_num_option("R600_SB_DSKIP_START", 0); sb_context::dskip_end = debug_get_num_option("R600_SB_DSKIP_END", 0); diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp index 8337e77cd06..52c0c1796ba 100644 --- a/src/gallium/drivers/r600/sb/sb_expr.cpp +++ b/src/gallium/drivers/r600/sb/sb_expr.cpp @@ -388,6 +388,18 @@ bool expr_handler::fold_alu_op1(alu_node& n) { assert(v0 && n.dst[0]); if (!v0->is_const()) { + // handle (MOV -(MOV -x)) => (MOV x) + if (n.bc.op == ALU_OP1_MOV && n.bc.src[0].neg && !n.bc.src[1].abs + && v0->def && v0->def->is_alu_op(ALU_OP1_MOV)) { + alu_node *sd = static_cast<alu_node*>(v0->def); + if (!sd->bc.clamp && !sd->bc.omod && !sd->bc.src[0].abs && + sd->bc.src[0].neg) { + n.src[0] = sd->src[0]; + n.bc.src[0].neg = 0; + v0 = n.src[0]->gvalue(); + } + } + if ((n.bc.op == ALU_OP1_MOV || n.bc.op == ALU_OP1_MOVA_INT || n.bc.op == ALU_OP1_MOVA_GPR_INT) && n.bc.clamp == 0 && n.bc.omod == 0 @@ -452,6 +464,260 @@ bool expr_handler::fold_alu_op1(alu_node& n) { return true; } +bool expr_handler::fold_mul_add(alu_node *n) { + + bool ieee; + value* v0 = n->src[0]->gvalue(); + + alu_node *d0 = (v0->def && v0->def->is_alu_inst()) ? + static_cast<alu_node*>(v0->def) : NULL; + + if (d0) { + if (d0->is_alu_op(ALU_OP2_MUL_IEEE)) + ieee = true; + else if (d0->is_alu_op(ALU_OP2_MUL)) + ieee = false; + else + return false; + + if (!d0->bc.src[0].abs && !d0->bc.src[1].abs && + !n->bc.src[1].abs && !n->bc.src[0].abs && !d0->bc.omod && + !d0->bc.clamp && !n->bc.omod && + (!d0->src[0]->is_kcache() || !d0->src[1]->is_kcache() || + !n->src[1]->is_kcache())) { + + bool mul_neg = n->bc.src[0].neg; + + n->src.resize(3); + n->bc.set_op(ieee ? ALU_OP3_MULADD_IEEE : ALU_OP3_MULADD); + n->src[2] = n->src[1]; + n->bc.src[2] = n->bc.src[1]; + n->src[0] = d0->src[0]; + n->bc.src[0] = d0->bc.src[0]; + n->src[1] = d0->src[1]; + n->bc.src[1] = d0->bc.src[1]; + + n->bc.src[0].neg ^= mul_neg; + + fold_alu_op3(*n); + return true; + } + } + + value* v1 = n->src[1]->gvalue(); + + alu_node *d1 = (v1->def && v1->def->is_alu_inst()) ? + static_cast<alu_node*>(v1->def) : NULL; + + if (d1) { + if (d1->is_alu_op(ALU_OP2_MUL_IEEE)) + ieee = true; + else if (d1->is_alu_op(ALU_OP2_MUL)) + ieee = false; + else + return false; + + if (!d1->bc.src[1].abs && !d1->bc.src[0].abs && + !n->bc.src[0].abs && !n->bc.src[1].abs && !d1->bc.omod && + !d1->bc.clamp && !n->bc.omod && + (!d1->src[0]->is_kcache() || !d1->src[1]->is_kcache() || + !n->src[0]->is_kcache())) { + + bool mul_neg = n->bc.src[1].neg; + + n->src.resize(3); + n->bc.set_op(ieee ? ALU_OP3_MULADD_IEEE : ALU_OP3_MULADD); + n->src[2] = n->src[0]; + n->bc.src[2] = n->bc.src[0]; + n->src[1] = d1->src[1]; + n->bc.src[1] = d1->bc.src[1]; + n->src[0] = d1->src[0]; + n->bc.src[0] = d1->bc.src[0]; + + n->bc.src[1].neg ^= mul_neg; + + fold_alu_op3(*n); + return true; + } + } + + return false; +} + +bool expr_handler::eval_const_op(unsigned op, literal &r, + literal cv0, literal cv1) { + + switch (op) { + case ALU_OP2_ADD: r = cv0.f + cv1.f; break; + case ALU_OP2_ADDC_UINT: + r = (uint32_t)(((uint64_t)cv0.u + cv1.u)>>32); break; + case ALU_OP2_ADD_INT: r = cv0.i + cv1.i; break; + case ALU_OP2_AND_INT: r = cv0.i & cv1.i; break; + case ALU_OP2_ASHR_INT: r = cv0.i >> (cv1.i & 0x1F); break; + case ALU_OP2_BFM_INT: + r = (((1 << (cv0.i & 0x1F)) - 1) << (cv1.i & 0x1F)); break; + case ALU_OP2_LSHL_INT: r = cv0.i << cv1.i; break; + case ALU_OP2_LSHR_INT: r = cv0.u >> cv1.u; break; + case ALU_OP2_MAX: + case ALU_OP2_MAX_DX10: r = cv0.f > cv1.f ? cv0.f : cv1.f; break; + case ALU_OP2_MAX_INT: r = cv0.i > cv1.i ? cv0.i : cv1.i; break; + case ALU_OP2_MAX_UINT: r = cv0.u > cv1.u ? cv0.u : cv1.u; break; + case ALU_OP2_MIN: + case ALU_OP2_MIN_DX10: r = cv0.f < cv1.f ? cv0.f : cv1.f; break; + case ALU_OP2_MIN_INT: r = cv0.i < cv1.i ? cv0.i : cv1.i; break; + case ALU_OP2_MIN_UINT: r = cv0.u < cv1.u ? cv0.u : cv1.u; break; + case ALU_OP2_MUL: + case ALU_OP2_MUL_IEEE: r = cv0.f * cv1.f; break; + case ALU_OP2_MULHI_INT: + r = (int32_t)(((int64_t)cv0.u * cv1.u)>>32); break; + case ALU_OP2_MULHI_UINT: + r = (uint32_t)(((uint64_t)cv0.u * cv1.u)>>32); break; + case ALU_OP2_MULLO_INT: + r = (int32_t)(((int64_t)cv0.u * cv1.u) & 0xFFFFFFFF); break; + case ALU_OP2_MULLO_UINT: + r = (uint32_t)(((uint64_t)cv0.u * cv1.u) & 0xFFFFFFFF); break; + case ALU_OP2_OR_INT: r = cv0.i | cv1.i; break; + case ALU_OP2_SUB_INT: r = cv0.i - cv1.i; break; + case ALU_OP2_XOR_INT: r = cv0.i ^ cv1.i; break; + + default: + return false; + } + + return true; +} + +// fold the chain of associative ops, e.g. (ADD 2, (ADD x, 3)) => (ADD x, 5) +bool expr_handler::fold_assoc(alu_node *n) { + + alu_node *a = n; + literal cr; + + int last_arg = -3; + + unsigned op = n->bc.op; + bool allow_neg = false, cur_neg = false; + + switch(op) { + case ALU_OP2_ADD: + case ALU_OP2_MUL: + case ALU_OP2_MUL_IEEE: + allow_neg = true; + break; + case ALU_OP3_MULADD: + allow_neg = true; + op = ALU_OP2_MUL; + break; + case ALU_OP3_MULADD_IEEE: + allow_neg = true; + op = ALU_OP2_MUL_IEEE; + break; + default: + if (n->bc.op_ptr->src_count != 2) + return false; + } + + // check if we can evaluate the op + if (!eval_const_op(op, cr, literal(0), literal(0))) + return false; + + while (true) { + + value *v0 = a->src[0]->gvalue(); + value *v1 = a->src[1]->gvalue(); + + last_arg = -2; + + if (v1->is_const()) { + literal arg = v1->get_const_value(); + apply_alu_src_mod(a->bc, 1, arg); + if (cur_neg) + arg.f = -arg.f; + + if (a == n) + cr = arg; + else + eval_const_op(op, cr, cr, arg); + + if (v0->def) { + alu_node *d0 = static_cast<alu_node*>(v0->def); + if ((d0->is_alu_op(op) || + (op == ALU_OP2_MUL_IEEE && + d0->is_alu_op(ALU_OP2_MUL))) && + !d0->bc.omod && !d0->bc.clamp && + (!a->bc.src[0].neg || allow_neg)) { + cur_neg ^= a->bc.src[0].neg; + a = d0; + continue; + } + } + last_arg = 0; + + } + + if (v0->is_const()) { + literal arg = v0->get_const_value(); + apply_alu_src_mod(a->bc, 0, arg); + if (cur_neg) + arg.f = -arg.f; + + if (last_arg == 0) { + eval_const_op(op, cr, cr, arg); + last_arg = -1; + break; + } + + if (a == n) + cr = arg; + else + eval_const_op(op, cr, cr, arg); + + if (v1->def) { + alu_node *d1 = static_cast<alu_node*>(v1->def); + if ((d1->is_alu_op(op) || + (op == ALU_OP2_MUL_IEEE && + d1->is_alu_op(ALU_OP2_MUL))) && + !d1->bc.omod && !d1->bc.clamp && + (!a->bc.src[1].neg || allow_neg)) { + cur_neg ^= a->bc.src[1].neg; + a = d1; + continue; + } + } + + last_arg = 1; + } + + break; + }; + + if (last_arg == -1) { + // result is const + apply_alu_dst_mod(n->bc, cr); + + if (n->bc.op == op) { + convert_to_mov(*n, sh.get_const_value(cr)); + fold_alu_op1(*n); + return true; + } else { // MULADD => ADD + n->src[0] = n->src[2]; + n->bc.src[0] = n->bc.src[2]; + n->src[1] = sh.get_const_value(cr); + memset(&n->bc.src[1], 0, sizeof(bc_alu_src)); + + n->src.resize(2); + n->bc.set_op(ALU_OP2_ADD); + } + } else if (last_arg >= 0) { + n->src[0] = a->src[last_arg]; + n->bc.src[0] = a->bc.src[last_arg]; + n->bc.src[0].neg ^= cur_neg; + n->src[1] = sh.get_const_value(cr); + memset(&n->bc.src[1], 0, sizeof(bc_alu_src)); + } + + return false; +} bool expr_handler::fold_alu_op2(alu_node& n) { @@ -464,11 +730,53 @@ bool expr_handler::fold_alu_op2(alu_node& n) { return fold_setcc(n); } + if (!sh.safe_math && (flags & AF_M_ASSOC)) { + if (fold_assoc(&n)) + return true; + } + value* v0 = n.src[0]->gvalue(); value* v1 = n.src[1]->gvalue(); assert(v0 && v1); + // handle some operations with equal args, e.g. x + x => x * 2 + if (v0 == v1) { + if (n.bc.src[0].neg == n.bc.src[1].neg && + n.bc.src[0].abs == n.bc.src[1].abs) { + switch (n.bc.op) { + case ALU_OP2_MIN: // (MIN x, x) => (MOV x) + case ALU_OP2_MAX: + convert_to_mov(n, v0, n.bc.src[0].neg, n.bc.src[0].abs); + return fold_alu_op1(n); + case ALU_OP2_ADD: // (ADD x, x) => (MUL x, 2) + if (!sh.safe_math) { + n.src[1] = sh.get_const_value(2.0f); + memset(&n.bc.src[1], 0, sizeof(bc_alu_src)); + n.bc.set_op(ALU_OP2_MUL); + return fold_alu_op2(n); + } + break; + } + } + if (n.bc.src[0].neg != n.bc.src[1].neg && + n.bc.src[0].abs == n.bc.src[1].abs) { + switch (n.bc.op) { + case ALU_OP2_ADD: // (ADD x, -x) => (MOV 0) + if (!sh.safe_math) { + convert_to_mov(n, sh.get_const_value(literal(0))); + return fold_alu_op1(n); + } + break; + } + } + } + + if (n.bc.op == ALU_OP2_ADD) { + if (fold_mul_add(&n)) + return true; + } + bool isc0 = v0->is_const(); bool isc1 = v1->is_const(); @@ -488,42 +796,9 @@ bool expr_handler::fold_alu_op2(alu_node& n) { } if (isc0 && isc1) { - switch (n.bc.op) { - case ALU_OP2_ADD: dv = cv0.f + cv1.f; break; - case ALU_OP2_ADDC_UINT: - dv = (uint32_t)(((uint64_t)cv0.u + cv1.u)>>32); break; - case ALU_OP2_ADD_INT: dv = cv0.i + cv1.i; break; - case ALU_OP2_AND_INT: dv = cv0.i & cv1.i; break; - case ALU_OP2_ASHR_INT: dv = cv0.i >> (cv1.i & 0x1F); break; - case ALU_OP2_BFM_INT: - dv = (((1 << (cv0.i & 0x1F)) - 1) << (cv1.i & 0x1F)); break; - case ALU_OP2_LSHL_INT: dv = cv0.i << cv1.i; break; - case ALU_OP2_LSHR_INT: dv = cv0.u >> cv1.u; break; - case ALU_OP2_MAX: - case ALU_OP2_MAX_DX10: dv = cv0.f > cv1.f ? cv0.f : cv1.f; break; - case ALU_OP2_MAX_INT: dv = cv0.i > cv1.i ? cv0.i : cv1.i; break; - case ALU_OP2_MAX_UINT: dv = cv0.u > cv1.u ? cv0.u : cv1.u; break; - case ALU_OP2_MIN: - case ALU_OP2_MIN_DX10: dv = cv0.f < cv1.f ? cv0.f : cv1.f; break; - case ALU_OP2_MIN_INT: dv = cv0.i < cv1.i ? cv0.i : cv1.i; break; - case ALU_OP2_MIN_UINT: dv = cv0.u < cv1.u ? cv0.u : cv1.u; break; - case ALU_OP2_MUL: - case ALU_OP2_MUL_IEEE: dv = cv0.f * cv1.f; break; - case ALU_OP2_MULHI_INT: - dv = (int32_t)(((int64_t)cv0.u * cv1.u)>>32); break; - case ALU_OP2_MULHI_UINT: - dv = (uint32_t)(((uint64_t)cv0.u * cv1.u)>>32); break; - case ALU_OP2_MULLO_INT: - dv = (int32_t)(((int64_t)cv0.u * cv1.u) & 0xFFFFFFFF); break; - case ALU_OP2_MULLO_UINT: - dv = (uint32_t)(((uint64_t)cv0.u * cv1.u) & 0xFFFFFFFF); break; - case ALU_OP2_OR_INT: dv = cv0.i | cv1.i; break; - case ALU_OP2_SUB_INT: dv = cv0.i - cv1.i; break; - case ALU_OP2_XOR_INT: dv = cv0.i ^ cv1.i; break; - default: + if (!eval_const_op(n.bc.op, dv, cv0, cv1)) return false; - } } else { // one source is const @@ -649,6 +924,11 @@ bool expr_handler::fold_alu_op3(alu_node& n) { if (n.src.size() < 3) return false; + if (!sh.safe_math && (n.bc.op_ptr->flags & AF_M_ASSOC)) { + if (fold_assoc(&n)) + return true; + } + value* v0 = n.src[0]->gvalue(); value* v1 = n.src[1]->gvalue(); value* v2 = n.src[2]->gvalue(); @@ -698,6 +978,77 @@ bool expr_handler::fold_alu_op3(alu_node& n) { } } + // handle (MULADD a, x, MUL (x, b)) => (MUL x, ADD (a, b)) + if (!sh.safe_math && (n.bc.op == ALU_OP3_MULADD || + n.bc.op == ALU_OP3_MULADD_IEEE)) { + + unsigned op = n.bc.op == ALU_OP3_MULADD_IEEE ? + ALU_OP2_MUL_IEEE : ALU_OP2_MUL; + + if (!isc2 && v2->def && v2->def->is_alu_op(op)) { + + alu_node *md = static_cast<alu_node*>(v2->def); + value *mv0 = md->src[0]->gvalue(); + value *mv1 = md->src[1]->gvalue(); + + int es0 = -1, es1; + + if (v0 == mv0) { + es0 = 0; + es1 = 0; + } else if (v0 == mv1) { + es0 = 0; + es1 = 1; + } else if (v1 == mv0) { + es0 = 1; + es1 = 0; + } else if (v1 == mv1) { + es0 = 1; + es1 = 1; + } + + if (es0 != -1) { + value *va0 = es0 == 0 ? v1 : v0; + value *va1 = es1 == 0 ? mv1 : mv0; + + alu_node *add = sh.create_alu(); + add->bc.set_op(ALU_OP2_ADD); + + add->dst.resize(1); + add->src.resize(2); + + value *t = sh.create_temp_value(); + t->def = add; + add->dst[0] = t; + add->src[0] = va0; + add->src[1] = va1; + add->bc.src[0] = n.bc.src[!es0]; + add->bc.src[1] = md->bc.src[!es1]; + + add->bc.src[1].neg ^= n.bc.src[2].neg ^ + (n.bc.src[es0].neg != md->bc.src[es1].neg); + + n.insert_before(add); + vt.add_value(t); + + t = t->gvalue(); + + if (es0 == 1) { + n.src[0] = n.src[1]; + n.bc.src[0] = n.bc.src[1]; + } + + n.src[1] = t; + memset(&n.bc.src[1], 0, sizeof(bc_alu_src)); + + n.src.resize(2); + + n.bc.set_op(op); + return fold_alu_op2(n); + } + } + } + if (!isc0 && !isc1 && !isc2) return false; @@ -727,13 +1078,36 @@ bool expr_handler::fold_alu_op3(alu_node& n) { } } - if ((isc0 && cv0 == literal(0)) || (isc1 && cv1 == literal(0))) { - switch (n.bc.op) { - case ALU_OP3_MULADD: + if (n.bc.op == ALU_OP3_MULADD) { + if ((isc0 && cv0 == literal(0)) || (isc1 && cv1 == literal(0))) { convert_to_mov(n, n.src[2], n.bc.src[2].neg, n.bc.src[2].abs); return fold_alu_op1(n); } } + + if (n.bc.op == ALU_OP3_MULADD || n.bc.op == ALU_OP3_MULADD_IEEE) { + unsigned op = n.bc.op == ALU_OP3_MULADD_IEEE ? + ALU_OP2_MUL_IEEE : ALU_OP2_MUL; + + if (isc1 && v0 == v2) { + cv1.f += (n.bc.src[2].neg != n.bc.src[0].neg ? -1.0f : 1.0f); + n.src[1] = sh.get_const_value(cv1); + n.bc.src[1].neg = 0; + n.bc.src[1].abs = 0; + n.bc.set_op(op); + n.src.resize(2); + return fold_alu_op2(n); + } else if (isc0 && v1 == v2) { + cv0.f += (n.bc.src[2].neg != n.bc.src[1].neg ? -1.0f : 1.0f); + n.src[0] = sh.get_const_value(cv0); + n.bc.src[0].neg = 0; + n.bc.src[0].abs = 0; + n.bc.set_op(op); + n.src.resize(2); + return fold_alu_op2(n); + } + } + return false; } diff --git a/src/gallium/drivers/r600/sb/sb_expr.h b/src/gallium/drivers/r600/sb/sb_expr.h index 1b77c8e0657..f3c706802a4 100644 --- a/src/gallium/drivers/r600/sb/sb_expr.h +++ b/src/gallium/drivers/r600/sb/sb_expr.h @@ -77,6 +77,10 @@ public: bool fold_alu_op2(alu_node &n); bool fold_alu_op3(alu_node &n); + bool fold_mul_add(alu_node *n); + bool eval_const_op(unsigned op, literal &r, literal cv0, literal cv1); + bool fold_assoc(alu_node *n); + static void apply_alu_src_mod(const bc_alu &bc, unsigned src, literal &v); static void apply_alu_dst_mod(const bc_alu &bc, literal &v); diff --git a/src/gallium/drivers/r600/sb/sb_shader.cpp b/src/gallium/drivers/r600/sb/sb_shader.cpp index 4f5ce11fcdb..32760ec056f 100644 --- a/src/gallium/drivers/r600/sb/sb_shader.cpp +++ b/src/gallium/drivers/r600/sb/sb_shader.cpp @@ -39,7 +39,7 @@ shader::shader(sb_context &sctx, shader_target t, unsigned id) coal(*this), bbs(), target(t), vt(ex), ex(*this), root(), compute_interferences(), - has_alu_predication(), uses_gradients(), ngpr(), nstack() {} + has_alu_predication(), uses_gradients(), safe_math(), ngpr(), nstack() {} bool shader::assign_slot(alu_node* n, alu_node *slots[5]) { diff --git a/src/gallium/drivers/r600/sb/sb_shader.h b/src/gallium/drivers/r600/sb/sb_shader.h index 5362e395e97..ca9d29eb808 100644 --- a/src/gallium/drivers/r600/sb/sb_shader.h +++ b/src/gallium/drivers/r600/sb/sb_shader.h @@ -293,6 +293,8 @@ public: bool has_alu_predication; bool uses_gradients; + bool safe_math; + unsigned ngpr, nstack; shader(sb_context &sctx, shader_target t, unsigned id); |