summaryrefslogtreecommitdiffstats
path: root/src/gallium
diff options
context:
space:
mode:
authorIlia Mirkin <[email protected]>2015-12-11 00:40:15 -0500
committerIlia Mirkin <[email protected]>2015-12-12 18:10:16 -0500
commitdbca0f3eba632125904ded6298a87fefdde66d76 (patch)
treea8e521d0aad34d3a4c87b4e6c5a8cc5815bd0db2 /src/gallium
parent3af83c4bc7863e007ce47f6eb3606a1c59d14719 (diff)
nv50/ir: manually optimize multiplication expansion logic
The conversion of 32-bit integer multiplies into 16-bit ones happens after the regular optimization loop. However it's fairly common to multiply by a small integer, rendering some of the expansion pointless. Firstly, propagate immediates when possible into mul ops, secondly just remove the ops when they are unnecessary. Including the change to generate imad immediates, the effect is: total instructions in shared programs : 6365463 -> 6351898 (-0.21%) total gprs used in shared programs : 728684 -> 728684 (0.00%) total local used in shared programs : 9904 -> 9904 (0.00%) total bytes used in shared programs : 44001576 -> 44036120 (0.08%) local gpr inst bytes helped 0 0 3288 4 hurt 0 0 0 842 It's easy for this to hurt bytes since we end up always generating the 8-byte form, while we can't always get rid of the immediate in question. Signed-off-by: Ilia Mirkin <[email protected]>
Diffstat (limited to 'src/gallium')
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp31
1 files changed, 25 insertions, 6 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 19965ff6940..8752b0c8c54 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -44,6 +44,8 @@ static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
+ ImmediateValue src1;
+ bool src1imm = mul->src(1).getImmediate(src1);
DataType fTy; // full type
switch (mul->sType) {
@@ -72,24 +74,41 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
for (int j = 0; j < 4; ++j)
t[j] = bld->getSSA(fullSize);
- s[0] = mul->getSrc(0);
- s[1] = mul->getSrc(1);
-
if (isSignedType(mul->sType) && highResult) {
s[0] = bld->getSSA(fullSize);
s[1] = bld->getSSA(fullSize);
bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
+ src1.reg.data.s32 = abs(src1.reg.data.s32);
+ } else {
+ s[0] = mul->getSrc(0);
+ s[1] = mul->getSrc(1);
}
// split sources into halves
i[0] = bld->mkSplit(a, halfSize, s[0]);
i[1] = bld->mkSplit(b, halfSize, s[1]);
- i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
- i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+ if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
+ i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
+ bld->mkImm(src1.reg.data.u32 & 0xffff));
+ } else {
+ i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
+ src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
+ if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
+ i[3] = i[2];
+ t[1] = t[0];
+ } else {
+ i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+ }
+ }
i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
- i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+ if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
+ i[4] = i[3];
+ t[3] = t[2];
+ } else {
+ i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+ }
if (highResult) {
Value *c[2];