diff options
author | Ilia Mirkin <[email protected]> | 2016-01-17 22:28:19 -0500 |
---|---|---|
committer | Ilia Mirkin <[email protected]> | 2016-01-18 17:52:07 -0500 |
commit | a31819cff8f4560786d731f5f1de6ba814368a2f (patch) | |
tree | 0fc7baadc78048c714a2a35f7e74b2b8b68eadbe /src/gallium/drivers/nouveau | |
parent | af686e7de3cc7cc053959729e9e31432740489fa (diff) |
nv50/ir: swap the least-ref'd source into src1 when both const/imm
The whole point of inlining sources is to reduce loads. We can end up in
a situation where one value is used a lot of times, and one value is
used only once per instruction. The once-per-instruction one is the one
that should get inlined, but with the previous algorithm, it was given
no preference.
This flips things around to preferring putting less-referenced values
into src1 which increases the likelihood of them being inlined.
While we're at it, adjust the heuristic to not treat 0 as an immediate,
as well as (effectively) check for situations where LIMMs can't be
loaded. All this yields improvements on nvc0:
total instructions in shared programs : 6261157 -> 6255985 (-0.08%)
total gprs used in shared programs : 945082 -> 943417 (-0.18%)
total local used in shared programs : 30372 -> 30288 (-0.28%)
total bytes used in shared programs : 50089256 -> 50047880 (-0.08%)
local gpr inst bytes
helped 21 822 3332 3332
hurt 0 278 565 565
And more importantly avoids generating really bad code with SSBOs, where
we end up checking a lot of different values (usually immediates) against
the length.
On nv50 we get comparable results, and even improve packing (bytes went
down more than instructions):
total instructions in shared programs : 6346564 -> 6341277 (-0.08%)
total gprs used in shared programs : 728719 -> 725131 (-0.49%)
total local used in shared programs : 3552 -> 3552 (0.00%)
total bytes used in shared programs : 43995688 -> 43932928 (-0.14%)
local gpr inst bytes
helped 0 1380 3252 3774
hurt 0 287 1710 1365
Signed-off-by: Ilia Mirkin <[email protected]>
Diffstat (limited to 'src/gallium/drivers/nouveau')
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp | 25 |
1 files changed, 15 insertions, 10 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index f5c590eef10..75773553840 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -171,7 +171,10 @@ LoadPropagation::isImmdLoad(Instruction *ld) if (!ld || (ld->op != OP_MOV) || ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8))) return false; - return ld->src(0).getFile() == FILE_IMMEDIATE; + + // A 0 can be replaced with a register, so it doesn't count as an immediate. + ImmediateValue val; + return ld->src(0).getImmediate(val) && !val.isInteger(0); } bool @@ -187,7 +190,8 @@ LoadPropagation::isAttribOrSharedLoad(Instruction *ld) void LoadPropagation::checkSwapSrc01(Instruction *insn) { - if (!prog->getTarget()->getOpInfo(insn).commutative) + const Target *targ = prog->getTarget(); + if (!targ->getOpInfo(insn).commutative) if (insn->op != OP_SET && insn->op != OP_SLCT) return; if (insn->src(1).getFile() != FILE_GPR) @@ -196,14 +200,15 @@ LoadPropagation::checkSwapSrc01(Instruction *insn) Instruction *i0 = insn->getSrc(0)->getInsn(); Instruction *i1 = insn->getSrc(1)->getInsn(); - if (isCSpaceLoad(i0)) { - if (!isCSpaceLoad(i1)) - insn->swapSources(0, 1); - else - return; - } else - if (isImmdLoad(i0)) { - if (!isCSpaceLoad(i1) && !isImmdLoad(i1)) + // Swap sources to inline the less frequently used source. That way, + // optimistically, it will eventually be able to remove the instruction. + int i0refs = insn->getSrc(0)->refCount(); + int i1refs = insn->getSrc(1)->refCount(); + + if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) { + if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) || + !targ->insnCanLoad(insn, 1, i1) || + i0refs < i1refs) insn->swapSources(0, 1); else return; |