diff options
Diffstat (limited to 'src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp')
-rw-r--r-- | src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp | 2192 |
1 files changed, 2192 insertions, 0 deletions
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
new file mode 100644
index 00000000000..bd331ea8f03
--- /dev/null
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
@@ -0,0 +1,2192 @@

#include "nv50_ir.h"
#include "nv50_ir_target.h"
#include "nv50_ir_build_util.h"

extern "C" {
#include "util/u_math.h"
}

namespace nv50_ir {

// Returns true if this instruction has no effect and can be treated as a
// no-op: CONSTRAINT/PHI pseudo-ops, plain NOPs, instructions whose whole
// (representative) result is unallocated, and MOV/UNION where source and
// destination are already the same value.
bool
Instruction::isNop() const
{
   if (op == OP_CONSTRAINT || op == OP_PHI)
      return true;
   if (terminator || join) // XXX: should terminator imply flow ?
      return false;
   if (!fixed && op == OP_NOP)
      return true;

   // reg.data.id < 0 means the (representative) result was never assigned a
   // hw register, i.e. nothing reads it
   if (def[0].exists() && def[0].rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def[d].rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   // MOV/UNION where all sources coalesced into the destination
   if (op == OP_MOV || op == OP_UNION) {
      if (!def[0].rep()->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!def[0].rep()->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}

// Returns true if the instruction can be deleted outright: it has visible
// side effects (stores/exports/flow/fixed) ruled out and none of its results
// are referenced or register-allocated.
bool Instruction::isDead() const
{
   if (op == OP_STORE ||
       op == OP_EXPORT)
      return false;

   for (int d = 0; defExists(d); ++d)
      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
         return false;

   if (terminator || asFlow())
      return false;
   if (fixed)
      return false;

   return true;
};

// =============================================================================

class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// Propagate all MOVs forward to make subsequent optimization easier, except if
// the sources stem from a phi, in which case we don't want to mess up potential
// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
+bool +CopyPropagation::visit(BasicBlock *bb) +{ + Instruction *mov, *si, *next; + + for (mov = bb->getEntry(); mov; mov = next) { + next = mov->next; + if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue()) + continue; + si = mov->getSrc(0)->getInsn(); + if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) { + // propagate + mov->def[0].replace(mov->getSrc(0), false); + delete_Instruction(prog, mov); + } + } + return true; +} + +// ============================================================================= + +class LoadPropagation : public Pass +{ +private: + virtual bool visit(BasicBlock *); + + void checkSwapSrc01(Instruction *); + + bool isCSpaceLoad(Instruction *); + bool isImmd32Load(Instruction *); +}; + +bool +LoadPropagation::isCSpaceLoad(Instruction *ld) +{ + return ld && ld->op == OP_LOAD && ld->src[0].getFile() == FILE_MEMORY_CONST; +} + +bool +LoadPropagation::isImmd32Load(Instruction *ld) +{ + if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4)) + return false; + return ld->src[0].getFile() == FILE_IMMEDIATE; +} + +void +LoadPropagation::checkSwapSrc01(Instruction *insn) +{ + if (!prog->getTarget()->getOpInfo(insn).commutative) + if (insn->op != OP_SET && insn->op != OP_SLCT) + return; + if (insn->src[1].getFile() != FILE_GPR) + return; + + Instruction *i0 = insn->getSrc(0)->getInsn(); + Instruction *i1 = insn->getSrc(1)->getInsn(); + + if (isCSpaceLoad(i0)) { + if (!isCSpaceLoad(i1)) + insn->swapSources(0, 1); + else + return; + } else + if (isImmd32Load(i0)) { + if (!isCSpaceLoad(i1) && !isImmd32Load(i1)) + insn->swapSources(0, 1); + else + return; + } else { + return; + } + + if (insn->op == OP_SET) + insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond); + else + if (insn->op == OP_SLCT) + insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond); +} + +bool +LoadPropagation::visit(BasicBlock *bb) +{ + const Target *targ = prog->getTarget(); + Instruction *next; + + for (Instruction *i = 
bb->getEntry(); i; i = next) { + next = i->next; + + if (i->srcExists(1)) + checkSwapSrc01(i); + + for (int s = 0; i->srcExists(s); ++s) { + Instruction *ld = i->getSrc(s)->getInsn(); + + if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV)) + continue; + if (!targ->insnCanLoad(i, s, ld)) + continue; + + // propagate ! + i->setSrc(s, ld->getSrc(0)); + if (ld->src[0].isIndirect(0)) + i->setIndirect(s, 0, ld->getIndirect(0, 0)); + + if (ld->getDef(0)->refCount() == 0) + delete_Instruction(prog, ld); + } + } + return true; +} + +// ============================================================================= + +// Evaluate constant expressions. +class ConstantFolding : public Pass +{ +public: + bool foldAll(Program *); + +private: + virtual bool visit(BasicBlock *); + + void expr(Instruction *, ImmediateValue *, ImmediateValue *); + void opnd(Instruction *, ImmediateValue *, int s); + + void unary(Instruction *, const ImmediateValue&); + + // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET + CmpInstruction *findOriginForTestWithZero(Value *); + + unsigned int foldCount; + + BuildUtil bld; +}; + +// TODO: remember generated immediates and only revisit these +bool +ConstantFolding::foldAll(Program *prog) +{ + unsigned int iterCount = 0; + do { + foldCount = 0; + if (!run(prog)) + return false; + } while (foldCount && ++iterCount < 2); + return true; +} + +bool +ConstantFolding::visit(BasicBlock *bb) +{ + Instruction *i, *next; + + for (i = bb->getEntry(); i; i = next) { + next = i->next; + if (i->op == OP_MOV) // continue early, MOV appears frequently + continue; + + ImmediateValue *src0 = i->src[0].getImmediate(); + ImmediateValue *src1 = i->src[1].getImmediate(); + + if (src0 && src1) + expr(i, src0, src1); + else + if (src0) + opnd(i, src0, 0); + else + if (src1) + opnd(i, src1, 1); + } + return true; +} + +CmpInstruction * +ConstantFolding::findOriginForTestWithZero(Value *value) +{ + if (!value) + return NULL; + Instruction *insn = 
value->getInsn(); + + while (insn && insn->op != OP_SET) { + Instruction *next = NULL; + switch (insn->op) { + case OP_NEG: + case OP_ABS: + case OP_CVT: + next = insn->getSrc(0)->getInsn(); + if (insn->sType != next->dType) + return NULL; + break; + case OP_MOV: + next = insn->getSrc(0)->getInsn(); + break; + default: + return NULL; + } + insn = next; + } + return insn ? insn->asCmp() : NULL; +} + +void +Modifier::applyTo(ImmediateValue& imm) const +{ + switch (imm.reg.type) { + case TYPE_F32: + if (bits & NV50_IR_MOD_ABS) + imm.reg.data.f32 = fabsf(imm.reg.data.f32); + if (bits & NV50_IR_MOD_NEG) + imm.reg.data.f32 = -imm.reg.data.f32; + if (bits & NV50_IR_MOD_SAT) { + if (imm.reg.data.f32 < 0.0f) + imm.reg.data.f32 = 0.0f; + else + if (imm.reg.data.f32 > 1.0f) + imm.reg.data.f32 = 1.0f; + } + assert(!(bits & NV50_IR_MOD_NOT)); + break; + + case TYPE_S8: // NOTE: will be extended + case TYPE_S16: + case TYPE_S32: + case TYPE_U8: // NOTE: treated as signed + case TYPE_U16: + case TYPE_U32: + if (bits & NV50_IR_MOD_ABS) + imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ? 
+ imm.reg.data.s32 : -imm.reg.data.s32; + if (bits & NV50_IR_MOD_NEG) + imm.reg.data.s32 = -imm.reg.data.s32; + if (bits & NV50_IR_MOD_NOT) + imm.reg.data.s32 = ~imm.reg.data.s32; + break; + + case TYPE_F64: + if (bits & NV50_IR_MOD_ABS) + imm.reg.data.f64 = fabs(imm.reg.data.f64); + if (bits & NV50_IR_MOD_NEG) + imm.reg.data.f64 = -imm.reg.data.f64; + if (bits & NV50_IR_MOD_SAT) { + if (imm.reg.data.f64 < 0.0) + imm.reg.data.f64 = 0.0; + else + if (imm.reg.data.f64 > 1.0) + imm.reg.data.f64 = 1.0; + } + assert(!(bits & NV50_IR_MOD_NOT)); + break; + + default: + assert(!"invalid/unhandled type"); + imm.reg.data.u64 = 0; + break; + } +} + +operation +Modifier::getOp() const +{ + switch (bits) { + case NV50_IR_MOD_ABS: return OP_ABS; + case NV50_IR_MOD_NEG: return OP_NEG; + case NV50_IR_MOD_SAT: return OP_SAT; + case NV50_IR_MOD_NOT: return OP_NOT; + case 0: + return OP_MOV; + default: + return OP_CVT; + } +} + +void +ConstantFolding::expr(Instruction *i, + ImmediateValue *src0, ImmediateValue *src1) +{ + ImmediateValue imm0(src0, i->sType); + ImmediateValue imm1(src1, i->sType); + struct Storage res; + struct Storage *const a = &imm0.reg, *const b = &imm1.reg; + + i->src[0].mod.applyTo(imm0); + i->src[1].mod.applyTo(imm1); + + switch (i->op) { + case OP_MAD: + case OP_FMA: + case OP_MUL: + if (i->dnz && i->dType == TYPE_F32) { + if (!isfinite(a->data.f32)) + a->data.f32 = 0.0f; + if (!isfinite(b->data.f32)) + b->data.f32 = 0.0f; + } + switch (i->dType) { + case TYPE_F32: res.data.f32 = a->data.f32 * b->data.f32; break; + case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break; + case TYPE_S32: + case TYPE_U32: res.data.u32 = a->data.u32 * b->data.u32; break; + default: + return; + } + break; + case OP_DIV: + if (b->data.u32 == 0) + break; + switch (i->dType) { + case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break; + case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break; + case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break; + 
case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break; + default: + return; + } + break; + case OP_ADD: + switch (i->dType) { + case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break; + case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break; + case TYPE_S32: + case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break; + default: + return; + } + break; + case OP_POW: + switch (i->dType) { + case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break; + case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break; + default: + return; + } + break; + case OP_MAX: + switch (i->dType) { + case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break; + case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break; + case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break; + case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break; + default: + return; + } + break; + case OP_MIN: + switch (i->dType) { + case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break; + case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break; + case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break; + case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break; + default: + return; + } + break; + case OP_AND: + res.data.u64 = a->data.u64 & b->data.u64; + break; + case OP_OR: + res.data.u64 = a->data.u64 | b->data.u64; + break; + case OP_XOR: + res.data.u64 = a->data.u64 ^ b->data.u64; + break; + case OP_SHL: + res.data.u32 = a->data.u32 << b->data.u32; + break; + case OP_SHR: + switch (i->dType) { + case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break; + case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break; + default: + return; + } + break; + case OP_SLCT: + if (a->data.u32 != b->data.u32) + return; + res.data.u32 = a->data.u32; + break; + default: + return; + } + ++foldCount; + + i->src[0].mod = Modifier(0); + i->src[1].mod = Modifier(0); + + 
i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32)); + i->setSrc(1, NULL); + + i->getSrc(0)->reg.data = res.data; + + if (i->op == OP_MAD || i->op == OP_FMA) { + i->op = OP_ADD; + + i->setSrc(1, i->getSrc(0)); + i->setSrc(0, i->getSrc(2)); + i->setSrc(2, NULL); + + i->src[1].mod = i->src[2].mod; + + src0 = i->src[0].getImmediate(); + if (src0) + expr(i, src0, i->getSrc(1)->asImm()); + } else { + i->op = OP_MOV; + } +} + +void +ConstantFolding::unary(Instruction *i, const ImmediateValue &imm) +{ + Storage res; + + if (i->dType != TYPE_F32) + return; + switch (i->op) { + case OP_NEG: res.data.f32 = -imm.reg.data.f32; break; + case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break; + case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break; + case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break; + case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break; + case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break; + case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break; + case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break; + case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break; + case OP_PRESIN: + case OP_PREEX2: + // these should be handled in subsequent OP_SIN/COS/EX2 + res.data.f32 = imm.reg.data.f32; + break; + default: + return; + } + i->op = OP_MOV; + i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32)); + i->src[0].mod = Modifier(0); +} + +void +ConstantFolding::opnd(Instruction *i, ImmediateValue *src, int s) +{ + const int t = !s; + const operation op = i->op; + + ImmediateValue imm(src, i->sType); + + i->src[s].mod.applyTo(imm); + + switch (i->op) { + case OP_MUL: + if (i->dType == TYPE_F32 && i->getSrc(t)->refCount() == 1) { + Instruction *si = i->getSrc(t)->getUniqueInsn(); + + if (si && si->op == OP_MUL) { + float f = imm.reg.data.f32; + + if (si->src[1].getImmediate()) { + f *= si->src[1].getImmediate()->reg.data.f32; + si->setSrc(1, new_ImmediateValue(prog, f)); + 
i->def[0].replace(i->getSrc(t), false); + break; + } else { + int fac; + if (f == 0.125f) fac = -3; + else + if (f == 0.250f) fac = -2; + else + if (f == 0.500f) fac = -1; + else + if (f == 2.000f) fac = +1; + else + if (f == 4.000f) fac = +2; + else + if (f == 8.000f) fac = +3; + else + fac = 0; + if (fac) { + // FIXME: allowed & modifier + si->postFactor = fac; + i->def[0].replace(i->getSrc(t), false); + break; + } + } + } + } + if (imm.isInteger(0)) { + i->op = OP_MOV; + i->setSrc(0, i->getSrc(s)); + i->setSrc(1, NULL); + } else + if (imm.isInteger(1) || imm.isInteger(-1)) { + if (imm.isNegative()) + i->src[t].mod = i->src[t].mod ^ Modifier(NV50_IR_MOD_NEG); + i->op = i->src[t].mod.getOp(); + if (s == 0) { + i->setSrc(0, i->getSrc(1)); + i->src[0].mod = i->src[1].mod; + i->src[1].mod = 0; + } + if (i->op != OP_CVT) + i->src[0].mod = 0; + i->setSrc(1, NULL); + } else + if (imm.isInteger(2) || imm.isInteger(-2)) { + if (imm.isNegative()) + i->src[t].mod = i->src[t].mod ^ Modifier(NV50_IR_MOD_NEG); + i->op = OP_ADD; + i->setSrc(s, i->getSrc(t)); + i->src[s].mod = i->src[t].mod; + } else + if (!isFloatType(i->sType) && !imm.isNegative() && imm.isPow2()) { + i->op = OP_SHL; + imm.applyLog2(); + i->setSrc(1, new_ImmediateValue(prog, imm.reg.data.u32)); + } + break; + case OP_ADD: + if (imm.isInteger(0)) { + if (s == 0) { + i->setSrc(0, i->getSrc(1)); + i->src[0].mod = i->src[1].mod; + } + i->setSrc(1, NULL); + i->op = i->src[0].mod.getOp(); + if (i->op != OP_CVT) + i->src[0].mod = Modifier(0); + } + break; + + case OP_DIV: + if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32)) + break; + bld.setPosition(i, false); + if (imm.reg.data.u32 == 0) { + break; + } else + if (imm.reg.data.u32 == 1) { + i->op = OP_MOV; + i->setSrc(1, NULL); + } else + if (i->dType == TYPE_U32 && imm.isPow2()) { + i->op = OP_SHL; + i->setSrc(1, bld.mkImm(util_logbase2(imm.reg.data.u32))); + } else + if (i->dType == TYPE_U32) { + Instruction *mul; + Value *tA, *tB; + const uint32_t d = 
imm.reg.data.u32; + uint32_t m; + int r, s; + uint32_t l = util_logbase2(d); + if (((uint32_t)1 << l) < d) + ++l; + m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1; + r = l ? 1 : 0; + s = l ? (l - 1) : 0; + + tA = bld.getSSA(); + tB = bld.getSSA(); + mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0), + bld.loadImm(NULL, m)); + mul->subOp = NV50_IR_SUBOP_MUL_HIGH; + bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA); + tA = bld.getSSA(); + if (r) + bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r)); + else + tA = tB; + tB = s ? bld.getSSA() : i->getDef(0); + bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA); + if (s) + bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s)); + + delete_Instruction(prog, i); + } else + if (imm.reg.data.s32 == -1) { + i->op = OP_NEG; + i->setSrc(1, NULL); + } else { + LValue *tA, *tB; + LValue *tD; + const int32_t d = imm.reg.data.s32; + int32_t m; + int32_t l = util_logbase2(static_cast<unsigned>(abs(d))); + if ((1 << l) < abs(d)) + ++l; + if (!l) + l = 1; + m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32); + + tA = bld.getSSA(); + tB = bld.getSSA(); + bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m), + i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH; + if (l > 1) + bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1)); + else + tB = tA; + tA = bld.getSSA(); + bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, i->getSrc(0), bld.mkImm(0)); + tD = (d < 0) ? 
bld.getSSA() : i->getDef(0)->asLValue(); + bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA); + if (d < 0) + bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB); + + delete_Instruction(prog, i); + } + break; + + case OP_SET: // TODO: SET_AND,OR,XOR + { + CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t)); + CondCode cc, ccZ; + if (i->src[t].mod != Modifier(0)) + return; + if (imm.reg.data.u32 != 0 || !si || si->op != OP_SET) + return; + cc = si->setCond; + ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U); + if (s == 0) + ccZ = reverseCondCode(ccZ); + switch (ccZ) { + case CC_LT: cc = CC_FL; break; + case CC_GE: cc = CC_TR; break; + case CC_EQ: cc = inverseCondCode(cc); break; + case CC_LE: cc = inverseCondCode(cc); break; + case CC_GT: break; + case CC_NE: break; + default: + return; + } + i->asCmp()->setCond = cc; + i->setSrc(0, si->src[0]); + i->setSrc(1, si->src[1]); + i->sType = si->sType; + } + break; + + case OP_SHL: + { + if (s != 1 || i->src[0].mod != Modifier(0)) + break; + // try to concatenate shifts + Instruction *si = i->getSrc(0)->getInsn(); + if (!si || + si->op != OP_SHL || si->src[1].mod != Modifier(0)) + break; + ImmediateValue *siImm = si->src[1].getImmediate(); + if (siImm) { + bld.setPosition(i, false); + i->setSrc(0, si->getSrc(0)); + i->setSrc(1, bld.loadImm(NULL, + imm.reg.data.u32 + siImm->reg.data.u32)); + } + } + break; + + case OP_ABS: + case OP_NEG: + case OP_LG2: + case OP_RCP: + case OP_SQRT: + case OP_RSQ: + case OP_PRESIN: + case OP_SIN: + case OP_COS: + case OP_PREEX2: + case OP_EX2: + unary(i, imm); + break; + default: + return; + } + if (i->op != op) + foldCount++; +} + +// ============================================================================= + +// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed. 
class ModifierFolding : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (0 && i->op == OP_SUB) {
         // turn "sub" into "add neg" (do we really want this ?)
         i->op = OP_ADD;
         i->src[0].mod = i->src[0].mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      // try to fold each source's defining ABS/NEG/NOT into a src modifier
      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         mod = mod * mi->src[0].mod;

         if ((i->op == OP_ABS) || i->src[s].mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            assert(s == 0);
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            i->setSrc(s, mi->getSrc(0));
            i->src[s].mod = i->src[s].mod * mod;
         }
      }

      // fold a trailing SAT into the producing instruction if supported
      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}

// =============================================================================

// MUL + ADD -> MAD/FMA
// MIN/MAX(a, a) -> a, etc.
// SLCT(a, b, const) -> cc(const) ? a : b
// RCP(RCP(a)) -> a
// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void handleADD(Instruction *);
   void handleMINMAX(Instruction *);
   void handleRCP(Instruction *);
   void handleSLCT(Instruction *);
   void handleLOGOP(Instruction *);
   void handleCVT(Instruction *);
};

// Fuse ADD with a single-use MUL feeding it into MAD (NEG modifiers only).
void
AlgebraicOpt::handleADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src;
   int s;
   Modifier mod[4];

   if (!prog->getTarget()->isOpSupported(OP_MAD, add->dType))
      return;

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_MUL)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_MUL)
      s = 1;
   else
      return;

   // only fuse within the same basic block
   if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) ||
       (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb))
      return;

   src = add->getSrc(s);

   mod[0] = add->src[0].mod;
   mod[1] = add->src[1].mod;
   mod[2] = src->getUniqueInsn()->src[0].mod;
   mod[3] = src->getUniqueInsn()->src[1].mod;

   // only NEG modifiers survive the fusion
   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & Modifier(~NV50_IR_MOD_NEG))
      return;

   add->op = OP_MAD;
   add->subOp = src->getInsn()->subOp; // potentially mul-high

   add->setSrc(2, add->src[s ? 0 : 1]);

   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src[0].mod = mod[2] ^ mod[s];
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src[1].mod = mod[3];
}

// MIN/MAX of a value with itself collapses to a move (or CVT with modifier).
void
AlgebraicOpt::handleMINMAX(Instruction *minmax)
{
   Value *src0 = minmax->getSrc(0);
   Value *src1 = minmax->getSrc(1);

   if (src0 != src1 || src0->reg.file != FILE_GPR)
      return;
   if (minmax->src[0].mod == minmax->src[1].mod) {
      if (minmax->src[0].mod) {
         minmax->op = OP_CVT;
         minmax->setSrc(1, NULL);
      } else {
         minmax->def[0].replace(minmax->getSrc(0), false);
         minmax->bb->remove(minmax);
      }
   } else {
      // TODO:
      // min(x, -x) = -abs(x)
      // min(x, -abs(x)) = -abs(x)
      // min(x, abs(x)) = x
      // max(x, -abs(x)) = x
      // max(x, abs(x)) = abs(x)
      // max(x, -x) = abs(x)
   }
}

// RCP(RCP(a)) -> a (modifiers of both RCPs combined).
void
AlgebraicOpt::handleRCP(Instruction *rcp)
{
   Instruction *si = rcp->getSrc(0)->getUniqueInsn();

   if (si && si->op == OP_RCP) {
      Modifier mod = rcp->src[0].mod * si->src[0].mod;
      rcp->op = mod.getOp();
      rcp->setSrc(0, si->getSrc(0));
   }
}

// SLCT with a constant condition, or equal alternatives, becomes a MOV.
void
AlgebraicOpt::handleSLCT(Instruction *slct)
{
   if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
      if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
         slct->setSrc(0, slct->getSrc(1));
   } else
   if (slct->getSrc(0) != slct->getSrc(1)) {
      return;
   }
   slct->op = OP_MOV;
   slct->setSrc(1, NULL);
   slct->setSrc(2, NULL);
}

// AND/OR of a value with itself is the value; AND/OR/XOR of two SETs can be
// combined into a single SET_AND/OR/XOR via a predicate register.
void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      if (logop->src[0].mod != Modifier(0) ||
          logop->src[1].mod != Modifier(0))
         return;
      if (logop->op == OP_AND || logop->op == OP_OR) {
         logop->def[0].replace(logop->getSrc(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      set0 = set0->clone(true);
      set1 = set1->clone(false);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      // set0 now produces a predicate consumed by set1's combined SET
      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      switch (logop->op) {
      case OP_AND: set1->op = OP_SET_AND; break;
      case OP_OR:  set1->op = OP_SET_OR; break;
      case OP_XOR: set1->op = OP_SET_XOR; break;
      default:
         assert(0);
         break;
      }
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}

// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
void
AlgebraicOpt::handleCVT(Instruction *cvt)
{
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src[0].mod != Modifier(0))
      return;
   Instruction *insn = cvt->getSrc(0)->getInsn();
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src[0].mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32)
      return;

   Instruction *bset = insn->clone(false);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}

bool
AlgebraicOpt::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;
      switch (i->op) {
      case OP_ADD:
         handleADD(i);
         break;
      case OP_RCP:
         handleRCP(i);
         break;
      case OP_MIN:
      case OP_MAX:
         handleMINMAX(i);
         break;
      case OP_SLCT:
         handleSLCT(i);
         break;
      case OP_AND:
      case OP_OR:
      case OP_XOR:
         handleLOGOP(i);
         break;
      case OP_CVT:
         handleCVT(i);
         break;
      default:
         break;
      }
   }

   return true;
}

// =============================================================================

// Change a load/store's address offset; clones the address symbol first if it
// is shared so other users keep the old offset.
static inline void
updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
{
   if (offset != ldst->getSrc(0)->reg.data.offset) {
      if (ldst->getSrc(0)->refCount() > 1)
         ldst->setSrc(0, ldst->getSrc(0)->clone(fn));
      ldst->getSrc(0)->reg.data.offset = offset;
   }
}

// Combine loads and stores, forward stores to loads where possible.
class MemoryOpt : public Pass
{
private:
   // Bookkeeping entry for one load or store still eligible for combining.
   class Record
   {
   public:
      Record *next;
      Instruction *insn;
      const Value *rel[2];  // indirect address values (NULL if direct)
      const Value *base;
      int32_t offset;
      int8_t fileIndex;
      uint8_t size;
      bool locked;          // store may no longer be merged with later stores
      Record *prev;

      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   Record *prevRecord;
};

MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
{
   for (int i = 0; i < DATA_FILE_COUNT; ++i) {
      loads[i] = NULL;
      stores[i] = NULL;
   }
   prevRecord = NULL;
}

// Release all records back to the pool and clear the per-file lists.
void
MemoryOpt::reset()
{
   for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
      Record *it, *next;
      for (it = loads[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      loads[i] = NULL;
      for (it = stores[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      stores[i] = NULL;
   }
}

// Merge load @ld into the adjacent recorded load @rec->insn, widening its
// type and taking over @ld's defs.  Returns false if size/alignment forbid it.
bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   // only VFETCH can do a 96 byte load
   if (ld->op != OP_VFETCH && size == 12)
      return false;
   // no unaligned loads (8 byte needs 8 byte alignment, 12 needs 16)
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   // j = number of definitions already in rec->insn
   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, rec->insn->getSrc(0)->clone(func));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   rec->size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}

// Merge store @st with the adjacent recorded store @rec->insn into one wider
// store; @st survives and replaces rec->insn.
bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (size == 12) // XXX: check if EXPORT a[] can do this after all
      return false;
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= st->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}

// Initialize this record from a load/store's address operand.
void
MemoryOpt::Record::set(const Instruction *ldst)
{
   const Symbol *mem = ldst->getSrc(0)->asSym();
   fileIndex = mem->reg.fileIndex;
   rel[0] = ldst->getIndirect(0, 0);
   rel[1] = ldst->getIndirect(0, 1);
   offset = mem->reg.data.offset;
   base = mem->getBase();
   size = typeSizeof(ldst->sType);
}

void
MemoryOpt::Record::link(Record **list)
{
   next = *list;
   if (next)
      next->prev = this;
   prev = NULL;
   *list = this;
}

void
MemoryOpt::Record::unlink(Record **list)
{
   if (next)
      next->prev = prev;
   if (prev)
      prev->next = next;
   else
      *list = next;
}

// Select the load or store list matching @insn's opcode and memory file.
MemoryOpt::Record **
MemoryOpt::getList(const Instruction *insn)
{
   if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
      return &loads[insn->src[0].getFile()];
   return &stores[insn->src[0].getFile()];
}

void
MemoryOpt::addRecord(Instruction *i)
{
   Record **list = getList(i);
   Record *it = reinterpret_cast<Record *>(recordPool.allocate());

   it->link(list);
   it->set(i);
   it->insn = i;
   it->locked = false;
}

// Find a recorded access overlapping or adjacent to @insn in the same
// 16 byte region with identical indirect addressing; @isAdj reports whether
// it is merely adjacent (combine) rather than overlapping (replace).
MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      if (it->locked && insn->op != OP_LOAD)
         continue;
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it;
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}

// Forward the stored values of @rec->insn to load @ld (store-to-load
// forwarding); the load is removed if all defs could be replaced.
bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def[d].replace(st->getSrc(s), false);
   }
   ld->bb->remove(ld);
   return true;
}

// Replace redundant load @ldE by reusing the defs of earlier load @rec->insn.
bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def[dE].replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}

// Merge overlapping store @st over earlier store @rec->insn; @st survives
// with the union of both value ranges.
bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra);

   if (offR < offS) {
      Value *vals[4];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}

// Whether this record's memory range intersects the range accessed by @ldst.
bool
MemoryOpt::Record::overlaps(const Instruction *ldst) const
{
   Record that;
   that.set(ldst);

   if (this->fileIndex != that.fileIndex)
      return false;

   // with indirect addressing only base identity can be compared
   if (this->rel[0] || that.rel[0])
      return this->base == that.base;
   return
      (this->offset < that.offset + that.size) &&
      (this->offset + this->size > that.offset);
}

// We must not eliminate stores that affect the result of @ld if
// we find later stores to the same location, and we may no longer
// merge them with later stores.
// The stored value can, however, still be used to determine the value
// returned by future loads.
void
MemoryOpt::lockStores(Instruction *const ld)
{
   for (Record *r = stores[ld->src[0].getFile()]; r; r = r->next)
      if (!r->locked && r->overlaps(ld))
         r->locked = true;
}

// Prior loads from the location of @st are no longer valid.
// Stores to the location of @st may no longer be used to derive
// the value at it nor be coalesced into later stores.
void
MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
{
   if (st)
      f = st->src[0].getFile();

   for (Record *r = loads[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&loads[f]);

   for (Record *r = stores[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&stores[f]);
}

bool
MemoryOpt::visit(BasicBlock *bb)
{
   bool ret = runOpt(bb);
   // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
   // where 96 bit memory operations are forbidden.
+ if (ret) + ret = runOpt(bb); + return ret; +} + +bool +MemoryOpt::runOpt(BasicBlock *bb) +{ + Instruction *ldst, *next; + Record *rec; + bool isAdjacent = true; + + for (ldst = bb->getEntry(); ldst; ldst = next) { + bool keep = true; + bool isLoad = true; + next = ldst->next; + + if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) { + if (ldst->isDead()) { + // might have been produced by earlier optimization + delete_Instruction(prog, ldst); + continue; + } + } else + if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) { + isLoad = false; + } else { + // TODO: maybe have all fixed ops act as barrier ? + if (ldst->op == OP_CALL) { + purgeRecords(NULL, FILE_MEMORY_LOCAL); + purgeRecords(NULL, FILE_MEMORY_GLOBAL); + purgeRecords(NULL, FILE_MEMORY_SHARED); + purgeRecords(NULL, FILE_SHADER_OUTPUT); + } else + if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) { + purgeRecords(NULL, FILE_SHADER_OUTPUT); + } + continue; + } + if (ldst->getPredicate()) // TODO: handle predicated ld/st + continue; + + if (isLoad) { + DataFile file = ldst->src[0].getFile(); + + // if ld l[]/g[] look for previous store to eliminate the reload + if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) { + // TODO: shared memory ? + rec = findRecord(ldst, false, isAdjacent); + if (rec && !isAdjacent) + keep = !replaceLdFromSt(ldst, rec); + } + + // or look for ld from the same location and replace this one + rec = keep ? 
findRecord(ldst, true, isAdjacent) : NULL; + if (rec) { + if (!isAdjacent) + keep = !replaceLdFromLd(ldst, rec); + else + // or combine a previous load with this one + keep = !combineLd(rec, ldst); + } + if (keep) + lockStores(ldst); + } else { + rec = findRecord(ldst, false, isAdjacent); + if (rec) { + if (!isAdjacent) + keep = !replaceStFromSt(ldst, rec); + else + keep = !combineSt(rec, ldst); + } + if (keep) + purgeRecords(ldst, DATA_FILE_COUNT); + } + if (keep) + addRecord(ldst); + } + reset(); + + return true; +} + +// ============================================================================= + +// Turn control flow into predicated instructions (after register allocation !). +// TODO: +// Could move this to before register allocation on NVC0 and also handle nested +// constructs. +class FlatteningPass : public Pass +{ +private: + virtual bool visit(BasicBlock *); + + bool tryPredicateConditional(BasicBlock *); + void predicateInstructions(BasicBlock *, Value *pred, CondCode cc); + void tryPropagateBranch(BasicBlock *); + inline bool isConstantCondition(Value *pred); + inline bool mayPredicate(const Instruction *, const Value *pred) const; + inline void removeFlow(Instruction *); +}; + +bool +FlatteningPass::isConstantCondition(Value *pred) +{ + Instruction *insn = pred->getUniqueInsn(); + assert(insn); + if (insn->op != OP_SET || insn->srcExists(2)) + return false; + + for (int s = 0; s < 2 && insn->srcExists(s); ++s) { + Instruction *ld = insn->getSrc(s)->getUniqueInsn(); + DataFile file; + if (ld) { + if (ld->op != OP_MOV && ld->op != OP_LOAD) + return false; + if (ld->src[0].isIndirect(0)) + return false; + file = ld->src[0].getFile(); + } else { + file = insn->src[s].getFile(); + // catch $r63 on NVC0 + if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR) + file = FILE_IMMEDIATE; + } + if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST) + return false; + } + return true; +} + +void +FlatteningPass::removeFlow(Instruction *insn) +{ + 
FlowInstruction *term = insn ? insn->asFlow() : NULL; + if (!term) + return; + Graph::Edge::Type ty = term->bb->cfg.outgoing().getType(); + + if (term->op == OP_BRA) { + // TODO: this might get more difficult when we get arbitrary BRAs + if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK) + return; + } else + if (term->op != OP_JOIN) + return; + + delete_Instruction(prog, term); + + Value *pred = term->getPredicate(); + + if (pred && pred->refCount() == 0) { + Instruction *pSet = pred->getUniqueInsn(); + pred->join->reg.data.id = -1; // deallocate + if (pSet->isDead()) + delete_Instruction(prog, pSet); + } +} + +void +FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc) +{ + for (Instruction *i = bb->getEntry(); i; i = i->next) { + if (i->isNop()) + continue; + assert(!i->getPredicate()); + i->setPredicate(cc, pred); + } + removeFlow(bb->getExit()); +} + +bool +FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const +{ + if (insn->isPseudo()) + return true; + // TODO: calls where we don't know which registers are modified + + if (!prog->getTarget()->mayPredicate(insn, pred)) + return false; + for (int d = 0; insn->defExists(d); ++d) + if (insn->getDef(d)->equals(pred)) + return false; + return true; +} + +// If we conditionally skip over or to a branch instruction, replace it. +// NOTE: We do not update the CFG anymore here ! 
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   BasicBlock *bf = NULL;
   unsigned int i;

   // only handle a two-way fork that ends in a BRA
   if (bb->cfg.outgoingCount() != 2)
      return;
   if (!bb->getExit() || bb->getExit()->op != OP_BRA)
      return;
   Graph::EdgeIterator ei = bb->cfg.outgoing();

   // look for an out-block consisting of a single instruction
   for (i = 0; !ei.end(); ++i, ei.next()) {
      bf = BasicBlock::get(ei.getNode());
      if (bf->getInsnCount() == 1)
         break;
   }
   if (ei.end() || !bf->getExit())
      return;
   FlowInstruction *bra = bb->getExit()->asFlow();
   FlowInstruction *rep = bf->getExit()->asFlow();

   if (rep->getPredicate())
      return;
   if (rep->op != OP_BRA &&
       rep->op != OP_JOIN &&
       rep->op != OP_EXIT)
      return;

   // make the original branch do directly what @rep would have done
   bra->op = rep->op;
   bra->target.bb = rep->target.bb;
   if (i) // 2nd out block means branch not taken
      bra->cc = inverseCondCode(bra->cc);
   bf->remove(rep);
}

bool
FlatteningPass::visit(BasicBlock *bb)
{
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   Instruction *insn = bb->getExit();
   if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
      insn = insn->prev;
      if (insn && !insn->getPredicate() && !insn->asFlow() && !insn->isNop()) {
         insn->join = 1;
         bb->remove(bb->getExit());
         return true;
      }
   }

   tryPropagateBranch(bb);

   return true;
}

// If @bb ends in a simple conditional, turn the instructions of the
// taken / not-taken block(s) into predicated instructions and remove
// the branch and join at the fork point.
bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   // mask bit 0/1: 1st/2nd out-block may be predicated
   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   // use a stricter limit if the condition is constant
   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}

// =============================================================================

// Common subexpression elimination. Stupid O^2 implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // replace the defs of *ptr with those of an equivalent instruction
   inline bool tryReplace(Instruction **, Instruction *);

   // already-processed instructions of the current BB, bucketed by opcode
   DLList ops[OP_LAST + 1];
};

class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// Check whether this instruction and @that perform the same operation
// with the same modifiers (sources and defs are NOT compared here).
bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   if (this->asTex()) {
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      // flow instructions are never considered equal
      return false;
   } else {
      if (this->atomic != that->atomic ||
          this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache)
      return false;

   return true;
}

// Check whether @that computes exactly the same values: same action, same
// predicate source, matching def sizes and identical sources. For memory
// loads, only read-only files (const buffers, shader inputs) qualify.
bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   if (that->defExists(d))
      return false;

   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src[s].mod != that->src[s].mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   if (that->srcExists(s))
      return false;

   if (op == OP_LOAD || op == OP_VFETCH) {
      switch (src[0].getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      default:
         return false;
      }
   }

   return true;
}

// pull through common expressions from different in-blocks
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // iterate over all phi nodes at the head of @bb
   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      // all phi sources must be single-use results of equivalent instructions
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      if (!phi->srcExists(s)) {
         // hoist one copy into @bb (after a leading JOIN, if any) and
         // let it define the phi's destination directly
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}

// If *ptr computes the same result as @i, redirect its defs to @i's and
// delete it; *ptr is set to NULL on success so the caller can tell.
bool
LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
{
   Instruction *old = *ptr;
   if (!old->isResultEqual(i))
      return false;
   for (int d = 0; old->defExists(d); ++d)
      old->def[d].replace(i->getDef(d), false);
   delete_Instruction(prog, old);
   *ptr = NULL;
   return true;
}

bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   // repeat until a full sweep replaces nothing
   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getEntry(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getEntry(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         if (ir->fixed) {
            // fixed instructions must stay, but may serve as CSE sources
            ops[ir->op].insert(ir);
            continue;
         }

         // pick the LValue source with the fewest uses: candidates for
         // replacement are then found by scanning that value's use list
         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            // compare only against earlier instructions of the same BB
            for (ValueRef::Iterator refs = src->uses->iterator(); !refs.end();
                 refs.next()) {
               Instruction *ik = refs.get()->getInsn();
               if (ik->serial < ir->serial && ik->bb == ir->bb)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            // no LValue sources: compare against all seen ops of same opcode
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         // tryReplace() clears @ir via its first argument on success
         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}

// =============================================================================

// Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   // repeat DCE until no more instructions die
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   // number of instructions removed in the current run
   unsigned int deadCount;
};

// Run dead code elimination repeatedly: deleting an instruction can make
// the instructions defining its sources dead as well.
bool
DeadCodeElim::buryAll(Program *prog)
{
   do {
      deadCount = 0;
      if (!this->run(prog, false, false))
         return false;
   } while (deadCount);

   return true;
}

bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *next;

   for (Instruction *i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         // multi-def load: parts of the result may be unused
         checkSplitLoad(i);
      }
   }
   return true;
}

// Split a partially dead load into at most two smaller loads that cover
// only the live defs; dead defs (no refs, no register assigned) are dropped.
void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
   Instruction *ld2 = NULL; // can get at most 2 loads
   Value *def1[4];
   Value *def2[4];
   int32_t addr1, addr2;
   int32_t size1, size2;
   int d, n1, n2;
   uint32_t mask = 0xffffffff; // bit d clear <=> def d is dead

   for (d = 0; ld1->defExists(d); ++d)
      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
         mask &= ~(1 << d);
   if (mask == 0xffffffff)
      return;

   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;
   // collect the first contiguous run of live defs, skipping leading dead
   // ones (their sizes advance addr1); NOTE(review): growth stops early if
   // the run's start address is not 8 byte aligned
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
            break;
         def1[n1] = ld1->getDef(d);
         size1 += def1[n1++]->reg.size;
      } else
      if (!n1) {
         addr1 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }
   // remaining live defs go into the second load
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
      } else {
         assert(!n2);
         addr2 += ld1->getDef(d)->reg.size;
      }
   }

   // shrink @ld1 to the first run
   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
      ld1->setDef(d, (d < n1) ? def1[d] : NULL);

   if (!n2)
      return;

   // clone a second load for the remaining defs
   ld2 = ld1->clone(false);
   updateLdStOffset(ld2, addr2, func);
   ld2->setType(typeOfSize(size2));
   for (d = 0; d < 4; ++d)
      ld2->setDef(d, (d < n2) ? def2[d] : NULL);

   ld1->bb->insertAfter(ld1, ld2);
}

// =============================================================================

// Run pass @n (invoking its method @f) if the optimization level is >= @l.
#define RUN_PASS(l, n, f)                    \
   if (level >= (l)) {                       \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)  \
         INFO("PEEPHOLE: %s\n", #n);         \
      n pass;                                \
      if (!pass.f(this))                     \
         return false;                       \
   }

// Optimization pipeline run while the program is in SSA form.
bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(2, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);
   return true;
}

// Optimizations run after register allocation.
bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   return true;
}

} // namespace nv50_ir