diff options
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp | 91 | ||||
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp | 2 |
2 files changed, 82 insertions, 11 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index b17d57d0bfd..0fb76663ffe 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -37,18 +37,25 @@ namespace nv50_ir { // ah*bl 00 // // fffe0001 + fffe0001 +// +// Note that this sort of splitting doesn't work for signed values, so we +// compute the sign on those manually and then perform an unsigned multiply. static bool expandIntegerMUL(BuildUtil *bld, Instruction *mul) { const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH; - DataType fTy = mul->sType; // full type - DataType hTy; + DataType fTy; // full type + switch (mul->sType) { + case TYPE_S32: fTy = TYPE_U32; break; + case TYPE_S64: fTy = TYPE_U64; break; + default: fTy = mul->sType; break; + } + + DataType hTy; // half type switch (fTy) { - case TYPE_S32: hTy = TYPE_S16; break; case TYPE_U32: hTy = TYPE_U16; break; case TYPE_U64: hTy = TYPE_U32; break; - case TYPE_S64: hTy = TYPE_S32; break; default: return false; } @@ -59,15 +66,25 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul) bld->setPosition(mul, true); + Value *s[2]; Value *a[2], *b[2]; - Value *c[2]; Value *t[4]; for (int j = 0; j < 4; ++j) t[j] = bld->getSSA(fullSize); + s[0] = mul->getSrc(0); + s[1] = mul->getSrc(1); + + if (isSignedType(mul->sType)) { + s[0] = bld->getSSA(fullSize); + s[1] = bld->getSSA(fullSize); + bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0)); + bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1)); + } + // split sources into halves - i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0)); - i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1)); + i[0] = bld->mkSplit(a, halfSize, s[0]); + i[1] = bld->mkSplit(b, halfSize, s[1]); i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]); i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]); @@ -75,24 +92,76 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul) i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]); if (highResult) { - Value *r[4]; + Value *c[2]; + Value *r[5]; Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8)); c[0] = bld->getSSA(1, FILE_FLAGS); c[1] = bld->getSSA(1, FILE_FLAGS); - for (int j = 0; j < 4; ++j) + for (int j = 0; j < 5; ++j) r[j] = bld->getSSA(fullSize); i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8)); i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm); bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]); bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]); - i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]); + i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]); // set carry defs / sources i[3]->setFlagsDef(1, c[0]); - i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry + // actual result required in negative case, but ignored for + // unsigned. for some reason the compiler ends up dropping the whole + // instruction if the destination is unused but the flags are. + if (isSignedType(mul->sType)) + i[4]->setFlagsDef(1, c[1]); + else + i[4]->setFlagsDef(0, c[1]); i[6]->setPredicate(CC_C, c[0]); i[5]->setFlagsSrc(3, c[1]); + + if (isSignedType(mul->sType)) { + Value *cc[2]; + Value *rr[7]; + Value *one = bld->getSSA(fullSize); + bld->loadImm(one, 1); + for (int j = 0; j < 7; j++) + rr[j] = bld->getSSA(fullSize); + + // NOTE: this logic uses predicates because splitting basic blocks is + // ~impossible during the SSA phase. The RA relies on a correlation + // between edge order and phi node sources. + + // Set the sign of the result based on the inputs + bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1)) + ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS))); + + // 1s complement of 64-bit value + bld->mkOp1(OP_NOT, fTy, rr[0], r[4]) + ->setPredicate(CC_S, cc[0]); + bld->mkOp1(OP_NOT, fTy, rr[1], t[3]) + ->setPredicate(CC_S, cc[0]); + + // add to low 32-bits, keep track of the carry + Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one); + n->setPredicate(CC_S, cc[0]); + n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS))); + + // If there was a carry, add 1 to the upper 32 bits + // XXX: These get executed even if they shouldn't be + bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one) + ->setPredicate(CC_C, cc[1]); + bld->mkMov(rr[3], rr[0]) + ->setPredicate(CC_NC, cc[1]); + bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]); + + // Merge the results from the negative and non-negative paths + bld->mkMov(rr[5], rr[4]) + ->setPredicate(CC_S, cc[0]); + bld->mkMov(rr[6], r[4]) + ->setPredicate(CC_NS, cc[0]); + bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]); + } else { + bld->mkMov(mul->getDef(0), r[4]); + } } else { bld->mkMov(mul->getDef(0), t[3]); } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index 799ac2fd2ab..abadc7fb4e4 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -332,6 +332,8 @@ TargetNV50::insnCanLoad(const Instruction *i, int s, return false; if (sf == FILE_IMMEDIATE) return false; + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH && sf == FILE_MEMORY_CONST) + return false; ldSize = 2; } else { ldSize = typeSizeof(ld->dType); |