about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp 91
-rw-r--r-- src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp 2
2 files changed, 82 insertions, 11 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index b17d57d0bfd..0fb76663ffe 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -37,18 +37,25 @@ namespace nv50_ir {
// ah*bl 00
//
// fffe0001 + fffe0001
+//
+// Note that this sort of splitting doesn't work for signed values, so we
+// compute the sign on those manually and then perform an unsigned multiply.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
- DataType fTy = mul->sType; // full type
- DataType hTy;
+ DataType fTy; // full type
+ switch (mul->sType) {
+ case TYPE_S32: fTy = TYPE_U32; break;
+ case TYPE_S64: fTy = TYPE_U64; break;
+ default: fTy = mul->sType; break;
+ }
+
+ DataType hTy; // half type
switch (fTy) {
- case TYPE_S32: hTy = TYPE_S16; break;
case TYPE_U32: hTy = TYPE_U16; break;
case TYPE_U64: hTy = TYPE_U32; break;
- case TYPE_S64: hTy = TYPE_S32; break;
default:
return false;
}
@@ -59,15 +66,25 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
bld->setPosition(mul, true);
+ Value *s[2];
Value *a[2], *b[2];
- Value *c[2];
Value *t[4];
for (int j = 0; j < 4; ++j)
t[j] = bld->getSSA(fullSize);
+ s[0] = mul->getSrc(0);
+ s[1] = mul->getSrc(1);
+
+ if (isSignedType(mul->sType)) {
+ s[0] = bld->getSSA(fullSize);
+ s[1] = bld->getSSA(fullSize);
+ bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
+ bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
+ }
+
// split sources into halves
- i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
- i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
+ i[0] = bld->mkSplit(a, halfSize, s[0]);
+ i[1] = bld->mkSplit(b, halfSize, s[1]);
i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
@@ -75,24 +92,76 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
if (highResult) {
- Value *r[4];
+ Value *c[2];
+ Value *r[5];
Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
c[0] = bld->getSSA(1, FILE_FLAGS);
c[1] = bld->getSSA(1, FILE_FLAGS);
- for (int j = 0; j < 4; ++j)
+ for (int j = 0; j < 5; ++j)
r[j] = bld->getSSA(fullSize);
i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
- i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
+ i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
// set carry defs / sources
i[3]->setFlagsDef(1, c[0]);
- i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
+ // The actual result is required in the signed case (it is negated
+ // below), but ignored for unsigned. For some reason the compiler ends up
+ // dropping the whole instruction if the destination is unused, even though the flags are used.
+ if (isSignedType(mul->sType))
+ i[4]->setFlagsDef(1, c[1]);
+ else
+ i[4]->setFlagsDef(0, c[1]);
i[6]->setPredicate(CC_C, c[0]);
i[5]->setFlagsSrc(3, c[1]);
+
+ if (isSignedType(mul->sType)) {
+ Value *cc[2];
+ Value *rr[7];
+ Value *one = bld->getSSA(fullSize);
+ bld->loadImm(one, 1);
+ for (int j = 0; j < 7; j++)
+ rr[j] = bld->getSSA(fullSize);
+
+ // NOTE: this logic uses predicates because splitting basic blocks is
+ // ~impossible during the SSA phase. The RA relies on a correlation
+ // between edge order and phi node sources.
+
+ // Set the sign of the result based on the inputs
+ bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
+ ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
+
+ // 1s complement of 64-bit value
+ bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
+ ->setPredicate(CC_S, cc[0]);
+ bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
+ ->setPredicate(CC_S, cc[0]);
+
+ // add 1 to the low 32 bits, keeping track of the carry
+ Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
+ n->setPredicate(CC_S, cc[0]);
+ n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
+
+ // If there was a carry, add 1 to the upper 32 bits
+ // XXX: These get executed even if they shouldn't be
+ bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
+ ->setPredicate(CC_C, cc[1]);
+ bld->mkMov(rr[3], rr[0])
+ ->setPredicate(CC_NC, cc[1]);
+ bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
+
+ // Merge the results from the negative and non-negative paths
+ bld->mkMov(rr[5], rr[4])
+ ->setPredicate(CC_S, cc[0]);
+ bld->mkMov(rr[6], r[4])
+ ->setPredicate(CC_NS, cc[0]);
+ bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
+ } else {
+ bld->mkMov(mul->getDef(0), r[4]);
+ }
} else {
bld->mkMov(mul->getDef(0), t[3]);
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index 799ac2fd2ab..abadc7fb4e4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -332,6 +332,8 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
return false;
if (sf == FILE_IMMEDIATE)
return false;
+ if (i->subOp == NV50_IR_SUBOP_MUL_HIGH && sf == FILE_MEMORY_CONST)
+ return false;
ldSize = 2;
} else {
ldSize = typeSizeof(ld->dType);