summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJason Ekstrand <[email protected]>2017-12-29 14:38:55 -0800
committerJason Ekstrand <[email protected]>2018-12-13 17:49:48 +0000
commit39198a1238c967f17a6c3d701fe215315ecbab69 (patch)
tree51b7d8d4cf1aa9f6c395ce921197f7e8bf86d517
parent9525971e2bec379bdbc187bfd325e7f6ded01eb5 (diff)
nir/lower_int64: Add support for [iu]mul_high
Reviewed-by: Ian Romanick [email protected]
-rw-r--r--src/compiler/nir/nir.h2
-rw-r--r--src/compiler/nir/nir_lower_int64.c65
2 files changed, 67 insertions, 0 deletions
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 5d9c96fe11e..a2c68d66aea 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3106,6 +3106,8 @@ typedef enum {
nir_lower_isign64 = (1 << 1),
/** Lower all int64 modulus and division opcodes */
nir_lower_divmod64 = (1 << 2),
+ /** Lower all 64-bit umul_high and imul_high opcodes */
+ nir_lower_imul_high64 = (1 << 3),
} nir_lower_int64_options;
bool nir_lower_int64(nir_shader *shader, nir_lower_int64_options options);
diff --git a/src/compiler/nir/nir_lower_int64.c b/src/compiler/nir/nir_lower_int64.c
index 81669c02cc6..2a9ea3e1bdd 100644
--- a/src/compiler/nir/nir_lower_int64.c
+++ b/src/compiler/nir/nir_lower_int64.c
@@ -41,6 +41,64 @@ lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
}
static nir_ssa_def *
+lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
+ bool sign_extend) /* true when lowering imul_high, false for umul_high */
+{
+ nir_ssa_def *x32[4], *y32[4];
+ x32[0] = nir_unpack_64_2x32_split_x(b, x);
+ x32[1] = nir_unpack_64_2x32_split_y(b, x);
+ if (sign_extend) {
+ x32[2] = x32[3] = nir_ishr(b, x32[1], nir_imm_int(b, 31)); /* replicate the top bit of x32[1] */
+ } else {
+ x32[2] = x32[3] = nir_imm_int(b, 0);
+ }
+
+ y32[0] = nir_unpack_64_2x32_split_x(b, y);
+ y32[1] = nir_unpack_64_2x32_split_y(b, y);
+ if (sign_extend) {
+ y32[2] = y32[3] = nir_ishr(b, y32[1], nir_imm_int(b, 31)); /* replicate the top bit of y32[1] */
+ } else {
+ y32[2] = y32[3] = nir_imm_int(b, 0);
+ }
+
+ nir_ssa_def *res[8] = { NULL, };
+
+ /* Build the high 64 bits of x * y with a schoolbook multiplication over
+ * 32-bit dwords. Each operand has been widened above to four dwords
+ * (x32[], y32[]): the two dwords unpacked from the 64-bit value followed
+ * by two dwords of sign- or zero-extension. res[k] accumulates dword k
+ * of the product; index 0 is treated as least significant (this assumes
+ * split_x yields the low dword -- confirm against the NIR opcode docs).
+ *
+ * Yes, the following generates a pile of code. However, we throw res[0]
+ * and res[1] away in the end and, if we're in the umul case, four of our
+ * eight dword operands will be constant zero and opt_algebraic will clean
+ * this up nicely.
+ *
+ * NOTE(review): res[4] and above are computed here but never feed into
+ * res[2] or res[3], the only values returned; presumably they are left
+ * for later dead-code elimination.
+ */
+ for (unsigned i = 0; i < 4; i++) {
+ nir_ssa_def *carry = NULL;
+ for (unsigned j = 0; j < 4; j++) {
+ /* The maximum values of x32[i] and y32[j] are UINT32_MAX so the
+ * maximum value of tmp is UINT32_MAX * UINT32_MAX. The maximum
+ * value that will fit in tmp is
+ *
+ * UINT64_MAX = (UINT32_MAX << 32) + UINT32_MAX
+ * = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
+ * = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
+ *
+ * so we're guaranteed that we can add in two more 32-bit values
+ * (the previous res[i + j] and the incoming carry) without
+ * overflowing tmp.
+ */
+ nir_ssa_def *tmp =
+ nir_pack_64_2x32_split(b, nir_imul(b, x32[i], y32[j]),
+ nir_umul_high(b, x32[i], y32[j]));
+ if (res[i + j])
+ tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
+ if (carry)
+ tmp = nir_iadd(b, tmp, carry);
+ res[i + j] = nir_u2u32(b, tmp); /* low half stays in this column */
+ carry = nir_ushr(b, tmp, nir_imm_int(b, 32)); /* high half moves to the next column */
+ }
+ res[i + 4] = nir_u2u32(b, carry); /* final carry of row i starts column i + 4 */
+ }
+
+ return nir_pack_64_2x32_split(b, res[2], res[3]); /* product dwords 2 and 3 */
+}
+
+static nir_ssa_def *
lower_isign64(nir_builder *b, nir_ssa_def *x)
{
nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
@@ -209,6 +267,9 @@ opcode_to_options_mask(nir_op opcode)
switch (opcode) {
case nir_op_imul:
return nir_lower_imul64;
+ case nir_op_imul_high:
+ case nir_op_umul_high:
+ return nir_lower_imul_high64;
case nir_op_isign:
return nir_lower_isign64;
case nir_op_udiv:
@@ -232,6 +293,10 @@ lower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)
switch (alu->op) {
case nir_op_imul:
return lower_imul64(b, src[0], src[1]);
+ case nir_op_imul_high:
+ return lower_mul_high64(b, src[0], src[1], true);
+ case nir_op_umul_high:
+ return lower_mul_high64(b, src[0], src[1], false);
case nir_op_isign:
return lower_isign64(b, src[0]);
case nir_op_udiv: