3 files changed, 172 insertions, 38 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 3ea0734331d..3de46287f70 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1091,6 +1091,156 @@ lp_build_mul(struct lp_build_context *bld,
    return res;
 }
 
+/*
+ * Widening mul, valid for 32x32 bit -> 64bit only.
+ * Result is low 32bits, high bits returned in res_hi.
+ */
+LLVMValueRef
+lp_build_mul_32_lohi(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     LLVMValueRef *res_hi)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+
+   assert(bld->type.width == 32);
+   assert(bld->type.floating == 0);
+   assert(bld->type.fixed == 0);
+   assert(bld->type.norm == 0);
+
+   /*
+    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
+    * for x86 simd is atrocious (even if the high bits weren't required),
+    * trying to handle real 64bit inputs (which of course can't happen due
+    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
+    * apparently llvm does not recognize this widening mul). This includes 6
+    * (instead of 2) pmuludq plus extra adds and shifts
+    * The same story applies to signed mul, albeit fixing this requires sse41.
+    * https://llvm.org/bugs/show_bug.cgi?id=30845
+    * So, whip up our own code, albeit only for length 4 and 8 (which
+    * should be good enough)...
+    */
+   if ((bld->type.length == 4 || bld->type.length == 8) &&
+       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
+        util_cpu_caps.has_sse4_1)) {
+      const char *intrinsic = NULL;
+      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
+      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
+      struct lp_type type_wide = lp_wider_type(bld->type);
+      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
+      unsigned i;
+      for (i = 0; i < bld->type.length; i += 2) {
+         shuf[i] = lp_build_const_int32(gallivm, i+1);
+         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+      }
+      shuf_vec = LLVMConstVector(shuf, bld->type.length);
+      aeven = a;
+      beven = b;
+      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
+      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
+
+      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
+         if (bld->type.sign) {
+            intrinsic = "llvm.x86.avx2.pmul.dq";
+         } else {
+            intrinsic = "llvm.x86.avx2.pmulu.dq";
+         }
+         muleven = lp_build_intrinsic_binary(builder, intrinsic,
+                                             wider_type, aeven, beven);
+         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+                                            wider_type, aodd, bodd);
+      }
+      else {
+         /* for consistent naming look elsewhere... */
+         if (bld->type.sign) {
+            intrinsic = "llvm.x86.sse41.pmuldq";
+         } else {
+            intrinsic = "llvm.x86.sse2.pmulu.dq";
+         }
+         /*
+          * XXX If we only have AVX but not AVX2 this is a pain.
+          * lp_build_intrinsic_binary_anylength() can't handle it
+          * (due to src and dst type not being identical).
+          */
+         if (bld->type.length == 8) {
+            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
+            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
+            LLVMValueRef muleven2[2], mulodd2[2];
+            struct lp_type type_wide_half = type_wide;
+            LLVMTypeRef wtype_half;
+            type_wide_half.length = 2;
+            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
+            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
+            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
+            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
+            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
+            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
+            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
+            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
+            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
+            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                    wtype_half, aevenlo, bevenlo);
+            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                   wtype_half, aoddlo, boddlo);
+            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                    wtype_half, aevenhi, bevenhi);
+            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
+                                                   wtype_half, aoddhi, boddhi);
+            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
+            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
+
+         }
+         else {
+            muleven = lp_build_intrinsic_binary(builder, intrinsic,
+                                                wider_type, aeven, beven);
+            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+                                               wider_type, aodd, bodd);
+         }
+      }
+      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
+      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
+
+      for (i = 0; i < bld->type.length; i += 2) {
+         shuf[i] = lp_build_const_int32(gallivm, i + 1);
+         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
+      }
+      shuf_vec = LLVMConstVector(shuf, bld->type.length);
+      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
+
+      for (i = 0; i < bld->type.length; i += 2) {
+         shuf[i] = lp_build_const_int32(gallivm, i);
+         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
+      }
+      shuf_vec = LLVMConstVector(shuf, bld->type.length);
+      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
+   }
+   else {
+      LLVMValueRef tmp;
+      struct lp_type type_tmp;
+      LLVMTypeRef wide_type, cast_type;
+
+      type_tmp = bld->type;
+      type_tmp.width *= 2;
+      wide_type = lp_build_vec_type(gallivm, type_tmp);
+      type_tmp = bld->type;
+      type_tmp.length *= 2;
+      cast_type = lp_build_vec_type(gallivm, type_tmp);
+
+      if (bld->type.sign) {
+         a = LLVMBuildSExt(builder, a, wide_type, "");
+         b = LLVMBuildSExt(builder, b, wide_type, "");
+      } else {
+         a = LLVMBuildZExt(builder, a, wide_type, "");
+         b = LLVMBuildZExt(builder, b, wide_type, "");
+      }
+      tmp = LLVMBuildMul(builder, a, b, "");
+      tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
+      *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
+      return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
+   }
+}
+
 
 /* a * b + c */
 LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 622b930a937..5d48b1c837d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -77,6 +77,12 @@ lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef b);
 
 LLVMValueRef
+lp_build_mul_32_lohi(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     LLVMValueRef *res_hi);
+
+LLVMValueRef
 lp_build_mul_imm(struct lp_build_context *bld,
                  LLVMValueRef a,
                  int b);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 2e837afe290..72d4579a042 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -842,26 +842,15 @@ imul_hi_emit(
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
    struct lp_build_context *int_bld = &bld_base->int_bld;
-   struct lp_type type = int_bld->type;
-   LLVMValueRef src0, src1;
-   LLVMValueRef dst64;
-   LLVMTypeRef typeRef;
-
-   assert(type.width == 32);
-   type.width = 64;
-   typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
-   src0 = LLVMBuildSExt(builder, emit_data->args[0], typeRef, "");
-   src1 = LLVMBuildSExt(builder, emit_data->args[1], typeRef, "");
-   dst64 = LLVMBuildMul(builder, src0, src1, "");
-   dst64 = LLVMBuildAShr(
-            builder, dst64,
-            lp_build_const_vec(bld_base->base.gallivm, type, 32), "");
-   type.width = 32;
-   typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
-   emit_data->output[emit_data->chan] =
-         LLVMBuildTrunc(builder, dst64, typeRef, "");
+   LLVMValueRef hi_bits;
+
+   assert(int_bld->type.width == 32);
+
+   /* low result bits are tossed away */
+   lp_build_mul_32_lohi(int_bld, emit_data->args[0],
+                        emit_data->args[0], &hi_bits);
+   emit_data->output[emit_data->chan] = hi_bits;
 }
 
 /* TGSI_OPCODE_UMUL_HI */
@@ -871,26 +860,15 @@ umul_hi_emit(
    struct lp_build_tgsi_context * bld_base,
    struct lp_build_emit_data * emit_data)
 {
-   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
    struct lp_build_context *uint_bld = &bld_base->uint_bld;
-   struct lp_type type = uint_bld->type;
-   LLVMValueRef src0, src1;
-   LLVMValueRef dst64;
-   LLVMTypeRef typeRef;
-
-   assert(type.width == 32);
-   type.width = 64;
-   typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
-   src0 = LLVMBuildZExt(builder, emit_data->args[0], typeRef, "");
-   src1 = LLVMBuildZExt(builder, emit_data->args[1], typeRef, "");
-   dst64 = LLVMBuildMul(builder, src0, src1, "");
-   dst64 = LLVMBuildLShr(
-            builder, dst64,
-            lp_build_const_vec(bld_base->base.gallivm, type, 32), "");
-   type.width = 32;
-   typeRef = lp_build_vec_type(bld_base->base.gallivm, type);
-   emit_data->output[emit_data->chan] =
-         LLVMBuildTrunc(builder, dst64, typeRef, "");
+   LLVMValueRef hi_bits;
+
+   assert(uint_bld->type.width == 32);
+
+   /* low result bits are tossed away */
+   lp_build_mul_32_lohi(uint_bld, emit_data->args[0],
+                        emit_data->args[0], &hi_bits);
+   emit_data->output[emit_data->chan] = hi_bits;
 }
 
 /* TGSI_OPCODE_MAX */