summaryrefslogtreecommitdiffstats
path: root/src/gallium/auxiliary/gallivm/lp_bld_gather.c
diff options
context:
space:
mode:
authorRoland Scheidegger <[email protected]>2016-12-11 23:39:22 +0100
committerRoland Scheidegger <[email protected]>2016-12-21 04:48:24 +0100
commit8bd67a35c50e68c21aed043de11e095c284d151a (patch)
tree990f5b573013944b86fca5fe923a9fcc79c88243 /src/gallium/auxiliary/gallivm/lp_bld_gather.c
parent5b950319ced820ee112f38f69b5694179c15815d (diff)
gallivm: optimize gather a bit, by using supplied destination type
By using a dst_type in the gather interface, gather has some more knowledge about how values should be fetched. E.g. if this is a 3x32bit fetch and dst_type is 4x32bit vector gather will no longer do a ZExt with a 96bit scalar value to 128bit, but just fetch the 96bit as 3x32bit vector (this is still going to be 2 loads of course, but the loads can be done directly to simd vector that way). Also, we can now try to use the right int/float type. This should make no difference really since there are typically no domain transition penalties for such simd loads, however it actually makes a difference since llvm will use different shuffle lowering afterwards so the caller can use this to trick llvm into using sane shuffle afterwards (and yes llvm is really stupid there - nothing against using the shuffle instruction from the correct domain, but not at the cost of doing 3 times more shuffles, the case which actually matters is refusal to use shufps for integer values). Also make some attempt to avoid things which look great on paper but llvm doesn't really handle (e.g. fetching 3-element 8 bit and 16 bit vectors which is simply disastrous - I suspect type legalizer is to blame trying to extend these vectors to 128bit types somehow, so fetching these with scalars like before which is suboptimal due to the ZExt). Remove the ability for truncation (no point, this is gather, not conversion) as it is complex enough already. While here also implement not just the float, but also the 64bit avx2 gathers (disabled though since based on the theoretical numbers the benefit just isn't there at all until Skylake at least). Reviewed-by: Jose Fonseca <[email protected]>
Diffstat (limited to 'src/gallium/auxiliary/gallivm/lp_bld_gather.c')
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_gather.c359
1 files changed, 296 insertions, 63 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
index 1f7ba927bc4..7654ba01d88 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -28,6 +28,7 @@
#include "util/u_debug.h"
#include "util/u_cpu_detect.h"
+#include "util/u_math.h"
#include "lp_bld_debug.h"
#include "lp_bld_const.h"
#include "lp_bld_format.h"
@@ -36,6 +37,7 @@
#include "lp_bld_type.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
+#include "lp_bld_pack.h"
/**
@@ -114,14 +116,29 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
* translation of offsets to first_elem in sampler_views it actually seems
* gallium could not do anything else except 16 no matter what...
*/
- if (!aligned) {
+ if (!aligned) {
LLVMSetAlignment(res, 1);
+ } else if (!util_is_power_of_two(src_width)) {
+ /*
+ * Full alignment is impossible, assume the caller really meant
+ * the individual elements were aligned (e.g. 3x32bit format).
+ * And yes the generated code may otherwise crash, llvm will
+ * really assume 128bit alignment with a 96bit fetch (I suppose
+ * that makes sense as it can just assume the upper 32bit to be
+ * whatever).
+ * Maybe the caller should be able to explicitly set this, but
+ * this should cover all the 3-channel formats.
+ */
+ if (((src_width / 24) * 24 == src_width) &&
+ util_is_power_of_two(src_width / 24)) {
+ LLVMSetAlignment(res, src_width / 24);
+ } else {
+ LLVMSetAlignment(res, 1);
+ }
}
assert(src_width <= dst_width);
- if (src_width > dst_width) {
- res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, "");
- } else if (src_width < dst_width) {
+ if (src_width < dst_width) {
res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
if (vector_justify) {
#ifdef PIPE_ARCH_BIG_ENDIAN
@@ -135,28 +152,134 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
}
+/**
+ * Gather one element from scatter positions in memory.
+ * Nearly the same as above, however the individual elements
+ * may be vectors themselves, and fetches may be float type.
+ * Can also do pad vector instead of ZExt.
+ *
+ * @sa lp_build_gather()
+ */
+static LLVMValueRef
+lp_build_gather_elem_vec(struct gallivm_state *gallivm,
+ unsigned length,
+ unsigned src_width,
+ LLVMTypeRef src_type,
+ struct lp_type dst_type,
+ boolean aligned,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offsets,
+ unsigned i,
+ boolean vector_justify)
+{
+ LLVMValueRef ptr, res;
+ LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+ assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
+
+ ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
+ ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
+ res = LLVMBuildLoad(gallivm->builder, ptr, "");
+
+ /* XXX
+ * On some archs we probably really want to avoid having to deal
+ * with alignments lower than 4 bytes (if fetch size is a power of
+ * two >= 32). On x86 it doesn't matter, however.
+ * We should be able to guarantee full alignment for any kind of texture
+ * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
+ * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
+ * but I don't think that's quite what we wanted).
+ * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
+ * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
+ * enforcing what we want (which is what d3d10 does, the offset needs to
+ * be aligned to element size, but GL has bytes regardless of element
+ * size which would only leave us with minimum alignment restriction of 16
+ * which doesn't make much sense if the type isn't 4x32bit). Due to
+ * translation of offsets to first_elem in sampler_views it actually seems
+ * gallium could not do anything else except 16 no matter what...
+ */
+ if (!aligned) {
+ LLVMSetAlignment(res, 1);
+ } else if (!util_is_power_of_two(src_width)) {
+ /*
+ * Full alignment is impossible, assume the caller really meant
+ * the individual elements were aligned (e.g. 3x32bit format).
+ * And yes the generated code may otherwise crash, llvm will
+ * really assume 128bit alignment with a 96bit fetch (I suppose
+ * that makes sense as it can just assume the upper 32bit to be
+ * whatever).
+ * Maybe the caller should be able to explicitly set this, but
+ * this should cover all the 3-channel formats.
+ */
+ if (((src_width / 24) * 24 == src_width) &&
+ util_is_power_of_two(src_width / 24)) {
+ LLVMSetAlignment(res, src_width / 24);
+ } else {
+ LLVMSetAlignment(res, 1);
+ }
+ }
+
+ assert(src_width <= dst_type.width * dst_type.length);
+ if (src_width < dst_type.width * dst_type.length) {
+ if (dst_type.length > 1) {
+ res = lp_build_pad_vector(gallivm, res, dst_type.length);
+ /*
+ * vector_justify hopefully a non-issue since we only deal
+ * with src_width >= 32 here?
+ */
+ } else {
+ /*
+ * Only valid if src_ptr_type is int type...
+ */
+ res = LLVMBuildZExt(gallivm->builder, res,
+ lp_build_vec_type(gallivm, dst_type), "");
+ if (vector_justify) {
+#ifdef PIPE_ARCH_BIG_ENDIAN
+ res = LLVMBuildShl(gallivm->builder, res,
+ LLVMConstInt(dst_elem_type,
+ dst_type.width - src_width, 0), "");
+#endif
+ }
+ }
+ }
+ return res;
+}
+
+
+
+
static LLVMValueRef
lp_build_gather_avx2(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
- unsigned dst_width,
+ struct lp_type dst_type,
LLVMValueRef base_ptr,
LLVMValueRef offsets)
{
LLVMBuilderRef builder = gallivm->builder;
- LLVMTypeRef dst_type = LLVMIntTypeInContext(gallivm->context, dst_width);
- LLVMTypeRef dst_vec_type = LLVMVectorType(dst_type, length);
- LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
- LLVMTypeRef src_vec_type = LLVMVectorType(src_type, length);
+ LLVMTypeRef src_type, src_vec_type;
LLVMValueRef res;
+ struct lp_type res_type = dst_type;
+ res_type.length *= length;
+ if (dst_type.floating) {
+ src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
+ LLVMFloatTypeInContext(gallivm->context);
+ } else {
+ src_type = LLVMIntTypeInContext(gallivm->context, src_width);
+ }
+ src_vec_type = LLVMVectorType(src_type, length);
+
+ /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
if (0) {
/*
* XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
- * will not use the AVX2 gather instrinsics. See
+ * will not use the AVX2 gather intrinsics (even with llvm 4.0), at
+ * least with Haswell. See
* http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
+ * And the generated code doing the emulation is quite a bit worse
+ * than what we get by doing it ourselves too.
*/
LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
@@ -176,7 +299,8 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
char intrinsic[64];
- util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%ui%u", length, src_width);
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
+ length, dst_type.floating ? "f" : "i", src_width);
LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
@@ -185,27 +309,36 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
} else {
- assert(src_width == 32);
-
LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
-
- /*
- * We should get the caller to give more type information so we can use
- * the intrinsics for the right int/float domain. Int should be the most
- * common.
- */
const char *intrinsic = NULL;
- switch (length) {
- case 4:
- intrinsic = "llvm.x86.avx2.gather.d.d";
- break;
- case 8:
- intrinsic = "llvm.x86.avx2.gather.d.d.256";
- break;
- default:
- assert(0);
+ unsigned l_idx = 0;
+
+ assert(src_width == 32 || src_width == 64);
+ if (src_width == 32) {
+ assert(length == 4 || length == 8);
+ } else {
+ assert(length == 2 || length == 4);
}
+ static const char *intrinsics[2][2][2] = {
+
+ {{"llvm.x86.avx2.gather.d.d",
+ "llvm.x86.avx2.gather.d.d.256"},
+ {"llvm.x86.avx2.gather.d.q",
+ "llvm.x86.avx2.gather.d.q.256"}},
+
+ {{"llvm.x86.avx2.gather.d.ps",
+ "llvm.x86.avx2.gather.d.ps.256"},
+ {"llvm.x86.avx2.gather.d.pd",
+ "llvm.x86.avx2.gather.d.pd.256"}},
+ };
+
+ if ((src_width == 32 && length == 8) ||
+ (src_width == 64 && length == 4)) {
+ l_idx = 1;
+ }
+ intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
+
LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
mask = LLVMConstBitCast(mask, src_vec_type);
@@ -215,12 +348,7 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
}
-
- if (src_width > dst_width) {
- res = LLVMBuildTrunc(builder, res, dst_vec_type, "");
- } else if (src_width < dst_width) {
- res = LLVMBuildZExt(builder, res, dst_vec_type, "");
- }
+ res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
return res;
}
@@ -241,9 +369,11 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
*
* @param length length of the offsets
* @param src_width src element width in bits
- * @param dst_width result element width in bits (src will be expanded to fit)
+ * @param dst_type result element type (src will be expanded to fit,
+ * but truncation is not allowed)
+ * (this may be a vector, must be pot sized)
* @param aligned whether the data is guaranteed to be aligned (to src_width)
- * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param base_ptr base pointer, needs to be a i8 pointer type.
* @param offsets vector with offsets
* @param vector_justify select vector rather than integer justification
*/
@@ -251,41 +381,121 @@ LLVMValueRef
lp_build_gather(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
- unsigned dst_width,
+ struct lp_type dst_type,
boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offsets,
boolean vector_justify)
{
LLVMValueRef res;
+ boolean need_expansion = src_width < dst_type.width * dst_type.length;
+ boolean vec_fetch;
+ struct lp_type fetch_type, fetch_dst_type;
+ LLVMTypeRef src_type;
+
+ assert(src_width <= dst_type.width * dst_type.length);
+
+ /*
+ * This is quite a mess...
+ * Figure out if the fetch should be done as:
+ * a) scalar or vector
+ * b) float or int
+ *
+ * As an example, for a 96bit fetch expanded into 4x32bit, it is better
+ * to use (3x32bit) vector type (then pad the vector). Otherwise, the
+ * zext will cause extra instructions.
+ * However, the same isn't true for 3x16bit (the codegen for that is
+ * completely worthless on x86 simd, and for 3x8bit it is way worse
+ * still, don't try that... (To get really good code out of llvm for
+ * these cases, the only way is to decompose the fetches manually
+ * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
+ * case requires sse41, otherwise simple scalar zext is way better.
+ * But probably not important enough, so don't bother.)
+ * Also, we try to honor the floating bit of destination (but isn't
+ * possible if caller asks for instance for 2x32bit dst_type with
+ * 48bit fetch - the idea would be to use 3x16bit fetch, pad and
+ * cast to 2x32f type, so the fetch is always int and on top of that
+ * we avoid the vec pad and use scalar zext due the above mentioned
+ * issue).
+ * Note this is optimized for x86 sse2 and up backend. Could be tweaked
+ * for other archs if necessary...
+ */
+ if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
+ (dst_type.length > 1)) {
+ /* use vector fetch (if dst_type is vector) */
+ vec_fetch = TRUE;
+ if (dst_type.floating) {
+ fetch_type = lp_type_float_vec(dst_type.width, src_width);
+ } else {
+ fetch_type = lp_type_int_vec(dst_type.width, src_width);
+ }
+ /* intentionally not using lp_build_vec_type here */
+ src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
+ fetch_type.length);
+ fetch_dst_type = fetch_type;
+ fetch_dst_type.length = dst_type.length;
+ } else {
+ /* use scalar fetch */
+ vec_fetch = FALSE;
+ if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
+ fetch_type = lp_type_float(src_width);
+ } else {
+ fetch_type = lp_type_int(src_width);
+ }
+ src_type = lp_build_vec_type(gallivm, fetch_type);
+ fetch_dst_type = fetch_type;
+ fetch_dst_type.width = dst_type.width * dst_type.length;
+ }
if (length == 1) {
/* Scalar */
- return lp_build_gather_elem(gallivm, length,
- src_width, dst_width, aligned,
- base_ptr, offsets, 0, vector_justify);
- } else if (util_cpu_caps.has_avx2 && src_width == 32 && (length == 4 || length == 8)) {
- return lp_build_gather_avx2(gallivm, length, src_width, dst_width, base_ptr, offsets);
+ res = lp_build_gather_elem_vec(gallivm, length,
+ src_width, src_type, fetch_dst_type,
+ aligned, base_ptr, offsets, 0,
+ vector_justify);
+ return LLVMBuildBitCast(gallivm->builder, res,
+ lp_build_vec_type(gallivm, dst_type), "");
+ /*
+ * Excluding expansion from these paths because if you need it for
+ * 32bit/64bit fetches you're doing it wrong (this is gather, not
+ * conversion) and it would be awkward for floats.
+ */
+ } else if (util_cpu_caps.has_avx2 && !need_expansion &&
+ src_width == 32 && (length == 4 || length == 8)) {
+ return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+ base_ptr, offsets);
+ /*
+ * This looks bad on paper wrt throughput/latency on Haswell.
+ * Even on Broadwell it doesn't look stellar.
+ * Albeit no measurements were done (but tested to work).
+ * Should definitely enable on Skylake.
+ * (In general, should be more of a win if the fetch is 256bit wide -
+ * this is true for the 32bit case above too.)
+ */
+ } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
+ src_width == 64 && (length == 2 || length == 4)) {
+ return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+ base_ptr, offsets);
} else {
/* Vector */
- LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
- LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
- LLVMTypeRef gather_vec_type = dst_vec_type;
+ LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
unsigned i;
boolean vec_zext = FALSE;
- unsigned gather_width = dst_width;
+ struct lp_type res_type, gather_res_type;
+ LLVMTypeRef res_t, gather_res_t;
+ res_type = fetch_dst_type;
+ res_type.length *= length;
+ gather_res_type = res_type;
- if (src_width == 16 && dst_width == 32) {
- LLVMTypeRef g_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width / 2);
- gather_vec_type = LLVMVectorType(g_elem_type, length);
+ if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
/*
* Note that llvm is never able to optimize zext/insert combos
* directly (i.e. zero the simd reg, then place the elements into
- * the appropriate place directly). And 16->32bit zext simd loads
+ * the appropriate place directly). (I think this has to do with
+ * scalar/vector transition.) And scalar 16->32bit zext simd loads
* aren't possible (instead loading to scalar reg first).
- * (I think this has to do with scalar/vector transition.)
* No idea about other archs...
* We could do this manually, but instead we just use a vector
* zext, which is simple enough (and, in fact, llvm might optimize
@@ -293,30 +503,53 @@ lp_build_gather(struct gallivm_state *gallivm,
* (We're not trying that with other bit widths as that might not be
* easier, in particular with 8 bit values at least with only sse2.)
*/
+ assert(vec_fetch == FALSE);
+ gather_res_type.width /= 2;
+ fetch_dst_type = fetch_type;
+ src_type = lp_build_vec_type(gallivm, fetch_type);
vec_zext = TRUE;
- gather_width = 16;
}
- res = LLVMGetUndef(gather_vec_type);
+ res_t = lp_build_vec_type(gallivm, res_type);
+ gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
+ res = LLVMGetUndef(gather_res_t);
for (i = 0; i < length; ++i) {
LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef elem;
- elem = lp_build_gather_elem(gallivm, length,
- src_width, gather_width, aligned,
- base_ptr, offsets, i, vector_justify);
- res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
+ elems[i] = lp_build_gather_elem_vec(gallivm, length,
+ src_width, src_type, fetch_dst_type,
+ aligned, base_ptr, offsets, i,
+ vector_justify);
+ if (!vec_fetch) {
+ res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
+ }
}
if (vec_zext) {
- res = LLVMBuildZExt(gallivm->builder, res, dst_vec_type, "");
+ res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
if (vector_justify) {
#if PIPE_ARCH_BIG_ENDIAN
- struct lp_type dst_type;
- unsigned sv = dst_width - src_width;
- dst_type = lp_type_uint_vec(dst_width, dst_width * length);
+ unsigned sv = dst_type.width - src_width;
res = LLVMBuildShl(gallivm->builder, res,
- lp_build_const_int_vec(gallivm, dst_type, sv), "");
+ lp_build_const_int_vec(gallivm, res_type, sv), "");
#endif
}
}
+ if (vec_fetch) {
+ /*
+ * Do bitcast now otherwise llvm might get some funny ideas wrt
+ * float/int types...
+ */
+ for (i = 0; i < length; i++) {
+ elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
+ lp_build_vec_type(gallivm, dst_type), "");
+ }
+ res = lp_build_concat(gallivm, elems, dst_type, length);
+ } else {
+ struct lp_type really_final_type = dst_type;
+ assert(res_type.length * res_type.width ==
+ dst_type.length * dst_type.width * length);
+ really_final_type.length *= length;
+ res = LLVMBuildBitCast(gallivm->builder, res,
+ lp_build_vec_type(gallivm, really_final_type), "");
+ }
}
return res;