8 files changed, 333 insertions, 79 deletions
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index c5485728e42..19b75a5003b 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1864,7 +1864,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
                                        LLVMPointerType(LLVMInt8TypeInContext(context),
                                                        0), "");
          tmp = lp_build_gather(gallivm, vs_type.length,
-                               32, 32, TRUE,
+                               32, bld.type, TRUE,
                                fetch_elts, tmp, FALSE);
          LLVMBuildStore(builder, tmp, index_store);
       }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 9f6b9e9fb6f..322e7b817db 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -224,6 +224,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
    /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
     * into masked = {X, Y, Z, W}
     */
+   /* Note: we cannot do this shift on x86 natively until AVX2. */
    shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
    masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
 
@@ -394,6 +395,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
        util_is_power_of_two(format_desc->block.bits)) {
       LLVMValueRef packed;
       LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
+      struct lp_type fetch_type;
       unsigned vec_len = type.width * type.length;
 
       /*
@@ -401,8 +403,9 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
        * scaling or converting.
        */
 
+      fetch_type = lp_type_uint(type.width*4);
       packed = lp_build_gather(gallivm, type.length/4,
-                               format_desc->block.bits, type.width*4,
+                               format_desc->block.bits, fetch_type,
                                aligned, base_ptr, offset, TRUE);
 
       assert(format_desc->block.bits <= vec_len);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
index 8cad3a6fc65..636a4a6238f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -70,7 +70,14 @@ lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
 
    src_vec_type  = lp_build_vec_type(gallivm,  src_type);
 
-   /* Read whole vector from memory, unaligned */
+   /*
+    * Read whole vector from memory, unaligned.
+    * XXX: Note it's actually aligned to element type. Not sure if all
+    * callers are able to guarantee that (whereas for others, we should
+    * be able to use full alignment when there's 2 or 4 channels).
+    * (If all callers can guarantee element type alignment, we should
+    * relax alignment restrictions elsewhere.)
+    */
    ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
    ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
    res = LLVMBuildLoad(builder, ptr, "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index a48d71f0903..b3bc15552c5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -405,6 +405,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                         LLVMValueRef rgba_out[4])
 {
    LLVMBuilderRef builder = gallivm->builder;
+   enum pipe_format format = format_desc->format;
+   struct lp_type fetch_type;
 
    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
@@ -430,10 +432,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
        * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
        */
       assert(format_desc->block.bits <= type.width);
+      fetch_type = lp_type_uint(type.width);
       packed = lp_build_gather(gallivm,
                                type.length,
                                format_desc->block.bits,
-                               type.width,
+                               fetch_type,
                                aligned,
                                base_ptr, offset, FALSE);
 
@@ -447,22 +450,23 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
       return;
    }
 
-   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
-       format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
       /*
        * similar conceptually to above but requiring special
        * AoS packed -> SoA float conversion code.
        */
       LLVMValueRef packed;
+      struct lp_type fetch_type = lp_type_uint(type.width);
 
       assert(type.floating);
       assert(type.width == 32);
 
       packed = lp_build_gather(gallivm, type.length,
                                format_desc->block.bits,
-                               type.width, aligned,
+                               fetch_type, aligned,
                                base_ptr, offset, FALSE);
-      if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
          lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
       }
       else {
@@ -478,8 +482,9 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
        * 32bit (or 8bit) from each block.
        */
       LLVMValueRef packed;
+      struct lp_type fetch_type = lp_type_uint(type.width);
 
-      if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) {
+      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
          /*
           * for stencil simply fix up offsets - could in fact change
           * base_ptr instead even outside the shader.
@@ -487,14 +492,14 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
          unsigned mask = (1 << 8) - 1;
          LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
          offset = LLVMBuildAdd(builder, offset, s_offset, "");
-         packed = lp_build_gather(gallivm, type.length, 32, type.width,
+         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                   aligned, base_ptr, offset, FALSE);
          packed = LLVMBuildAnd(builder, packed,
                                lp_build_const_int_vec(gallivm, type, mask), "");
       }
       else {
-         assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
-         packed = lp_build_gather(gallivm, type.length, 32, type.width,
+         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                   aligned, base_ptr, offset, TRUE);
          packed = LLVMBuildBitCast(builder, packed,
                                    lp_build_vec_type(gallivm, type), "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index fa0e8b656bb..d6d7552986e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -491,13 +491,15 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
 {
    LLVMValueRef packed;
    LLVMValueRef rgba;
+   struct lp_type fetch_type;
 
    assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
    assert(format_desc->block.bits == 32);
    assert(format_desc->block.width == 2);
    assert(format_desc->block.height == 1);
 
-   packed = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE);
+   fetch_type = lp_type_uint(32);
+   packed = lp_build_gather(gallivm, n, 32, fetch_type, TRUE, base_ptr, offset, FALSE);
 
    (void)j;
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
index 1f7ba927bc4..7654ba01d88 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -28,6 +28,7 @@
 
 #include "util/u_debug.h"
 #include "util/u_cpu_detect.h"
+#include "util/u_math.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_const.h"
 #include "lp_bld_format.h"
@@ -36,6 +37,7 @@
 #include "lp_bld_type.h"
 #include "lp_bld_init.h"
 #include "lp_bld_intr.h"
+#include "lp_bld_pack.h"
 
 
 /**
@@ -114,14 +116,29 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
     * translation of offsets to first_elem in sampler_views it actually seems
     * gallium could not do anything else except 16 no matter what...
     */
-  if (!aligned) {
+   if (!aligned) {
       LLVMSetAlignment(res, 1);
+   } else if (!util_is_power_of_two(src_width)) {
+      /*
+       * Full alignment is impossible, assume the caller really meant
+       * the individual elements were aligned (e.g. 3x32bit format).
+       * And yes the generated code may otherwise crash, llvm will
+       * really assume 128bit alignment with a 96bit fetch (I suppose
+       * that makes sense as it can just assume the upper 32bit to be
+       * whatever).
+       * Maybe the caller should be able to explicitly set this, but
+       * this should cover all the 3-channel formats.
+       */
+      if (((src_width / 24) * 24 == src_width) &&
+           util_is_power_of_two(src_width / 24)) {
+          LLVMSetAlignment(res, src_width / 24);
+      } else {
+         LLVMSetAlignment(res, 1);
+      }
    }
 
    assert(src_width <= dst_width);
-   if (src_width > dst_width) {
-      res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, "");
-   } else if (src_width < dst_width) {
+   if (src_width < dst_width) {
       res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
       if (vector_justify) {
 #ifdef PIPE_ARCH_BIG_ENDIAN
@@ -135,28 +152,134 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
 }
 
 
+/**
+ * Gather one element from scatter positions in memory.
+ * Nearly the same as above, however the individual elements
+ * may be vectors themselves, and fetches may be float type.
+ * Can also do pad vector instead of ZExt.
+ *
+ * @sa lp_build_gather()
+ */
+static LLVMValueRef
+lp_build_gather_elem_vec(struct gallivm_state *gallivm,
+                         unsigned length,
+                         unsigned src_width,
+                         LLVMTypeRef src_type,
+                         struct lp_type dst_type,
+                         boolean aligned,
+                         LLVMValueRef base_ptr,
+                         LLVMValueRef offsets,
+                         unsigned i,
+                         boolean vector_justify)
+{
+   LLVMValueRef ptr, res;
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
+
+   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
+   ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
+   res = LLVMBuildLoad(gallivm->builder, ptr, "");
+
+   /* XXX
+    * On some archs we probably really want to avoid having to deal
+    * with alignments lower than 4 bytes (if fetch size is a power of
+    * two >= 32). On x86 it doesn't matter, however.
+    * We should be able to guarantee full alignment for any kind of texture
+    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
+    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
+    * but I don't think that's quite what we wanted).
+    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
+    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
+    * enforcing what we want (which is what d3d10 does, the offset needs to
+    * be aligned to element size, but GL has bytes regardless of element
+    * size which would only leave us with minimum alignment restriction of 16
+    * which doesn't make much sense if the type isn't 4x32bit). Due to
+    * translation of offsets to first_elem in sampler_views it actually seems
+    * gallium could not do anything else except 16 no matter what...
+    */
+   if (!aligned) {
+      LLVMSetAlignment(res, 1);
+   } else if (!util_is_power_of_two(src_width)) {
+      /*
+       * Full alignment is impossible, assume the caller really meant
+       * the individual elements were aligned (e.g. 3x32bit format).
+       * And yes the generated code may otherwise crash, llvm will
+       * really assume 128bit alignment with a 96bit fetch (I suppose
+       * that makes sense as it can just assume the upper 32bit to be
+       * whatever).
+       * Maybe the caller should be able to explicitly set this, but
+       * this should cover all the 3-channel formats.
+       */
+      if (((src_width / 24) * 24 == src_width) &&
+           util_is_power_of_two(src_width / 24)) {
+          LLVMSetAlignment(res, src_width / 24);
+      } else {
+         LLVMSetAlignment(res, 1);
+      }
+   }
+
+   assert(src_width <= dst_type.width * dst_type.length);
+   if (src_width < dst_type.width * dst_type.length) {
+      if (dst_type.length > 1) {
+         res = lp_build_pad_vector(gallivm, res, dst_type.length);
+         /*
+          * vector_justify hopefully a non-issue since we only deal
+          * with src_width >= 32 here?
+          */
+      } else {
+         /*
+          * Only valid if src_ptr_type is int type...
+          */
+         res = LLVMBuildZExt(gallivm->builder, res,
+                             lp_build_vec_type(gallivm, dst_type), "");
+         if (vector_justify) {
+#ifdef PIPE_ARCH_BIG_ENDIAN
+         res = LLVMBuildShl(gallivm->builder, res,
+                            LLVMConstInt(dst_elem_type,
+                                         dst_type.width - src_width, 0), "");
+#endif
+         }
+      }
+   }
+   return res;
+}
+
+
+
+
 static LLVMValueRef
 lp_build_gather_avx2(struct gallivm_state *gallivm,
                      unsigned length,
                      unsigned src_width,
-                     unsigned dst_width,
+                     struct lp_type dst_type,
                      LLVMValueRef base_ptr,
                      LLVMValueRef offsets)
 {
    LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef dst_type = LLVMIntTypeInContext(gallivm->context, dst_width);
-   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_type, length);
-   LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
-   LLVMTypeRef src_vec_type = LLVMVectorType(src_type, length);
+   LLVMTypeRef src_type, src_vec_type;
    LLVMValueRef res;
+   struct lp_type res_type = dst_type;
+   res_type.length *= length;
 
+   if (dst_type.floating) {
+      src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
+                                   LLVMFloatTypeInContext(gallivm->context);
+   } else {
+      src_type = LLVMIntTypeInContext(gallivm->context, src_width);
+   }
+   src_vec_type = LLVMVectorType(src_type, length);
+
+   /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
    assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
 
    if (0) {
       /*
        * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
-       * will not use the AVX2 gather instrinsics.  See
+       * will not use the AVX2 gather instrinsics (even with llvm 4.0), at
+       * least with Haswell. See
        * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
+       * And the generated code doing the emulation is quite a bit worse
+       * than what we get by doing it ourselves too.
        */
       LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
       LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
@@ -176,7 +299,8 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
       src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
 
       char intrinsic[64];
-      util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%ui%u", length, src_width);
+      util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
+                    length, dst_type.floating ? "f" : "i", src_width);
       LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
       LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
       LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
@@ -185,27 +309,36 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
 
       res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
    } else {
-      assert(src_width == 32);
-
       LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
-
-      /*
-       * We should get the caller to give more type information so we can use
-       * the intrinsics for the right int/float domain.  Int should be the most
-       * common.
-       */
       const char *intrinsic = NULL;
-      switch (length) {
-      case 4:
-         intrinsic = "llvm.x86.avx2.gather.d.d";
-         break;
-      case 8:
-         intrinsic = "llvm.x86.avx2.gather.d.d.256";
-         break;
-      default:
-         assert(0);
+      unsigned l_idx = 0;
+
+      assert(src_width == 32 || src_width == 64);
+      if (src_width == 32) {
+         assert(length == 4 || length == 8);
+      } else {
+         assert(length == 2 || length == 4);
       }
 
+      static const char *intrinsics[2][2][2] = {
+
+         {{"llvm.x86.avx2.gather.d.d",
+           "llvm.x86.avx2.gather.d.d.256"},
+          {"llvm.x86.avx2.gather.d.q",
+           "llvm.x86.avx2.gather.d.q.256"}},
+
+         {{"llvm.x86.avx2.gather.d.ps",
+           "llvm.x86.avx2.gather.d.ps.256"},
+          {"llvm.x86.avx2.gather.d.pd",
+           "llvm.x86.avx2.gather.d.pd.256"}},
+      };
+
+      if ((src_width == 32 && length == 8) ||
+          (src_width == 64 && length == 4)) {
+         l_idx = 1;
+      }
+      intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
+
       LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
       LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
       mask = LLVMConstBitCast(mask, src_vec_type);
@@ -215,12 +348,7 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
 
       res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
    }
-
-   if (src_width > dst_width) {
-      res = LLVMBuildTrunc(builder, res, dst_vec_type, "");
-   } else if (src_width < dst_width) {
-      res = LLVMBuildZExt(builder, res, dst_vec_type, "");
-   }
+   res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
 
    return res;
 }
@@ -241,9 +369,11 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
  *
  * @param length length of the offsets
  * @param src_width src element width in bits
- * @param dst_width result element width in bits (src will be expanded to fit)
+ * @param dst_type result element type (src will be expanded to fit,
+ *        but truncation is not allowed)
+ *        (this may be a vector, must be pot sized)
  * @param aligned whether the data is guaranteed to be aligned (to src_width)
- * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param base_ptr base pointer, needs to be a i8 pointer type.
  * @param offsets vector with offsets
  * @param vector_justify select vector rather than integer justification
  */
@@ -251,41 +381,121 @@ LLVMValueRef
 lp_build_gather(struct gallivm_state *gallivm,
                 unsigned length,
                 unsigned src_width,
-                unsigned dst_width,
+                struct lp_type dst_type,
                 boolean aligned,
                 LLVMValueRef base_ptr,
                 LLVMValueRef offsets,
                 boolean vector_justify)
 {
    LLVMValueRef res;
+   boolean need_expansion = src_width < dst_type.width * dst_type.length;
+   boolean vec_fetch;
+   struct lp_type fetch_type, fetch_dst_type;
+   LLVMTypeRef src_type;
+
+   assert(src_width <= dst_type.width * dst_type.length);
+
+   /*
+    * This is quite a mess...
+    * Figure out if the fetch should be done as:
+    * a) scalar or vector
+    * b) float or int
+    *
+    * As an example, for a 96bit fetch expanded into 4x32bit, it is better
+    * to use (3x32bit) vector type (then pad the vector). Otherwise, the
+    * zext will cause extra instructions.
+    * However, the same isn't true for 3x16bit (the codegen for that is
+    * completely worthless on x86 simd, and for 3x8bit is is way worse
+    * still, don't try that... (To get really good code out of llvm for
+    * these cases, the only way is to decompose the fetches manually
+    * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
+    * case requires sse41, otherwise simple scalar zext is way better.
+    * But probably not important enough, so don't bother.)
+    * Also, we try to honor the floating bit of destination (but isn't
+    * possible if caller asks for instance for 2x32bit dst_type with
+    * 48bit fetch - the idea would be to use 3x16bit fetch, pad and
+    * cast to 2x32f type, so the fetch is always int and on top of that
+    * we avoid the vec pad and use scalar zext due the above mentioned
+    * issue).
+    * Note this is optimized for x86 sse2 and up backend. Could be tweaked
+    * for other archs if necessary...
+    */
+   if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
+       (dst_type.length > 1)) {
+      /* use vector fetch (if dst_type is vector) */
+      vec_fetch = TRUE;
+      if (dst_type.floating) {
+         fetch_type = lp_type_float_vec(dst_type.width, src_width);
+      } else {
+         fetch_type = lp_type_int_vec(dst_type.width, src_width);
+      }
+      /* intentionally not using lp_build_vec_type here */
+      src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
+                                fetch_type.length);
+      fetch_dst_type = fetch_type;
+      fetch_dst_type.length = dst_type.length;
+    } else {
+      /* use scalar fetch */
+      vec_fetch = FALSE;
+      if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
+         fetch_type = lp_type_float(src_width);
+      } else {
+         fetch_type = lp_type_int(src_width);
+      }
+      src_type = lp_build_vec_type(gallivm, fetch_type);
+      fetch_dst_type = fetch_type;
+      fetch_dst_type.width = dst_type.width * dst_type.length;
+   }
 
    if (length == 1) {
       /* Scalar */
-      return lp_build_gather_elem(gallivm, length,
-                                  src_width, dst_width, aligned,
-                                  base_ptr, offsets, 0, vector_justify);
-   } else if (util_cpu_caps.has_avx2 && src_width == 32 && (length == 4 || length == 8)) {
-      return lp_build_gather_avx2(gallivm, length, src_width, dst_width, base_ptr, offsets);
+      res = lp_build_gather_elem_vec(gallivm, length,
+                                     src_width, src_type, fetch_dst_type,
+                                     aligned, base_ptr, offsets, 0,
+                                     vector_justify);
+      return LLVMBuildBitCast(gallivm->builder, res,
+                              lp_build_vec_type(gallivm, dst_type), "");
+      /*
+       * Excluding expansion from these paths because if you need it for
+       * 32bit/64bit fetches you're doing it wrong (this is gather, not
+       * conversion) and it would be awkward for floats.
+       */
+   } else if (util_cpu_caps.has_avx2 && !need_expansion &&
+              src_width == 32 && (length == 4 || length == 8)) {
+      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+                                  base_ptr, offsets);
+   /*
+    * This looks bad on paper wrt throughtput/latency on Haswell.
+    * Even on Broadwell it doesn't look stellar.
+    * Albeit no measurements were done (but tested to work).
+    * Should definitely enable on Skylake.
+    * (In general, should be more of a win if the fetch is 256bit wide -
+    * this is true for the 32bit case above too.)
+    */
+   } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
+              src_width == 64 && (length == 2 || length == 4)) {
+      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+                                  base_ptr, offsets);
    } else {
       /* Vector */
 
-      LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
-      LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
-      LLVMTypeRef gather_vec_type = dst_vec_type;
+      LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
       unsigned i;
       boolean vec_zext = FALSE;
-      unsigned gather_width = dst_width;
+      struct lp_type res_type, gather_res_type;
+      LLVMTypeRef res_t, gather_res_t;
 
+      res_type = fetch_dst_type;
+      res_type.length *= length;
+      gather_res_type = res_type;
 
-      if (src_width == 16 && dst_width == 32) {
-         LLVMTypeRef g_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width / 2);
-         gather_vec_type = LLVMVectorType(g_elem_type, length);
+      if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
          /*
           * Note that llvm is never able to optimize zext/insert combos
           * directly (i.e. zero the simd reg, then place the elements into
-          * the appropriate place directly). And 16->32bit zext simd loads
+          * the appropriate place directly). (I think this has to do with
+          * scalar/vector transition.) And scalar 16->32bit zext simd loads
           * aren't possible (instead loading to scalar reg first).
-          * (I think this has to do with scalar/vector transition.)
           * No idea about other archs...
           * We could do this manually, but instead we just use a vector
           * zext, which is simple enough (and, in fact, llvm might optimize
@@ -293,30 +503,53 @@ lp_build_gather(struct gallivm_state *gallivm,
           * (We're not trying that with other bit widths as that might not be
           * easier, in particular with 8 bit values at least with only sse2.)
           */
+         assert(vec_fetch == FALSE);
+         gather_res_type.width /= 2;
+         fetch_dst_type = fetch_type;
+         src_type = lp_build_vec_type(gallivm, fetch_type);
          vec_zext = TRUE;
-         gather_width = 16;
       }
-      res = LLVMGetUndef(gather_vec_type);
+      res_t = lp_build_vec_type(gallivm, res_type);
+      gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
+      res = LLVMGetUndef(gather_res_t);
       for (i = 0; i < length; ++i) {
          LLVMValueRef index = lp_build_const_int32(gallivm, i);
-         LLVMValueRef elem;
-         elem = lp_build_gather_elem(gallivm, length,
-                                     src_width, gather_width, aligned,
-                                     base_ptr, offsets, i, vector_justify);
-         res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
+         elems[i] = lp_build_gather_elem_vec(gallivm, length,
+                                             src_width, src_type, fetch_dst_type,
+                                             aligned, base_ptr, offsets, i,
+                                             vector_justify);
+         if (!vec_fetch) {
+            res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
+         }
       }
       if (vec_zext) {
-         res = LLVMBuildZExt(gallivm->builder, res, dst_vec_type, "");
+         res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
          if (vector_justify) {
 #if PIPE_ARCH_BIG_ENDIAN
-            struct lp_type dst_type;
-            unsigned sv = dst_width - src_width;
-            dst_type = lp_type_uint_vec(dst_width, dst_width * length);
+            unsigned sv = dst_type.width - src_width;
             res = LLVMBuildShl(gallivm->builder, res,
-                               lp_build_const_int_vec(gallivm, dst_type, sv), "");
+                               lp_build_const_int_vec(gallivm, res_type, sv), "");
 #endif
          }
       }
+      if (vec_fetch) {
+         /*
+          * Do bitcast now otherwise llvm might get some funny ideas wrt
+          * float/int types...
+          */
+         for (i = 0; i < length; i++) {
+            elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
+                                        lp_build_vec_type(gallivm, dst_type), "");
+         }
+         res = lp_build_concat(gallivm, elems, dst_type, length);
+      } else {
+         struct lp_type really_final_type = dst_type;
+         assert(res_type.length * res_type.width ==
+                dst_type.length * dst_type.width * length);
+         really_final_type.length *= length;
+         res = LLVMBuildBitCast(gallivm->builder, res,
+                                lp_build_vec_type(gallivm, really_final_type), "");
+      }
    }
 
    return res;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
index 3ede4763a70..7930864e611 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
@@ -55,7 +55,7 @@ LLVMValueRef
 lp_build_gather(struct gallivm_state *gallivm,
                 unsigned length,
                 unsigned src_width,
-                unsigned dst_width,
+                struct lp_type dst_type,
                 boolean aligned,
                 LLVMValueRef base_ptr,
                 LLVMValueRef offsets,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index f91b761dc11..c46749dbac8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -579,10 +579,12 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
    LLVMValueRef rgba8;
    struct lp_build_context u8n;
    LLVMTypeRef u8n_vec_type;
+   struct lp_type fetch_type;
 
    lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
    u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
 
+   fetch_type = lp_type_uint(bld->texel_type.width);
    if (util_format_is_rgba8_variant(bld->format_desc)) {
       /*
        * Given the format is a rgba8, just read the pixels as is,
@@ -591,7 +593,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
       rgba8 = lp_build_gather(bld->gallivm,
                               bld->texel_type.length,
                               bld->format_desc->block.bits,
-                              bld->texel_type.width,
+                              fetch_type,
                               TRUE,
                               data_ptr, offset, TRUE);
 
@@ -925,14 +927,16 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
             LLVMValueRef rgba8;
 
             if (util_format_is_rgba8_variant(bld->format_desc)) {
+               struct lp_type fetch_type;
                /*
                 * Given the format is a rgba8, just read the pixels as is,
                 * without any swizzling. Swizzling will be done later.
                 */
+               fetch_type = lp_type_uint(bld->texel_type.width);
                rgba8 = lp_build_gather(bld->gallivm,
                                        bld->texel_type.length,
                                        bld->format_desc->block.bits,
-                                       bld->texel_type.width,
+                                       fetch_type,
                                        TRUE,
                                        data_ptr, offset[k][j][i], TRUE);