Diffstat (limited to 'src/gallium/auxiliary/gallivm')
 src/gallium/auxiliary/gallivm/lp_bld_format_aos.c       |   5 +-
 src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c |   9 +-
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c       |  23 +-
 src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c       |   4 +-
 src/gallium/auxiliary/gallivm/lp_bld_gather.c           | 359 ++++++++++-----
 src/gallium/auxiliary/gallivm/lp_bld_gather.h           |   2 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c       |   8 +-
 7 files changed, 332 insertions(+), 78 deletions(-)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 9f6b9e9fb6f..322e7b817db 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -224,6 +224,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
/* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
* into masked = {X, Y, Z, W}
*/
+ /* Note: we cannot do this shift on x86 natively until AVX2. */
shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
@@ -394,6 +395,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
util_is_power_of_two(format_desc->block.bits)) {
LLVMValueRef packed;
LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
+ struct lp_type fetch_type;
unsigned vec_len = type.width * type.length;
/*
@@ -401,8 +403,9 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
* scaling or converting.
*/
+ fetch_type = lp_type_uint(type.width*4);
packed = lp_build_gather(gallivm, type.length/4,
- format_desc->block.bits, type.width*4,
+ format_desc->block.bits, fetch_type,
aligned, base_ptr, offset, TRUE);
assert(format_desc->block.bits <= vec_len);
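
The call-site pattern above repeats throughout this patch: the plain dst_width integer is replaced by a struct lp_type, so lp_build_gather knows the int/float domain (and possibly vector shape) of the destination. A minimal before/after sketch of such a call site, with variables exactly as in the surrounding code:

   /* before: destination described only by its width in bits */
   packed = lp_build_gather(gallivm, type.length / 4,
                            format_desc->block.bits, type.width * 4,
                            aligned, base_ptr, offset, TRUE);

   /* after: destination described by a full lp_type */
   struct lp_type fetch_type = lp_type_uint(type.width * 4);
   packed = lp_build_gather(gallivm, type.length / 4,
                            format_desc->block.bits, fetch_type,
                            aligned, base_ptr, offset, TRUE);
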
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
index 8cad3a6fc65..636a4a6238f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -70,7 +70,14 @@ lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
src_vec_type = lp_build_vec_type(gallivm, src_type);
- /* Read whole vector from memory, unaligned */
+ /*
+ * Read whole vector from memory, unaligned.
+ * XXX: Note it's actually aligned to element type. Not sure if all
+ * callers are able to guarantee that (whereas for others, we should
+ * be able to use full alignment when there are 2 or 4 channels).
+ * (If all callers can guarantee element type alignment, we should
+ * relax alignment restrictions elsewhere.)
+ */
ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
res = LLVMBuildLoad(builder, ptr, "");
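
The load the new comment refers to is the plain vector load above. A hedged sketch of how element-type alignment could be stated explicitly, which the current code does not yet do (LLVMSetAlignment on the load instruction is the standard LLVM-C call; src_type.width is assumed to be the element width in bits):

   ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
   ptr = LLVMBuildPointerCast(builder, ptr,
                              LLVMPointerType(src_vec_type, 0), "");
   res = LLVMBuildLoad(builder, ptr, "");
   /* only element alignment is guaranteed, not full vector alignment */
   LLVMSetAlignment(res, src_type.width / 8);
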
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index a48d71f0903..b3bc15552c5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -405,6 +405,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
LLVMValueRef rgba_out[4])
{
LLVMBuilderRef builder = gallivm->builder;
+ enum pipe_format format = format_desc->format;
+ struct lp_type fetch_type;
if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
@@ -430,10 +432,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
* Ex: packed = {XYZW, XYZW, XYZW, XYZW}
*/
assert(format_desc->block.bits <= type.width);
+ fetch_type = lp_type_uint(type.width);
packed = lp_build_gather(gallivm,
type.length,
format_desc->block.bits,
- type.width,
+ fetch_type,
aligned,
base_ptr, offset, FALSE);
@@ -447,22 +450,23 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
return;
}
- if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
- format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+ if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+ format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
/*
* similar conceptually to above but requiring special
* AoS packed -> SoA float conversion code.
*/
LLVMValueRef packed;
+ struct lp_type fetch_type = lp_type_uint(type.width);
assert(type.floating);
assert(type.width == 32);
packed = lp_build_gather(gallivm, type.length,
format_desc->block.bits,
- type.width, aligned,
+ fetch_type, aligned,
base_ptr, offset, FALSE);
- if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
}
else {
@@ -478,8 +482,9 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
* 32bit (or 8bit) from each block.
*/
LLVMValueRef packed;
+ struct lp_type fetch_type = lp_type_uint(type.width);
- if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) {
+ if (format == PIPE_FORMAT_X32_S8X24_UINT) {
/*
* for stencil simply fix up offsets - could in fact change
* base_ptr instead even outside the shader.
@@ -487,14 +492,14 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
unsigned mask = (1 << 8) - 1;
LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
offset = LLVMBuildAdd(builder, offset, s_offset, "");
- packed = lp_build_gather(gallivm, type.length, 32, type.width,
+ packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
aligned, base_ptr, offset, FALSE);
packed = LLVMBuildAnd(builder, packed,
lp_build_const_int_vec(gallivm, type, mask), "");
}
else {
- assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
- packed = lp_build_gather(gallivm, type.length, 32, type.width,
+ assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+ packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
aligned, base_ptr, offset, TRUE);
packed = LLVMBuildBitCast(builder, packed,
lp_build_vec_type(gallivm, type), "");
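
The stencil fixup above relies on the memory layout of Z32_FLOAT_S8X24: each texel occupies two 32-bit words, the float Z first, the 8-bit stencil value in the low byte of the second word. A hypothetical scalar pendant of one gather lane (fetch_stencil is illustrative only, not a Mesa function):

   #include <stdint.h>
   #include <string.h>

   static uint32_t fetch_stencil(const uint8_t *base, uint32_t offset)
   {
      uint32_t word;
      offset += 4;                       /* skip the 32-bit Z word */
      memcpy(&word, base + offset, 4);   /* the 32-bit gather */
      return word & ((1u << 8) - 1);     /* mask off the X24 padding */
   }
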
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index fa0e8b656bb..d6d7552986e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -491,13 +491,15 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
{
LLVMValueRef packed;
LLVMValueRef rgba;
+ struct lp_type fetch_type;
assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
assert(format_desc->block.bits == 32);
assert(format_desc->block.width == 2);
assert(format_desc->block.height == 1);
- packed = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE);
+ fetch_type = lp_type_uint(32);
+ packed = lp_build_gather(gallivm, n, 32, fetch_type, TRUE, base_ptr, offset, FALSE);
(void)j;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
index 1f7ba927bc4..7654ba01d88 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -28,6 +28,7 @@
#include "util/u_debug.h"
#include "util/u_cpu_detect.h"
+#include "util/u_math.h"
#include "lp_bld_debug.h"
#include "lp_bld_const.h"
#include "lp_bld_format.h"
@@ -36,6 +37,7 @@
#include "lp_bld_type.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
+#include "lp_bld_pack.h"
/**
@@ -114,14 +116,29 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
* translation of offsets to first_elem in sampler_views it actually seems
* gallium could not do anything else except 16 no matter what...
*/
- if (!aligned) {
+ if (!aligned) {
LLVMSetAlignment(res, 1);
+ } else if (!util_is_power_of_two(src_width)) {
+ /*
+ * Full alignment is impossible, assume the caller really meant
+ * the individual elements were aligned (e.g. 3x32bit format).
+ * And yes, the generated code may otherwise crash: llvm will
+ * really assume 128bit alignment for a 96bit fetch (I suppose
+ * that makes sense, as it can just assume the upper 32 bits to
+ * be whatever).
+ * Maybe the caller should be able to explicitly set this, but
+ * this should cover all the 3-channel formats.
+ */
+ if (((src_width / 24) * 24 == src_width) &&
+ util_is_power_of_two(src_width / 24)) {
+ LLVMSetAlignment(res, src_width / 24);
+ } else {
+ LLVMSetAlignment(res, 1);
+ }
}
assert(src_width <= dst_width);
- if (src_width > dst_width) {
- res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, "");
- } else if (src_width < dst_width) {
+ if (src_width < dst_width) {
res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
if (vector_justify) {
#ifdef PIPE_ARCH_BIG_ENDIAN
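
The src_width / 24 expression in the hunk above is shorthand for "element size in bytes of a 3-channel format": such a format has src_width == 3 * N for element width N, hence src_width / 24 == N / 8. A small illustrative helper restating the logic (npot_fetch_alignment is hypothetical):

   /* 96bit (3x32) -> 4 bytes, 48bit (3x16) -> 2, 24bit (3x8) -> 1 */
   static unsigned npot_fetch_alignment(unsigned src_width)
   {
      if ((src_width / 24) * 24 == src_width &&
          util_is_power_of_two(src_width / 24))
         return src_width / 24;
      return 1;   /* anything else: assume plain byte alignment */
   }
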
@@ -135,28 +152,134 @@ lp_build_gather_elem(struct gallivm_state *gallivm,
}
+/**
+ * Gather one element from scattered positions in memory.
+ * Nearly the same as above, however the individual elements
+ * may be vectors themselves, and fetches may be of float type.
+ * Can also pad the vector instead of using ZExt.
+ *
+ * @sa lp_build_gather()
+ */
+static LLVMValueRef
+lp_build_gather_elem_vec(struct gallivm_state *gallivm,
+ unsigned length,
+ unsigned src_width,
+ LLVMTypeRef src_type,
+ struct lp_type dst_type,
+ boolean aligned,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offsets,
+ unsigned i,
+ boolean vector_justify)
+{
+ LLVMValueRef ptr, res;
+ LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+ assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
+
+ ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
+ ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
+ res = LLVMBuildLoad(gallivm->builder, ptr, "");
+
+ /* XXX
+ * On some archs we probably really want to avoid having to deal
+ * with alignments lower than 4 bytes (if fetch size is a power of
+ * two >= 32). On x86 it doesn't matter, however.
+ * We should be able to guarantee full alignment for any kind of texture
+ * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
+ * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
+ * but I don't think that's quite what we wanted).
+ * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
+ * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
+ * enforcing what we want (which is what d3d10 does, the offset needs to
+ * be aligned to element size, but GL has bytes regardless of element
+ * size which would only leave us with minimum alignment restriction of 16
+ * which doesn't make much sense if the type isn't 4x32bit). Due to
+ * translation of offsets to first_elem in sampler_views it actually seems
+ * gallium could not do anything else except 16 no matter what...
+ */
+ if (!aligned) {
+ LLVMSetAlignment(res, 1);
+ } else if (!util_is_power_of_two(src_width)) {
+ /*
+ * Full alignment is impossible, assume the caller really meant
+ * the individual elements were aligned (e.g. 3x32bit format).
+ * And yes, the generated code may otherwise crash: llvm will
+ * really assume 128bit alignment for a 96bit fetch (I suppose
+ * that makes sense, as it can just assume the upper 32 bits to
+ * be whatever).
+ * Maybe the caller should be able to explicitly set this, but
+ * this should cover all the 3-channel formats.
+ */
+ if (((src_width / 24) * 24 == src_width) &&
+ util_is_power_of_two(src_width / 24)) {
+ LLVMSetAlignment(res, src_width / 24);
+ } else {
+ LLVMSetAlignment(res, 1);
+ }
+ }
+
+ assert(src_width <= dst_type.width * dst_type.length);
+ if (src_width < dst_type.width * dst_type.length) {
+ if (dst_type.length > 1) {
+ res = lp_build_pad_vector(gallivm, res, dst_type.length);
+ /*
+ * vector_justify is hopefully a non-issue since we only deal
+ * with src_width >= 32 here?
+ */
+ } else {
+ /*
+ * Only valid if src_type is an int type...
+ */
+ res = LLVMBuildZExt(gallivm->builder, res,
+ lp_build_vec_type(gallivm, dst_type), "");
+ if (vector_justify) {
+#ifdef PIPE_ARCH_BIG_ENDIAN
+ res = LLVMBuildShl(gallivm->builder, res,
+ LLVMConstInt(lp_build_vec_type(gallivm, dst_type),
+ dst_type.width - src_width, 0), "");
+#endif
+ }
+ }
+ }
+ return res;
+}
+
+
static LLVMValueRef
lp_build_gather_avx2(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
- unsigned dst_width,
+ struct lp_type dst_type,
LLVMValueRef base_ptr,
LLVMValueRef offsets)
{
LLVMBuilderRef builder = gallivm->builder;
- LLVMTypeRef dst_type = LLVMIntTypeInContext(gallivm->context, dst_width);
- LLVMTypeRef dst_vec_type = LLVMVectorType(dst_type, length);
- LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
- LLVMTypeRef src_vec_type = LLVMVectorType(src_type, length);
+ LLVMTypeRef src_type, src_vec_type;
LLVMValueRef res;
+ struct lp_type res_type = dst_type;
+ res_type.length *= length;
+ if (dst_type.floating) {
+ src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
+ LLVMFloatTypeInContext(gallivm->context);
+ } else {
+ src_type = LLVMIntTypeInContext(gallivm->context, src_width);
+ }
+ src_vec_type = LLVMVectorType(src_type, length);
+
+ /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
if (0) {
/*
* XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
- * will not use the AVX2 gather instrinsics. See
+ * will not use the AVX2 gather intrinsics (even with llvm 4.0), at
+ * least with Haswell. See
* http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
+ * Also, the generated code doing the emulation is quite a bit worse
+ * than what we get by doing it ourselves.
*/
LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
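
The lp_build_pad_vector call in the hunk above widens e.g. a <3 x float> load to the destination length with undefined lanes instead of a zext; conceptually that is a single shufflevector. A minimal LLVM-C sketch of the idea (pad_to_4 is illustrative, not the Mesa helper itself):

   static LLVMValueRef
   pad_to_4(struct gallivm_state *gallivm, LLVMValueRef v3)
   {
      LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
      LLVMValueRef shuf[4];
      unsigned i;
      for (i = 0; i < 3; i++)
         shuf[i] = LLVMConstInt(i32, i, 0);
      shuf[3] = LLVMGetUndef(i32);   /* the pad lane stays undef */
      return LLVMBuildShuffleVector(gallivm->builder, v3,
                                    LLVMGetUndef(LLVMTypeOf(v3)),
                                    LLVMConstVector(shuf, 4), "");
   }
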
@@ -176,7 +299,8 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
char intrinsic[64];
- util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%ui%u", length, src_width);
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
+ length, dst_type.floating ? "f" : "i", src_width);
LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
@@ -185,27 +309,36 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
} else {
- assert(src_width == 32);
-
LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
-
- /*
- * We should get the caller to give more type information so we can use
- * the intrinsics for the right int/float domain. Int should be the most
- * common.
- */
const char *intrinsic = NULL;
- switch (length) {
- case 4:
- intrinsic = "llvm.x86.avx2.gather.d.d";
- break;
- case 8:
- intrinsic = "llvm.x86.avx2.gather.d.d.256";
- break;
- default:
- assert(0);
+ unsigned l_idx = 0;
+
+ assert(src_width == 32 || src_width == 64);
+ if (src_width == 32) {
+ assert(length == 4 || length == 8);
+ } else {
+ assert(length == 2 || length == 4);
}
+ static const char *intrinsics[2][2][2] = {
+
+ {{"llvm.x86.avx2.gather.d.d",
+ "llvm.x86.avx2.gather.d.d.256"},
+ {"llvm.x86.avx2.gather.d.q",
+ "llvm.x86.avx2.gather.d.q.256"}},
+
+ {{"llvm.x86.avx2.gather.d.ps",
+ "llvm.x86.avx2.gather.d.ps.256"},
+ {"llvm.x86.avx2.gather.d.pd",
+ "llvm.x86.avx2.gather.d.pd.256"}},
+ };
+
+ if ((src_width == 32 && length == 8) ||
+ (src_width == 64 && length == 4)) {
+ l_idx = 1;
+ }
+ intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
+
LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
mask = LLVMConstBitCast(mask, src_vec_type);
@@ -215,12 +348,7 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
}
-
- if (src_width > dst_width) {
- res = LLVMBuildTrunc(builder, res, dst_vec_type, "");
- } else if (src_width < dst_width) {
- res = LLVMBuildZExt(builder, res, dst_vec_type, "");
- }
+ res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
return res;
}
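
The three table indices select int vs. float domain, 32bit vs. 64bit elements, and 128bit vs. 256bit register width. An illustrative restatement of the lookup (avx2_gather_intrinsic is hypothetical):

   static const char *
   avx2_gather_intrinsic(boolean floating, unsigned src_width,
                         unsigned length)
   {
      /* 8x32bit or 4x64bit means a 256bit destination register */
      unsigned l_idx = (src_width == 32 && length == 8) ||
                       (src_width == 64 && length == 4);
      static const char *table[2][2][2] = {
         {{"llvm.x86.avx2.gather.d.d",  "llvm.x86.avx2.gather.d.d.256"},
          {"llvm.x86.avx2.gather.d.q",  "llvm.x86.avx2.gather.d.q.256"}},
         {{"llvm.x86.avx2.gather.d.ps", "llvm.x86.avx2.gather.d.ps.256"},
          {"llvm.x86.avx2.gather.d.pd", "llvm.x86.avx2.gather.d.pd.256"}},
      };
      /* e.g. floating, src_width 32, length 8 -> ...gather.d.ps.256 */
      return table[floating ? 1 : 0][src_width == 64][l_idx];
   }
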
@@ -241,9 +369,11 @@ lp_build_gather_avx2(struct gallivm_state *gallivm,
*
* @param length length of the offsets
* @param src_width src element width in bits
- * @param dst_width result element width in bits (src will be expanded to fit)
+ * @param dst_type result element type (src will be expanded to fit,
+ * but truncation is not allowed)
+ * (this may be a vector, which must be pot-sized)
* @param aligned whether the data is guaranteed to be aligned (to src_width)
- * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param base_ptr base pointer, needs to be an i8 pointer type.
* @param offsets vector with offsets
* @param vector_justify select vector rather than integer justification
*/
@@ -251,41 +381,121 @@ LLVMValueRef
lp_build_gather(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
- unsigned dst_width,
+ struct lp_type dst_type,
boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offsets,
boolean vector_justify)
{
LLVMValueRef res;
+ boolean need_expansion = src_width < dst_type.width * dst_type.length;
+ boolean vec_fetch;
+ struct lp_type fetch_type, fetch_dst_type;
+ LLVMTypeRef src_type;
+
+ assert(src_width <= dst_type.width * dst_type.length);
+
+ /*
+ * This is quite a mess...
+ * Figure out if the fetch should be done as:
+ * a) scalar or vector
+ * b) float or int
+ *
+ * As an example, for a 96bit fetch expanded into 4x32bit, it is better
+ * to use (3x32bit) vector type (then pad the vector). Otherwise, the
+ * zext will cause extra instructions.
+ * However, the same isn't true for 3x16bit (the codegen for that is
+ * completely worthless on x86 simd, and for 3x8bit it is way worse
+ * still, so don't try that...). (To get really good code out of llvm
+ * for these cases, the only way is to decompose the fetches manually
+ * into 1x32bit/1x16bit, or 1x16bit/1x8bit respectively, although the
+ * latter case requires sse41, otherwise simple scalar zext is way
+ * better. But it's probably not important enough, so don't bother.)
+ * Also, we try to honor the floating bit of the destination (but that
+ * isn't possible if the caller asks for instance for a 2x32bit
+ * dst_type with a 48bit fetch - the idea would be to use a 3x16bit
+ * fetch, pad and cast to a 2x32f type, so the fetch is always int and
+ * on top of that we avoid the vec pad and use scalar zext due to the
+ * above-mentioned issue).
+ * Note this is optimized for x86 sse2 and up backend. Could be tweaked
+ * for other archs if necessary...
+ */
+ if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
+ (dst_type.length > 1)) {
+ /* use vector fetch (if dst_type is vector) */
+ vec_fetch = TRUE;
+ if (dst_type.floating) {
+ fetch_type = lp_type_float_vec(dst_type.width, src_width);
+ } else {
+ fetch_type = lp_type_int_vec(dst_type.width, src_width);
+ }
+ /* intentionally not using lp_build_vec_type here */
+ src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
+ fetch_type.length);
+ fetch_dst_type = fetch_type;
+ fetch_dst_type.length = dst_type.length;
+ } else {
+ /* use scalar fetch */
+ vec_fetch = FALSE;
+ if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
+ fetch_type = lp_type_float(src_width);
+ } else {
+ fetch_type = lp_type_int(src_width);
+ }
+ src_type = lp_build_vec_type(gallivm, fetch_type);
+ fetch_dst_type = fetch_type;
+ fetch_dst_type.width = dst_type.width * dst_type.length;
+ }
if (length == 1) {
/* Scalar */
- return lp_build_gather_elem(gallivm, length,
- src_width, dst_width, aligned,
- base_ptr, offsets, 0, vector_justify);
- } else if (util_cpu_caps.has_avx2 && src_width == 32 && (length == 4 || length == 8)) {
- return lp_build_gather_avx2(gallivm, length, src_width, dst_width, base_ptr, offsets);
+ res = lp_build_gather_elem_vec(gallivm, length,
+ src_width, src_type, fetch_dst_type,
+ aligned, base_ptr, offsets, 0,
+ vector_justify);
+ return LLVMBuildBitCast(gallivm->builder, res,
+ lp_build_vec_type(gallivm, dst_type), "");
+ /*
+ * Excluding expansion from these paths because if you need it for
+ * 32bit/64bit fetches you're doing it wrong (this is gather, not
+ * conversion) and it would be awkward for floats.
+ */
+ } else if (util_cpu_caps.has_avx2 && !need_expansion &&
+ src_width == 32 && (length == 4 || length == 8)) {
+ return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+ base_ptr, offsets);
+ /*
+ * This looks bad on paper wrt throughput/latency on Haswell.
+ * Even on Broadwell it doesn't look stellar.
+ * (No measurements were done, albeit it was tested to work.)
+ * It should definitely be enabled on Skylake.
+ * (In general, should be more of a win if the fetch is 256bit wide -
+ * this is true for the 32bit case above too.)
+ */
+ } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
+ src_width == 64 && (length == 2 || length == 4)) {
+ return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+ base_ptr, offsets);
} else {
/* Vector */
- LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
- LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
- LLVMTypeRef gather_vec_type = dst_vec_type;
+ LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
unsigned i;
boolean vec_zext = FALSE;
- unsigned gather_width = dst_width;
+ struct lp_type res_type, gather_res_type;
+ LLVMTypeRef res_t, gather_res_t;
+ res_type = fetch_dst_type;
+ res_type.length *= length;
+ gather_res_type = res_type;
- if (src_width == 16 && dst_width == 32) {
- LLVMTypeRef g_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width / 2);
- gather_vec_type = LLVMVectorType(g_elem_type, length);
+ if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
/*
* Note that llvm is never able to optimize zext/insert combos
* directly (i.e. zero the simd reg, then place the elements into
- * the appropriate place directly). And 16->32bit zext simd loads
+ * the appropriate place directly). (I think this has to do with
+ * scalar/vector transition.) And scalar 16->32bit zext simd loads
* aren't possible (instead loading to scalar reg first).
- * (I think this has to do with scalar/vector transition.)
* No idea about other archs...
* We could do this manually, but instead we just use a vector
* zext, which is simple enough (and, in fact, llvm might optimize
@@ -293,30 +503,53 @@ lp_build_gather(struct gallivm_state *gallivm,
* (We're not trying that with other bit widths as that might not be
* easier, in particular with 8 bit values at least with only sse2.)
*/
+ assert(vec_fetch == FALSE);
+ gather_res_type.width /= 2;
+ fetch_dst_type = fetch_type;
+ src_type = lp_build_vec_type(gallivm, fetch_type);
vec_zext = TRUE;
- gather_width = 16;
}
- res = LLVMGetUndef(gather_vec_type);
+ res_t = lp_build_vec_type(gallivm, res_type);
+ gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
+ res = LLVMGetUndef(gather_res_t);
for (i = 0; i < length; ++i) {
LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef elem;
- elem = lp_build_gather_elem(gallivm, length,
- src_width, gather_width, aligned,
- base_ptr, offsets, i, vector_justify);
- res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
+ elems[i] = lp_build_gather_elem_vec(gallivm, length,
+ src_width, src_type, fetch_dst_type,
+ aligned, base_ptr, offsets, i,
+ vector_justify);
+ if (!vec_fetch) {
+ res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
+ }
}
if (vec_zext) {
- res = LLVMBuildZExt(gallivm->builder, res, dst_vec_type, "");
+ res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
if (vector_justify) {
#if PIPE_ARCH_BIG_ENDIAN
- struct lp_type dst_type;
- unsigned sv = dst_width - src_width;
- dst_type = lp_type_uint_vec(dst_width, dst_width * length);
+ unsigned sv = dst_type.width - src_width;
res = LLVMBuildShl(gallivm->builder, res,
- lp_build_const_int_vec(gallivm, dst_type, sv), "");
+ lp_build_const_int_vec(gallivm, res_type, sv), "");
#endif
}
}
+ if (vec_fetch) {
+ /*
+ * Do bitcast now otherwise llvm might get some funny ideas wrt
+ * float/int types...
+ */
+ for (i = 0; i < length; i++) {
+ elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
+ lp_build_vec_type(gallivm, dst_type), "");
+ }
+ res = lp_build_concat(gallivm, elems, dst_type, length);
+ } else {
+ struct lp_type really_final_type = dst_type;
+ assert(res_type.length * res_type.width ==
+ dst_type.length * dst_type.width * length);
+ really_final_type.length *= length;
+ res = LLVMBuildBitCast(gallivm->builder, res,
+ lp_build_vec_type(gallivm, really_final_type), "");
+ }
}
return res;
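
To summarize the new decision logic, a hedged usage sketch: gathering four texels of a hypothetical 3x32bit float format, each expanded to four channels (base_ptr and offsets assumed set up as at the other call sites in this patch):

   /* 96bit per-element fetch into 4 x f32 each: takes the vec_fetch
    * path, i.e. <3 x float> loads with 4-byte element alignment,
    * padded via lp_build_pad_vector and concatenated with
    * lp_build_concat into one 16 x f32 result. */
   struct lp_type dst_type = lp_type_float_vec(32, 128);   /* 4 x f32 */
   LLVMValueRef rgba = lp_build_gather(gallivm, 4 /* length */,
                                       96 /* src_width */, dst_type,
                                       TRUE /* aligned */,
                                       base_ptr, offsets,
                                       FALSE /* vector_justify */);
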
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
index 3ede4763a70..7930864e611 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
@@ -55,7 +55,7 @@ LLVMValueRef
lp_build_gather(struct gallivm_state *gallivm,
unsigned length,
unsigned src_width,
- unsigned dst_width,
+ struct lp_type dst_type,
boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offsets,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index f91b761dc11..c46749dbac8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -579,10 +579,12 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
LLVMValueRef rgba8;
struct lp_build_context u8n;
LLVMTypeRef u8n_vec_type;
+ struct lp_type fetch_type;
lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
+ fetch_type = lp_type_uint(bld->texel_type.width);
if (util_format_is_rgba8_variant(bld->format_desc)) {
/*
* Given the format is a rgba8, just read the pixels as is,
@@ -591,7 +593,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
rgba8 = lp_build_gather(bld->gallivm,
bld->texel_type.length,
bld->format_desc->block.bits,
- bld->texel_type.width,
+ fetch_type,
TRUE,
data_ptr, offset, TRUE);
@@ -925,14 +927,16 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
LLVMValueRef rgba8;
if (util_format_is_rgba8_variant(bld->format_desc)) {
+ struct lp_type fetch_type;
/*
* Given the format is a rgba8, just read the pixels as is,
* without any swizzling. Swizzling will be done later.
*/
+ fetch_type = lp_type_uint(bld->texel_type.width);
rgba8 = lp_build_gather(bld->gallivm,
bld->texel_type.length,
bld->format_desc->block.bits,
- bld->texel_type.width,
+ fetch_type,
TRUE,
data_ptr, offset[k][j][i], TRUE);