diff options
-rw-r--r-- | src/gallium/auxiliary/draw/draw_llvm.c | 2 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_format_aos.c | 5 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c | 9 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 23 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c | 4 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_gather.c | 359 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_gather.h | 2 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c | 8 |
8 files changed, 333 insertions, 79 deletions
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index c5485728e42..19b75a5003b 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -1864,7 +1864,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) LLVMPointerType(LLVMInt8TypeInContext(context), 0), ""); tmp = lp_build_gather(gallivm, vs_type.length, - 32, 32, TRUE, + 32, bld.type, TRUE, fetch_elts, tmp, FALSE); LLVMBuildStore(builder, tmp, index_store); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index 9f6b9e9fb6f..322e7b817db 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -224,6 +224,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW} * into masked = {X, Y, Z, W} */ + /* Note: we cannot do this shift on x86 natively until AVX2. */ shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); @@ -394,6 +395,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, util_is_power_of_two(format_desc->block.bits)) { LLVMValueRef packed; LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type); + struct lp_type fetch_type; unsigned vec_len = type.width * type.length; /* @@ -401,8 +403,9 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, * scaling or converting. 
*/ + fetch_type = lp_type_uint(type.width*4); packed = lp_build_gather(gallivm, type.length/4, - format_desc->block.bits, type.width*4, + format_desc->block.bits, fetch_type, aligned, base_ptr, offset, TRUE); assert(format_desc->block.bits <= vec_len); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c index 8cad3a6fc65..636a4a6238f 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c @@ -70,7 +70,14 @@ lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, src_vec_type = lp_build_vec_type(gallivm, src_type); - /* Read whole vector from memory, unaligned */ + /* + * Read whole vector from memory, unaligned. + * XXX: Note it's actually aligned to element type. Not sure if all + * callers are able to guarantee that (whereas for others, we should + * be able to use full alignment when there's 2 or 4 channels). + * (If all callers can guarantee element type alignment, we should + * relax alignment restrictions elsewhere.) 
+ */ ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, ""); ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), ""); res = LLVMBuildLoad(builder, ptr, ""); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index a48d71f0903..b3bc15552c5 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -405,6 +405,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, LLVMValueRef rgba_out[4]) { LLVMBuilderRef builder = gallivm->builder; + enum pipe_format format = format_desc->format; + struct lp_type fetch_type; if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || @@ -430,10 +432,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, * Ex: packed = {XYZW, XYZW, XYZW, XYZW} */ assert(format_desc->block.bits <= type.width); + fetch_type = lp_type_uint(type.width); packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, - type.width, + fetch_type, aligned, base_ptr, offset, FALSE); @@ -447,22 +450,23 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, return; } - if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT || - format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + if (format == PIPE_FORMAT_R11G11B10_FLOAT || + format == PIPE_FORMAT_R9G9B9E5_FLOAT) { /* * similar conceptually to above but requiring special * AoS packed -> SoA float conversion code. 
*/ LLVMValueRef packed; + struct lp_type fetch_type = lp_type_uint(type.width); assert(type.floating); assert(type.width == 32); packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, - type.width, aligned, + fetch_type, aligned, base_ptr, offset, FALSE); - if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) { + if (format == PIPE_FORMAT_R11G11B10_FLOAT) { lp_build_r11g11b10_to_float(gallivm, packed, rgba_out); } else { @@ -478,8 +482,9 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, * 32bit (or 8bit) from each block. */ LLVMValueRef packed; + struct lp_type fetch_type = lp_type_uint(type.width); - if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) { + if (format == PIPE_FORMAT_X32_S8X24_UINT) { /* * for stencil simply fix up offsets - could in fact change * base_ptr instead even outside the shader. @@ -487,14 +492,14 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, unsigned mask = (1 << 8) - 1; LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4); offset = LLVMBuildAdd(builder, offset, s_offset, ""); - packed = lp_build_gather(gallivm, type.length, 32, type.width, + packed = lp_build_gather(gallivm, type.length, 32, fetch_type, aligned, base_ptr, offset, FALSE); packed = LLVMBuildAnd(builder, packed, lp_build_const_int_vec(gallivm, type, mask), ""); } else { - assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); - packed = lp_build_gather(gallivm, type.length, 32, type.width, + assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); + packed = lp_build_gather(gallivm, type.length, 32, fetch_type, aligned, base_ptr, offset, TRUE); packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(gallivm, type), ""); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c index fa0e8b656bb..d6d7552986e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c @@ -491,13 +491,15 @@ 
lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm, { LLVMValueRef packed; LLVMValueRef rgba; + struct lp_type fetch_type; assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED); assert(format_desc->block.bits == 32); assert(format_desc->block.width == 2); assert(format_desc->block.height == 1); - packed = lp_build_gather(gallivm, n, 32, 32, TRUE, base_ptr, offset, FALSE); + fetch_type = lp_type_uint(32); + packed = lp_build_gather(gallivm, n, 32, fetch_type, TRUE, base_ptr, offset, FALSE); (void)j; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c index 1f7ba927bc4..7654ba01d88 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c @@ -28,6 +28,7 @@ #include "util/u_debug.h" #include "util/u_cpu_detect.h" +#include "util/u_math.h" #include "lp_bld_debug.h" #include "lp_bld_const.h" #include "lp_bld_format.h" @@ -36,6 +37,7 @@ #include "lp_bld_type.h" #include "lp_bld_init.h" #include "lp_bld_intr.h" +#include "lp_bld_pack.h" /** @@ -114,14 +116,29 @@ lp_build_gather_elem(struct gallivm_state *gallivm, * translation of offsets to first_elem in sampler_views it actually seems * gallium could not do anything else except 16 no matter what... */ - if (!aligned) { + if (!aligned) { LLVMSetAlignment(res, 1); + } else if (!util_is_power_of_two(src_width)) { + /* + * Full alignment is impossible, assume the caller really meant + * the individual elements were aligned (e.g. 3x32bit format). + * And yes the generated code may otherwise crash, llvm will + * really assume 128bit alignment with a 96bit fetch (I suppose + * that makes sense as it can just assume the upper 32bit to be + * whatever). + * Maybe the caller should be able to explicitly set this, but + * this should cover all the 3-channel formats. 
+ */ + if (((src_width / 24) * 24 == src_width) && + util_is_power_of_two(src_width / 24)) { + LLVMSetAlignment(res, src_width / 24); + } else { + LLVMSetAlignment(res, 1); + } } assert(src_width <= dst_width); - if (src_width > dst_width) { - res = LLVMBuildTrunc(gallivm->builder, res, dst_elem_type, ""); - } else if (src_width < dst_width) { + if (src_width < dst_width) { res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); if (vector_justify) { #ifdef PIPE_ARCH_BIG_ENDIAN @@ -135,28 +152,134 @@ lp_build_gather_elem(struct gallivm_state *gallivm, } +/** + * Gather one element from scatter positions in memory. + * Nearly the same as above, however the individual elements + * may be vectors themselves, and fetches may be float type. + * Can also do pad vector instead of ZExt. + * + * @sa lp_build_gather() + */ +static LLVMValueRef +lp_build_gather_elem_vec(struct gallivm_state *gallivm, + unsigned length, + unsigned src_width, + LLVMTypeRef src_type, + struct lp_type dst_type, + boolean aligned, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i, + boolean vector_justify) +{ + LLVMValueRef ptr, res; + LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); + assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); + + ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); + ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); + res = LLVMBuildLoad(gallivm->builder, ptr, ""); + + /* XXX + * On some archs we probably really want to avoid having to deal + * with alignments lower than 4 bytes (if fetch size is a power of + * two >= 32). On x86 it doesn't matter, however. + * We should be able to guarantee full alignment for any kind of texture + * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch + * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends + * but I don't think that's quite what we wanted). 
+ * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT + * looks like a good fit, but it seems this cap bit (and OpenGL) aren't + * enforcing what we want (which is what d3d10 does, the offset needs to + * be aligned to element size, but GL has bytes regardless of element + * size which would only leave us with minimum alignment restriction of 16 + * which doesn't make much sense if the type isn't 4x32bit). Due to + * translation of offsets to first_elem in sampler_views it actually seems + * gallium could not do anything else except 16 no matter what... + */ + if (!aligned) { + LLVMSetAlignment(res, 1); + } else if (!util_is_power_of_two(src_width)) { + /* + * Full alignment is impossible, assume the caller really meant + * the individual elements were aligned (e.g. 3x32bit format). + * And yes the generated code may otherwise crash, llvm will + * really assume 128bit alignment with a 96bit fetch (I suppose + * that makes sense as it can just assume the upper 32bit to be + * whatever). + * Maybe the caller should be able to explicitly set this, but + * this should cover all the 3-channel formats. + */ + if (((src_width / 24) * 24 == src_width) && + util_is_power_of_two(src_width / 24)) { + LLVMSetAlignment(res, src_width / 24); + } else { + LLVMSetAlignment(res, 1); + } + } + + assert(src_width <= dst_type.width * dst_type.length); + if (src_width < dst_type.width * dst_type.length) { + if (dst_type.length > 1) { + res = lp_build_pad_vector(gallivm, res, dst_type.length); + /* + * vector_justify hopefully a non-issue since we only deal + * with src_width >= 32 here? + */ + } else { + /* + * Only valid if src_ptr_type is int type... 
+ */ + res = LLVMBuildZExt(gallivm->builder, res, + lp_build_vec_type(gallivm, dst_type), ""); + if (vector_justify) { +#ifdef PIPE_ARCH_BIG_ENDIAN + res = LLVMBuildShl(gallivm->builder, res, + LLVMConstInt(dst_elem_type, + dst_type.width - src_width, 0), ""); +#endif + } + } + } + return res; +} + + + + static LLVMValueRef lp_build_gather_avx2(struct gallivm_state *gallivm, unsigned length, unsigned src_width, - unsigned dst_width, + struct lp_type dst_type, LLVMValueRef base_ptr, LLVMValueRef offsets) { LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef dst_type = LLVMIntTypeInContext(gallivm->context, dst_width); - LLVMTypeRef dst_vec_type = LLVMVectorType(dst_type, length); - LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width); - LLVMTypeRef src_vec_type = LLVMVectorType(src_type, length); + LLVMTypeRef src_type, src_vec_type; LLVMValueRef res; + struct lp_type res_type = dst_type; + res_type.length *= length; + if (dst_type.floating) { + src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) : + LLVMFloatTypeInContext(gallivm->context); + } else { + src_type = LLVMIntTypeInContext(gallivm->context, src_width); + } + src_vec_type = LLVMVectorType(src_type, length); + + /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */ assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); if (0) { /* * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but - * will not use the AVX2 gather instrinsics. See + * will not use the AVX2 gather instrinsics (even with llvm 4.0), at + * least with Haswell. See * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html + * And the generated code doing the emulation is quite a bit worse + * than what we get by doing it ourselves too. 
*/ LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32); LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length); @@ -176,7 +299,8 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep"); char intrinsic[64]; - util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%ui%u", length, src_width); + util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u", + length, dst_type.floating ? "f" : "i", src_width); LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0); LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type); LLVMValueRef passthru = LLVMGetUndef(src_vec_type); @@ -185,27 +309,36 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0); } else { - assert(src_width == 32); - LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8); - - /* - * We should get the caller to give more type information so we can use - * the intrinsics for the right int/float domain. Int should be the most - * common. 
- */ const char *intrinsic = NULL; - switch (length) { - case 4: - intrinsic = "llvm.x86.avx2.gather.d.d"; - break; - case 8: - intrinsic = "llvm.x86.avx2.gather.d.d.256"; - break; - default: - assert(0); + unsigned l_idx = 0; + + assert(src_width == 32 || src_width == 64); + if (src_width == 32) { + assert(length == 4 || length == 8); + } else { + assert(length == 2 || length == 4); } + static const char *intrinsics[2][2][2] = { + + {{"llvm.x86.avx2.gather.d.d", + "llvm.x86.avx2.gather.d.d.256"}, + {"llvm.x86.avx2.gather.d.q", + "llvm.x86.avx2.gather.d.q.256"}}, + + {{"llvm.x86.avx2.gather.d.ps", + "llvm.x86.avx2.gather.d.ps.256"}, + {"llvm.x86.avx2.gather.d.pd", + "llvm.x86.avx2.gather.d.pd.256"}}, + }; + + if ((src_width == 32 && length == 8) || + (src_width == 64 && length == 4)) { + l_idx = 1; + } + intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx]; + LLVMValueRef passthru = LLVMGetUndef(src_vec_type); LLVMValueRef mask = LLVMConstAllOnes(src_vec_type); mask = LLVMConstBitCast(mask, src_vec_type); @@ -215,12 +348,7 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0); } - - if (src_width > dst_width) { - res = LLVMBuildTrunc(builder, res, dst_vec_type, ""); - } else if (src_width < dst_width) { - res = LLVMBuildZExt(builder, res, dst_vec_type, ""); - } + res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), ""); return res; } @@ -241,9 +369,11 @@ lp_build_gather_avx2(struct gallivm_state *gallivm, * * @param length length of the offsets * @param src_width src element width in bits - * @param dst_width result element width in bits (src will be expanded to fit) + * @param dst_type result element type (src will be expanded to fit, + * but truncation is not allowed) + * (this may be a vector, must be pot sized) * @param aligned whether the data is guaranteed to be aligned (to src_width) - * @param base_ptr base pointer, should be a i8 pointer type. 
+ * @param base_ptr base pointer, needs to be an i8 pointer type. * @param offsets vector with offsets * @param vector_justify select vector rather than integer justification */ @@ -251,41 +381,121 @@ LLVMValueRef lp_build_gather(struct gallivm_state *gallivm, unsigned length, unsigned src_width, - unsigned dst_width, + struct lp_type dst_type, boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offsets, boolean vector_justify) { LLVMValueRef res; + boolean need_expansion = src_width < dst_type.width * dst_type.length; + boolean vec_fetch; + struct lp_type fetch_type, fetch_dst_type; + LLVMTypeRef src_type; + + assert(src_width <= dst_type.width * dst_type.length); + + /* + * This is quite a mess... + * Figure out if the fetch should be done as: + * a) scalar or vector + * b) float or int + * + * As an example, for a 96bit fetch expanded into 4x32bit, it is better + * to use (3x32bit) vector type (then pad the vector). Otherwise, the + * zext will cause extra instructions. + * However, the same isn't true for 3x16bit (the codegen for that is + * completely worthless on x86 simd, and for 3x8bit it is way worse + * still, don't try that... (To get really good code out of llvm for + * these cases, the only way is to decompose the fetches manually + * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter + * case requires sse41, otherwise simple scalar zext is way better. + * But probably not important enough, so don't bother.) + * Also, we try to honor the floating bit of destination (but isn't + * possible if caller asks for instance for 2x32bit dst_type with + * 48bit fetch - the idea would be to use 3x16bit fetch, pad and + * cast to 2x32f type, so the fetch is always int and on top of that + * we avoid the vec pad and use scalar zext due to the above mentioned + * issue). + * Note this is optimized for x86 sse2 and up backend. Could be tweaked + * for other archs if necessary... 
+ */ + if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) && + (dst_type.length > 1)) { + /* use vector fetch (if dst_type is vector) */ + vec_fetch = TRUE; + if (dst_type.floating) { + fetch_type = lp_type_float_vec(dst_type.width, src_width); + } else { + fetch_type = lp_type_int_vec(dst_type.width, src_width); + } + /* intentionally not using lp_build_vec_type here */ + src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type), + fetch_type.length); + fetch_dst_type = fetch_type; + fetch_dst_type.length = dst_type.length; + } else { + /* use scalar fetch */ + vec_fetch = FALSE; + if (dst_type.floating && ((src_width == 32) || (src_width == 64))) { + fetch_type = lp_type_float(src_width); + } else { + fetch_type = lp_type_int(src_width); + } + src_type = lp_build_vec_type(gallivm, fetch_type); + fetch_dst_type = fetch_type; + fetch_dst_type.width = dst_type.width * dst_type.length; + } if (length == 1) { /* Scalar */ - return lp_build_gather_elem(gallivm, length, - src_width, dst_width, aligned, - base_ptr, offsets, 0, vector_justify); - } else if (util_cpu_caps.has_avx2 && src_width == 32 && (length == 4 || length == 8)) { - return lp_build_gather_avx2(gallivm, length, src_width, dst_width, base_ptr, offsets); + res = lp_build_gather_elem_vec(gallivm, length, + src_width, src_type, fetch_dst_type, + aligned, base_ptr, offsets, 0, + vector_justify); + return LLVMBuildBitCast(gallivm->builder, res, + lp_build_vec_type(gallivm, dst_type), ""); + /* + * Excluding expansion from these paths because if you need it for + * 32bit/64bit fetches you're doing it wrong (this is gather, not + * conversion) and it would be awkward for floats. + */ + } else if (util_cpu_caps.has_avx2 && !need_expansion && + src_width == 32 && (length == 4 || length == 8)) { + return lp_build_gather_avx2(gallivm, length, src_width, dst_type, + base_ptr, offsets); + /* + * This looks bad on paper wrt throughput/latency on Haswell. 
+ * Even on Broadwell it doesn't look stellar. + * Albeit no measurements were done (but tested to work). + * Should definitely enable on Skylake. + * (In general, should be more of a win if the fetch is 256bit wide - + * this is true for the 32bit case above too.) + */ + } else if (0 && util_cpu_caps.has_avx2 && !need_expansion && + src_width == 64 && (length == 2 || length == 4)) { + return lp_build_gather_avx2(gallivm, length, src_width, dst_type, + base_ptr, offsets); } else { /* Vector */ - LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width); - LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length); - LLVMTypeRef gather_vec_type = dst_vec_type; + LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8]; unsigned i; boolean vec_zext = FALSE; - unsigned gather_width = dst_width; + struct lp_type res_type, gather_res_type; + LLVMTypeRef res_t, gather_res_t; + res_type = fetch_dst_type; + res_type.length *= length; + gather_res_type = res_type; - if (src_width == 16 && dst_width == 32) { - LLVMTypeRef g_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width / 2); - gather_vec_type = LLVMVectorType(g_elem_type, length); + if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) { /* * Note that llvm is never able to optimize zext/insert combos * directly (i.e. zero the simd reg, then place the elements into - * the appropriate place directly). And 16->32bit zext simd loads + * the appropriate place directly). (I think this has to do with + * scalar/vector transition.) And scalar 16->32bit zext simd loads * aren't possible (instead loading to scalar reg first). - * (I think this has to do with scalar/vector transition.) * No idea about other archs... 
* We could do this manually, but instead we just use a vector * zext, which is simple enough (and, in fact, llvm might optimize @@ -293,30 +503,53 @@ lp_build_gather(struct gallivm_state *gallivm, * (We're not trying that with other bit widths as that might not be * easier, in particular with 8 bit values at least with only sse2.) */ + assert(vec_fetch == FALSE); + gather_res_type.width /= 2; + fetch_dst_type = fetch_type; + src_type = lp_build_vec_type(gallivm, fetch_type); vec_zext = TRUE; - gather_width = 16; } - res = LLVMGetUndef(gather_vec_type); + res_t = lp_build_vec_type(gallivm, res_type); + gather_res_t = lp_build_vec_type(gallivm, gather_res_type); + res = LLVMGetUndef(gather_res_t); for (i = 0; i < length; ++i) { LLVMValueRef index = lp_build_const_int32(gallivm, i); - LLVMValueRef elem; - elem = lp_build_gather_elem(gallivm, length, - src_width, gather_width, aligned, - base_ptr, offsets, i, vector_justify); - res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, ""); + elems[i] = lp_build_gather_elem_vec(gallivm, length, + src_width, src_type, fetch_dst_type, + aligned, base_ptr, offsets, i, + vector_justify); + if (!vec_fetch) { + res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, ""); + } } if (vec_zext) { - res = LLVMBuildZExt(gallivm->builder, res, dst_vec_type, ""); + res = LLVMBuildZExt(gallivm->builder, res, res_t, ""); if (vector_justify) { #if PIPE_ARCH_BIG_ENDIAN - struct lp_type dst_type; - unsigned sv = dst_width - src_width; - dst_type = lp_type_uint_vec(dst_width, dst_width * length); + unsigned sv = dst_type.width - src_width; res = LLVMBuildShl(gallivm->builder, res, - lp_build_const_int_vec(gallivm, dst_type, sv), ""); + lp_build_const_int_vec(gallivm, res_type, sv), ""); #endif } } + if (vec_fetch) { + /* + * Do bitcast now otherwise llvm might get some funny ideas wrt + * float/int types... 
+ */ + for (i = 0; i < length; i++) { + elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i], + lp_build_vec_type(gallivm, dst_type), ""); + } + res = lp_build_concat(gallivm, elems, dst_type, length); + } else { + struct lp_type really_final_type = dst_type; + assert(res_type.length * res_type.width == + dst_type.length * dst_type.width * length); + really_final_type.length *= length; + res = LLVMBuildBitCast(gallivm->builder, res, + lp_build_vec_type(gallivm, really_final_type), ""); + } } return res; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h index 3ede4763a70..7930864e611 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_gather.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h @@ -55,7 +55,7 @@ LLVMValueRef lp_build_gather(struct gallivm_state *gallivm, unsigned length, unsigned src_width, - unsigned dst_width, + struct lp_type dst_type, boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offsets, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index f91b761dc11..c46749dbac8 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -579,10 +579,12 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef rgba8; struct lp_build_context u8n; LLVMTypeRef u8n_vec_type; + struct lp_type fetch_type; lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width)); u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); + fetch_type = lp_type_uint(bld->texel_type.width); if (util_format_is_rgba8_variant(bld->format_desc)) { /* * Given the format is a rgba8, just read the pixels as is, @@ -591,7 +593,7 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, rgba8 = lp_build_gather(bld->gallivm, bld->texel_type.length, bld->format_desc->block.bits, - bld->texel_type.width, + fetch_type, TRUE, data_ptr, 
offset, TRUE); @@ -925,14 +927,16 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld, LLVMValueRef rgba8; if (util_format_is_rgba8_variant(bld->format_desc)) { + struct lp_type fetch_type; /* * Given the format is a rgba8, just read the pixels as is, * without any swizzling. Swizzling will be done later. */ + fetch_type = lp_type_uint(bld->texel_type.width); rgba8 = lp_build_gather(bld->gallivm, bld->texel_type.length, bld->format_desc->block.bits, - bld->texel_type.width, + fetch_type, TRUE, data_ptr, offset[k][j][i], TRUE); |