diff options
author | Roland Scheidegger <[email protected]> | 2016-12-06 19:59:51 +0100 |
---|---|---|
committer | Roland Scheidegger <[email protected]> | 2016-12-06 20:06:06 +0100 |
commit | 8ac3c1bf1ab47d9b7e5fcddeb7620eedcec7bdb3 (patch) | |
tree | 0071383a12fa4cecd96d7002baefacbea662ca7a /src | |
parent | fd5f420fbb237a9662532a4111e409f5ec2eba8a (diff) |
gallivm: optimize 16bit->32bit gather path a bit
LLVM can't really optimize anything which crosses scalar/vector boundaries,
so help a bit with some particular gather operations when the width is
expanded (only do it for 16->32bit expansion for now), by doing expansion
after fetch. That is probably a better solution anyway even if llvm would
recognize it, makes for cleaner IR...
Reviewed-by: Jose Fonseca <[email protected]>
Diffstat (limited to 'src')
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_gather.c | 42 |
1 files changed, 39 insertions, 3 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c index 439bbb6798f..1f7ba927bc4 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c @@ -33,6 +33,7 @@ #include "lp_bld_format.h" #include "lp_bld_gather.h" #include "lp_bld_swizzle.h" +#include "lp_bld_type.h" #include "lp_bld_init.h" #include "lp_bld_intr.h" @@ -270,17 +271,52 @@ lp_build_gather(struct gallivm_state *gallivm, LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width); LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length); + LLVMTypeRef gather_vec_type = dst_vec_type; unsigned i; - - res = LLVMGetUndef(dst_vec_type); + boolean vec_zext = FALSE; + unsigned gather_width = dst_width; + + + if (src_width == 16 && dst_width == 32) { + LLVMTypeRef g_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width / 2); + gather_vec_type = LLVMVectorType(g_elem_type, length); + /* + * Note that llvm is never able to optimize zext/insert combos + * directly (i.e. zero the simd reg, then place the elements into + * the appropriate place directly). And 16->32bit zext simd loads + * aren't possible (instead loading to scalar reg first). + * (I think this has to do with scalar/vector transition.) + * No idea about other archs... + * We could do this manually, but instead we just use a vector + * zext, which is simple enough (and, in fact, llvm might optimize + * this away). + * (We're not trying that with other bit widths as that might not be + * easier, in particular with 8 bit values at least with only sse2.) + */ + vec_zext = TRUE; + gather_width = 16; + } + res = LLVMGetUndef(gather_vec_type); for (i = 0; i < length; ++i) { LLVMValueRef index = lp_build_const_int32(gallivm, i); LLVMValueRef elem; elem = lp_build_gather_elem(gallivm, length, - src_width, dst_width, aligned, + src_width, gather_width, aligned, base_ptr, offsets, i, vector_justify); res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, ""); } + if (vec_zext) { + res = LLVMBuildZExt(gallivm->builder, res, dst_vec_type, ""); + if (vector_justify) { +#if PIPE_ARCH_BIG_ENDIAN + struct lp_type dst_type; + unsigned sv = dst_width - src_width; + dst_type = lp_type_uint_vec(dst_width, dst_width * length); + res = LLVMBuildShl(gallivm->builder, res, + lp_build_const_int_vec(gallivm, dst_type, sv), ""); +#endif + } + } } return res; |