diff options
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_arit.c | 14 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_pack.c | 139 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_pack.h | 16 |
3 files changed, 156 insertions, 13 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index f5cacc460f2..3ea0734331d 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1046,14 +1046,14 @@ lp_build_mul(struct lp_build_context *bld, struct lp_type wide_type = lp_wider_type(type); LLVMValueRef al, ah, bl, bh, abl, abh, ab; - lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah); - lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh); + lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah); + lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh); /* PMULLW, PSRLW, PADDW */ abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl); abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh); - ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh); + ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh); return ab; } @@ -1350,9 +1350,9 @@ lp_build_lerp(struct lp_build_context *bld, lp_build_context_init(&wide_bld, bld->gallivm, wide_type); - lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh); - lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h); - lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h); + lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh); + lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h); + lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h); /* * Lerp both halves. @@ -1363,7 +1363,7 @@ lp_build_lerp(struct lp_build_context *bld, resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags); resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags); - res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh); + res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh); } else { res = lp_build_lerp_simple(bld, x, v0, v1, flags); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c index b0e76e6465d..e8d4fcdf2fb 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c @@ -346,10 +346,10 @@ lp_build_interleave2(struct gallivm_state *gallivm, */ LLVMValueRef lp_build_interleave2_half(struct gallivm_state *gallivm, - struct lp_type type, - LLVMValueRef a, - LLVMValueRef b, - unsigned lo_hi) + struct lp_type type, + LLVMValueRef a, + LLVMValueRef b, + unsigned lo_hi) { if (type.length * type.width == 256) { LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi); @@ -359,11 +359,13 @@ lp_build_interleave2_half(struct gallivm_state *gallivm, } } + /** * Double the bit width. * * This will only change the number of bits the values are represented, not the * values themselves. + * */ void lp_build_unpack2(struct gallivm_state *gallivm, @@ -394,6 +396,65 @@ lp_build_unpack2(struct gallivm_state *gallivm, #ifdef PIPE_ARCH_LITTLE_ENDIAN *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0); *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1); + +#else + *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0); + *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1); +#endif + + /* Cast the result into the new type (twice as wide) */ + + dst_vec_type = lp_build_vec_type(gallivm, dst_type); + + *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, ""); + *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, ""); +} + + +/** + * Double the bit width, with an order which fits the cpu nicely. + * + * This will only change the number of bits the values are represented, not the + * values themselves. + * + * The order of the results is not guaranteed, other than it will match + * the corresponding lp_build_pack2_native call. + */ +void +lp_build_unpack2_native(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef src, + LLVMValueRef *dst_lo, + LLVMValueRef *dst_hi) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef msb; + LLVMTypeRef dst_vec_type; + + assert(!src_type.floating); + assert(!dst_type.floating); + assert(dst_type.width == src_type.width * 2); + assert(dst_type.length * 2 == src_type.length); + + if(dst_type.sign && src_type.sign) { + /* Replicate the sign bit in the most significant bits */ + msb = LLVMBuildAShr(builder, src, + lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), ""); + } + else + /* Most significant bits always zero */ + msb = lp_build_zero(gallivm, src_type); + + /* Interleave bits */ +#ifdef PIPE_ARCH_LITTLE_ENDIAN + if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) { + *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0); + *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1); + } else { + *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0); + *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1); + } #else *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0); *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1); @@ -440,7 +501,8 @@ lp_build_unpack(struct gallivm_state *gallivm, tmp_type.length /= 2; for(i = num_tmps; i--; ) { - lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]); + lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], + &dst[2*i + 1]); } src_type = tmp_type; @@ -605,6 +667,70 @@ lp_build_pack2(struct gallivm_state *gallivm, } +/** + * Non-interleaved native pack. + * + * Similar to lp_build_pack2, but the ordering of values is not + * guaranteed, other than it will match lp_build_unpack2_native. + * + * In particular, with avx2, the lower and upper 128bits of the vectors will + * be packed independently, so that (with 32bit->16bit values) + * (LSB) (MSB) + * lo = l0 __ l1 __ l2 __ l3 __ l4 __ l5 __ l6 __ l7 __ + * hi = h0 __ h1 __ h2 __ h3 __ h4 __ h5 __ h6 __ h7 __ + * res = l0 l1 l2 l3 h0 h1 h2 h3 l4 l5 l6 l7 h4 h5 h6 h7 + * + * This will only change the number of bits the values are represented, not the + * values themselves. + * + * It is assumed the values are already clamped into the destination type range. + * Values outside that range will produce undefined results. + */ +LLVMValueRef +lp_build_pack2_native(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef lo, + LLVMValueRef hi) +{ + LLVMBuilderRef builder = gallivm->builder; + struct lp_type intr_type = dst_type; + const char *intrinsic = NULL; + + assert(!src_type.floating); + assert(!dst_type.floating); + assert(src_type.width == dst_type.width * 2); + assert(src_type.length * 2 == dst_type.length); + + /* At this point only have special case for avx2 */ + if (src_type.length * src_type.width == 256 && + util_cpu_caps.has_avx2) { + switch(src_type.width) { + case 32: + if (dst_type.sign) { + intrinsic = "llvm.x86.avx2.packssdw"; + } else { + intrinsic = "llvm.x86.avx2.packusdw"; + } + break; + case 16: + if (dst_type.sign) { + intrinsic = "llvm.x86.avx2.packsswb"; + } else { + intrinsic = "llvm.x86.avx2.packuswb"; + } + break; + } + } + if (intrinsic) { + LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type); + return lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, + lo, hi); + } + else { + return lp_build_pack2(gallivm, src_type, dst_type, lo, hi); + } +} /** * Non-interleaved pack and saturate. @@ -640,7 +766,8 @@ lp_build_packs2(struct gallivm_state *gallivm, if(clamp) { struct lp_build_context bld; unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width; - LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, ((unsigned long long)1 << dst_bits) - 1); + LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, + ((unsigned long long)1 << dst_bits) - 1); lp_build_context_init(&bld, gallivm, src_type); lo = lp_build_min(&bld, lo, dst_max); hi = lp_build_min(&bld, hi, dst_max); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h index 367fba1fd21..3e07716dfe3 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h @@ -74,6 +74,14 @@ lp_build_unpack2(struct gallivm_state *gallivm, void +lp_build_unpack2_native(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef src, + LLVMValueRef *dst_lo, + LLVMValueRef *dst_hi); + +void lp_build_unpack(struct gallivm_state *gallivm, struct lp_type src_type, struct lp_type dst_type, @@ -118,6 +126,14 @@ lp_build_pack2(struct gallivm_state *gallivm, LLVMValueRef +lp_build_pack2_native(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef lo, + LLVMValueRef hi); + + +LLVMValueRef lp_build_pack(struct gallivm_state *gallivm, struct lp_type src_type, struct lp_type dst_type, |