From 44993c1808dc10da36cac156408a10e05b5dc140 Mon Sep 17 00:00:00 2001 From: Roland Scheidegger Date: Wed, 5 Jun 2013 00:11:45 +0200 Subject: gallivm: enhance special sse2 4x4f and 2x8f -> 1x16ub conversion There's no good reason why it can't handle 2x4f->1x8ub, 1x4f->1x4ub and 1x8f->1x8ub cases, there might be legitimate reasons why we don't have enough input vectors for a full destination vector, and using pack intrinsics should still be much better than using generic conversion (it looks like convert_alpha from the blend code might hit this though I suspect it could be avoided). v2: add another test vector format to lp_test_conv so this gets tested. Reviewed-by: Jose Fonseca --- src/gallium/auxiliary/gallivm/lp_bld_conv.c | 90 +++++++++++++++++++---------- 1 file changed, 58 insertions(+), 32 deletions(-) (limited to 'src/gallium/auxiliary/gallivm') diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index eb2d09638fa..f11361ac06b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -530,24 +530,22 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, dst_type->width == 8) { /* Special case 4x4f --> 1x16ub */ - if (src_type.length == 4 && util_cpu_caps.has_sse2) + if (src_type.length == 4 && + util_cpu_caps.has_sse2) { - assert((num_srcs % 4) == 0); - - num_dsts = num_srcs / 4; - dst_type->length = 16; + num_dsts = (num_srcs + 3) / 4; + dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4; lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); return num_dsts; } /* Special case 2x8f --> 1x16ub */ - if (src_type.length == 8 && util_cpu_caps.has_avx) + if (src_type.length == 8 && + util_cpu_caps.has_avx) { - assert((num_srcs % 2) == 0); - - num_dsts = num_srcs / 2; - dst_type->length = 16; + num_dsts = (num_srcs + 1) / 2; + dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8; lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts); return num_dsts; @@ -602,7 +600,7 @@ lp_build_conv(struct gallivm_state *gallivm, num_tmps = num_srcs; - /* Special case 4x4f --> 1x16ub + /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub */ if (src_type.floating == 1 && src_type.fixed == 0 && @@ -616,20 +614,23 @@ lp_build_conv(struct gallivm_state *gallivm, dst_type.sign == 0 && dst_type.norm == 1 && dst_type.width == 8 && - dst_type.length == 16 && - 4 * num_dsts == num_srcs && + ((dst_type.length == 16 && 4 * num_dsts == num_srcs) || + (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) && util_cpu_caps.has_sse2) { struct lp_build_context bld; - struct lp_type int16_type = dst_type; - struct lp_type int32_type = dst_type; + struct lp_type int16_type, int32_type; + struct lp_type dst_type_ext = dst_type; LLVMValueRef const_255f; unsigned i, j; lp_build_context_init(&bld, gallivm, src_type); + dst_type_ext.length = 16; + int16_type = int32_type = dst_type_ext; + int16_type.width *= 2; int16_type.length /= 2; int16_type.sign = 1; @@ -643,21 +644,34 @@ lp_build_conv(struct gallivm_state *gallivm, for (i = 0; i < num_dsts; ++i, src += 4) { LLVMValueRef lo, hi; - for (j = 0; j < 4; ++j) { + for (j = 0; j < dst_type.length / 4; ++j) { tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, ""); tmp[j] = lp_build_iround(&bld, tmp[j]); } + if (num_srcs == 1) { + tmp[1] = tmp[0]; + } + /* relying on clamping behavior of sse2 intrinsics here */ lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); - hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); - dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi); + + if (num_srcs < 4) { + hi = lo; + } + else { + hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); + } + dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi); + } + if (num_srcs < 4) { + dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length); } return; } - /* Special case 2x8f --> 1x16ub + /* Special case 2x8f --> 1x16ub, 1x8f ->1x8ub */ else if (src_type.floating == 1 && src_type.fixed == 0 && @@ -671,20 +685,23 @@ lp_build_conv(struct gallivm_state *gallivm, dst_type.sign == 0 && dst_type.norm == 1 && dst_type.width == 8 && - dst_type.length == 16 && - 2 * num_dsts == num_srcs && + ((dst_type.length == 16 && 2 * num_dsts == num_srcs) || + (num_dsts == 1 && dst_type.length * num_srcs == 8)) && util_cpu_caps.has_avx) { struct lp_build_context bld; - struct lp_type int16_type = dst_type; - struct lp_type int32_type = dst_type; + struct lp_type int16_type, int32_type; + struct lp_type dst_type_ext = dst_type; LLVMValueRef const_255f; unsigned i; lp_build_context_init(&bld, gallivm, src_type); + dst_type_ext.length = 16; + int16_type = int32_type = dst_type_ext; + int16_type.width *= 2; int16_type.length /= 2; int16_type.sign = 1; @@ -699,21 +716,30 @@ lp_build_conv(struct gallivm_state *gallivm, LLVMValueRef lo, hi, a, b; a = LLVMBuildFMul(builder, src[0], const_255f, ""); - b = LLVMBuildFMul(builder, src[1], const_255f, ""); - a = lp_build_iround(&bld, a); - b = lp_build_iround(&bld, b); - tmp[0] = lp_build_extract_range(gallivm, a, 0, 4); tmp[1] = lp_build_extract_range(gallivm, a, 4, 4); - tmp[2] = lp_build_extract_range(gallivm, b, 0, 4); - tmp[3] = lp_build_extract_range(gallivm, b, 4, 4); - /* relying on clamping behavior of sse2 intrinsics here */ lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); - hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); - dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi); + + if (num_srcs == 1) { + hi = lo; + } + else { + b = LLVMBuildFMul(builder, src[1], const_255f, ""); + b = lp_build_iround(&bld, b); + tmp[2] = lp_build_extract_range(gallivm, b, 0, 4); + tmp[3] = lp_build_extract_range(gallivm, b, 4, 4); + hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); + + } + dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi); } + + if (num_srcs == 1) { + dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length); + } + return; } -- cgit v1.2.3