-rw-r--r-- src/gallium/auxiliary/gallivm/lp_bld_format_aos.c | 116
1 file changed, 97 insertions, 19 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 322e7b817db..574bb64c917 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -38,6 +38,7 @@
 #include "util/u_math.h"
 #include "util/u_pointer.h"
 #include "util/u_string.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_arit.h"
 #include "lp_bld_init.h"
@@ -49,6 +50,7 @@
 #include "lp_bld_gather.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_format.h"
+#include "lp_bld_pack.h"
 #include "lp_bld_intr.h"
@@ -156,6 +158,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
    LLVMValueRef shifts[4];
    LLVMValueRef masks[4];
    LLVMValueRef scales[4];
+   LLVMTypeRef vec32_type;
 
    boolean normalized;
    boolean needs_uitofp;
@@ -171,19 +174,17 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
     * matches floating point size */
    assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
 
+   vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+
    /* Broadcast the packed value to all four channels
     * before: packed = BGRA
     * after: packed = {BGRA, BGRA, BGRA, BGRA}
     */
-   packed = LLVMBuildInsertElement(builder,
-                                   LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
-                                   packed,
+   packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
                                    LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
                                    "");
-   packed = LLVMBuildShuffleVector(builder,
-                                   packed,
-                                   LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
-                                   LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
+   packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
                                    LLVMConstNull(vec32_type),
                                    "");
 
    /* Initialize vector constants */
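For reference, the broadcast idiom the hunk above hoists into vec32_type, written as a stand-alone helper against the LLVM C API. This is a minimal sketch: splat_i32x4 is a hypothetical name, and the context/builder handles would come from the gallivm state in the real code.

/* Splat one i32 into all four lanes of a <4 x i32>, as done above. */
#include <llvm-c/Core.h>

static LLVMValueRef
splat_i32x4(LLVMContextRef ctx, LLVMBuilderRef builder, LLVMValueRef packed)
{
   LLVMTypeRef i32 = LLVMInt32TypeInContext(ctx);
   LLVMTypeRef vec32_type = LLVMVectorType(i32, 4);
   LLVMValueRef vec;

   /* Insert the scalar into lane 0 of an undef vector... */
   vec = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
                                LLVMConstNull(i32), "");
   /* ...then shuffle with an all-zero mask so every lane reads lane 0. */
   return LLVMBuildShuffleVector(builder, vec, LLVMGetUndef(vec32_type),
                                 LLVMConstNull(vec32_type), "");
}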
@@ -224,9 +225,40 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
    /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
     * into masked = {X, Y, Z, W}
     */
-   /* Note: we cannot do this shift on x86 natively until AVX2. */
-   shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
-   masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+   if (desc->block.bits < 32 && normalized) {
+      /*
+       * Note: we cannot do the shift below on x86 natively until AVX2.
+       *
+       * Old llvm versions will resort to scalar extract/shift/insert,
+       * which is definitely terrible; new versions will just do
+       * several vector shifts and shuffle/blend the results together.
+       * We could turn this into a variable left shift plus a constant
+       * right shift, and llvm would then turn the variable left shift
+       * into a mul for us (albeit without sse41 the mul needs emulation
+       * too...). However, since we're going to do a float mul
+       * anyway, we just adjust that mul instead (plus the mask), skipping
+       * the shift completely.
+       * We could also use an extra mul when the format isn't normalized and
+       * we don't have AVX2 support, but don't bother for now. Unfortunately,
+       * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
+       * rgba8 if it ends up here), as that would require UIToFP, albeit that
+       * would be fixable with an easy 16bit shuffle (unless there are channels
+       * crossing 16bit boundaries).
+       */
+      for (i = 0; i < 4; ++i) {
+         if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+            unsigned bits = desc->channel[i].size;
+            unsigned shift = desc->channel[i].shift;
+            unsigned long long mask = ((1ULL << bits) - 1) << shift;
+            scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
+            masks[i] = lp_build_const_int32(gallivm, mask);
+         }
+      }
+      masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
+   } else {
+      shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
+      masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+   }
 
    if (!needs_uitofp) {
       /* UIToFP can't be expressed in SSE2 */
       casted = LLVMBuildSIToFP(builder, masked,
                                LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
    }
    else {
       casted = LLVMBuildUIToFP(builder, masked,
                                LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
    }
 
-   /* At this point 'casted' may be a vector of floats such as
-    * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized
+   /*
+    * At this point 'casted' may be a vector of floats such as
+    * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied
+    * by powers of two.) Next, if the pixel values are normalized
     * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
     */
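A scalar sketch of why the trick above is safe: masking the channel in place and folding the shift into the normalize multiplier matches the old shift-then-mask-then-scale path bit for bit, because the two reciprocals differ by an exact power of two. This is a hypothetical worked example for rgb565's green channel (bits = 6, shift = 5), not code from the patch.

#include <assert.h>
#include <stdio.h>

int
main(void)
{
   unsigned bits = 6, shift = 5;           /* rgb565 green channel */
   unsigned chan_max = (1u << bits) - 1;   /* 63 */
   unsigned long long mask = (unsigned long long)chan_max << shift; /* 0x07E0 */
   unsigned packed;

   for (packed = 0; packed <= 0xffff; packed++) {
      /* old path: shift into place, mask, then normalize mul */
      float a = (float)((packed >> shift) & chan_max) * (float)(1.0 / chan_max);
      /* new path: mask in place; the adjusted scale absorbs the shift */
      float b = (float)(packed & mask) * (float)(1.0 / mask);
      /* the scales differ by 2^-5 exactly, so the products round alike */
      assert(a == b);
   }
   printf("mask+scale matches shift+mask+scale for all 16-bit values\n");
   return 0;
}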
@@ -392,6 +426,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
 
    if (format_matches_type(format_desc, type) &&
        format_desc->block.bits <= type.width * 4 &&
+       /* XXX this shouldn't be needed */
        util_is_power_of_two(format_desc->block.bits)) {
       LLVMValueRef packed;
       LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
@@ -424,6 +459,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
+      /* XXX this shouldn't be needed */
       util_is_power_of_two(format_desc->block.bits) &&
       format_desc->block.bits <= 32 &&
       format_desc->is_bitmask &&
@@ -433,8 +469,24 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
       !format_desc->channel[0].pure_integer) {
 
       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
-      LLVMValueRef res;
-      unsigned k;
+      LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
+      struct lp_type conv_type;
+      unsigned k, num_conv_src, num_conv_dst;
+
+      /*
+       * XXX: We end up here for the AoS unorm8 sampling (if the format wasn't
+       * some 888(8) variant), so things like rgb565. This is _really_
+       * suboptimal. Not only do we handle a single pixel at a time, but we
+       * also convert to float, do a normalize mul and an un-normalize mul,
+       * convert back to int, and finally pack down to 8 bits. At the end,
+       * throw in a couple of shifts/ands/ors for the aos swizzle (well,
+       * rgb565 is ok, but bgrx5551 isn't, for instance) for good measure.
+       * (And if we're not extra careful we get some pointless min/max too,
+       * for clamping values to range.) This is a disaster of epic
+       * proportions; simply forcing SoA sampling would be way faster (even
+       * when we don't have AVX support). We should make sure we cannot hit
+       * this code path for anything but single pixels.
+       */
 
       /*
       * Unpack a pixel at a time into a <4 x float> RGBA vector
@@ -464,12 +516,38 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                      __FUNCTION__, format_desc->short_name);
       }
 
-      lp_build_conv(gallivm,
-                    lp_float32_vec4_type(),
-                    type,
-                    tmps, num_pixels, &res, 1);
+      conv_type = lp_float32_vec4_type();
+      num_conv_src = num_pixels;
+      num_conv_dst = 1;
+
+      if (num_pixels % 8 == 0) {
+         lp_build_concat_n(gallivm, lp_float32_vec4_type(),
+                           tmps, num_pixels, tmps, num_pixels / 2);
+         conv_type.length *= num_pixels / 4;
+         num_conv_src = 4 * num_pixels / 8;
+         if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
+            /*
+             * FIXME: The fast float->unorm path (which is basically
+             * skipping the MIN/MAX which are extremely pointless in any
+             * case) requires that there are 2 destinations...
+             * In any case, we really should make sure we don't hit this
+             * code with multiple pixels for unorm8 dst types, it's
+             * completely hopeless even if we do hit the right conversion.
+             */
+            type.length /= num_pixels / 4;
+            num_conv_dst = num_pixels / 4;
+         }
+      }
+
+      lp_build_conv(gallivm, conv_type, type,
+                    tmps, num_conv_src, res, num_conv_dst);
+
+      if (num_pixels % 8 == 0 &&
+          (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
+         lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
+      }
 
-      return lp_build_format_swizzle_aos(format_desc, &bld, res);
+      return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
    }
 
    /* If all channels are of same type and we are not using half-floats */
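The bookkeeping in the last hunk is easiest to follow with concrete numbers. Below is a hypothetical stand-alone walkthrough for num_pixels == 8 converting to a unorm8 destination, assuming type.width == 8 and type.length == 32 (8 pixels times 4 channels); plain unsigned variables stand in for the struct lp_type fields.

#include <stdio.h>

int
main(void)
{
   unsigned num_pixels = 8;
   unsigned conv_length = 4;            /* lp_float32_vec4_type() */
   unsigned dst_width = 8, dst_length = 32;
   unsigned num_conv_src = num_pixels;  /* one <4 x float> per pixel */
   unsigned num_conv_dst = 1;

   if (num_pixels % 8 == 0) {
      /* concat pairs: 8 x <4 x float> becomes 4 x <8 x float> */
      conv_length *= num_pixels / 4;
      num_conv_src = 4 * num_pixels / 8;
      if (dst_width == 8) {
         /* split the unorm8 destination so there are two of them */
         dst_length /= num_pixels / 4;
         num_conv_dst = num_pixels / 4;
      }
   }

   printf("%u x <%u x float> -> %u x <%u x i8>\n",
          num_conv_src, conv_length, num_conv_dst, dst_length);
   /* prints: 4 x <8 x float> -> 2 x <16 x i8> */
   return 0;
}

So lp_build_conv() would see 4 <8 x float> sources and 2 <16 x i8> destinations, and the trailing lp_build_concat_n() call in the hunk merges the two halves back into the single <32 x i8> result that lp_build_format_swizzle_aos() consumes.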