diff options
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 529 |
1 files changed, 375 insertions, 154 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index b3bc15552c5..34da8698b85 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -31,6 +31,7 @@ #include "util/u_format.h" #include "util/u_memory.h" #include "util/u_string.h" +#include "util/u_math.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -113,6 +114,166 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc, } + +static LLVMValueRef +lp_build_extract_soa_chan(struct lp_build_context *bld, + unsigned blockbits, + boolean srgb_chan, + struct util_format_channel_description chan_desc, + LLVMValueRef packed) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_type type = bld->type; + LLVMValueRef input = packed; + const unsigned width = chan_desc.size; + const unsigned start = chan_desc.shift; + const unsigned stop = start + width; + + /* Decode the input vector component */ + + switch(chan_desc.type) { + case UTIL_FORMAT_TYPE_VOID: + input = bld->undef; + break; + + case UTIL_FORMAT_TYPE_UNSIGNED: + /* + * Align the LSB + */ + if (start) { + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, type, start), ""); + } + + /* + * Zero the MSBs + */ + if (stop < blockbits) { + unsigned mask = ((unsigned long long)1 << width) - 1; + input = LLVMBuildAnd(builder, input, + lp_build_const_int_vec(gallivm, type, mask), ""); + } + + /* + * Type conversion + */ + if (type.floating) { + if (srgb_chan) { + struct lp_type conv_type = lp_uint_type(type); + input = lp_build_srgb_to_linear(gallivm, conv_type, width, input); + } + else { + if(chan_desc.normalized) + input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); + else + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + } + } + else if (chan_desc.pure_integer) { + /* Nothing to do */ + } else { + /* 
FIXME */ + assert(0); + } + break; + + case UTIL_FORMAT_TYPE_SIGNED: + /* + * Align the sign bit first. + */ + if (stop < type.width) { + unsigned bits = type.width - stop; + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); + input = LLVMBuildShl(builder, input, bits_val, ""); + } + + /* + * Align the LSB (with an arithmetic shift to preserve the sign) + */ + if (chan_desc.size < type.width) { + unsigned bits = type.width - chan_desc.size; + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); + input = LLVMBuildAShr(builder, input, bits_val, ""); + } + + /* + * Type conversion + */ + if (type.floating) { + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + if (chan_desc.normalized) { + double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + input = LLVMBuildFMul(builder, input, scale_val, ""); + /* + * The formula above will produce value below -1.0 for most negative + * value but everything seems happy with that hence disable for now. 
+ */ + if (0) + input = lp_build_max(bld, input, + lp_build_const_vec(gallivm, type, -1.0f)); + } + } + else if (chan_desc.pure_integer) { + /* Nothing to do */ + } else { + /* FIXME */ + assert(0); + } + break; + + case UTIL_FORMAT_TYPE_FLOAT: + if (type.floating) { + if (chan_desc.size == 16) { + struct lp_type f16i_type = type; + f16i_type.width /= 2; + f16i_type.floating = 0; + if (start) { + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, type, start), ""); + } + input = LLVMBuildTrunc(builder, input, + lp_build_vec_type(gallivm, f16i_type), ""); + input = lp_build_half_to_float(gallivm, input); + } else { + assert(start == 0); + assert(stop == 32); + assert(type.width == 32); + } + input = LLVMBuildBitCast(builder, input, bld->vec_type, ""); + } + else { + /* FIXME */ + assert(0); + input = bld->undef; + } + break; + + case UTIL_FORMAT_TYPE_FIXED: + if (type.floating) { + double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + input = LLVMBuildFMul(builder, input, scale_val, ""); + } + else { + /* FIXME */ + assert(0); + input = bld->undef; + } + break; + + default: + assert(0); + input = bld->undef; + break; + } + + return input; +} + + /** * Unpack several pixels in SoA. 
* @@ -143,7 +304,6 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, LLVMValueRef packed, LLVMValueRef rgba_out[4]) { - LLVMBuilderRef builder = gallivm->builder; struct lp_build_context bld; LLVMValueRef inputs[4]; unsigned chan; @@ -159,162 +319,19 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, /* Decode the input vector components */ for (chan = 0; chan < format_desc->nr_channels; ++chan) { - const unsigned width = format_desc->channel[chan].size; - const unsigned start = format_desc->channel[chan].shift; - const unsigned stop = start + width; - LLVMValueRef input; - - input = packed; - - switch(format_desc->channel[chan].type) { - case UTIL_FORMAT_TYPE_VOID: - input = lp_build_undef(gallivm, type); - break; - - case UTIL_FORMAT_TYPE_UNSIGNED: - /* - * Align the LSB - */ - - if (start) { - input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), ""); - } - - /* - * Zero the MSBs - */ - - if (stop < format_desc->block.bits) { - unsigned mask = ((unsigned long long)1 << width) - 1; - input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), ""); - } - - /* - * Type conversion - */ - - if (type.floating) { - if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { - if (format_desc->swizzle[3] == chan) { - input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); - } - else { - struct lp_type conv_type = lp_uint_type(type); - input = lp_build_srgb_to_linear(gallivm, conv_type, width, input); - } - } - else { - if(format_desc->channel[chan].normalized) - input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); - else - input = LLVMBuildSIToFP(builder, input, - lp_build_vec_type(gallivm, type), ""); - } - } - else if (format_desc->channel[chan].pure_integer) { - /* Nothing to do */ - } else { - /* FIXME */ - assert(0); - } - - break; - - case UTIL_FORMAT_TYPE_SIGNED: - /* - * Align the sign bit first. 
- */ - - if (stop < type.width) { - unsigned bits = type.width - stop; - LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); - input = LLVMBuildShl(builder, input, bits_val, ""); - } + struct util_format_channel_description chan_desc = format_desc->channel[chan]; + boolean srgb_chan = FALSE; - /* - * Align the LSB (with an arithmetic shift to preserve the sign) - */ - - if (format_desc->channel[chan].size < type.width) { - unsigned bits = type.width - format_desc->channel[chan].size; - LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); - input = LLVMBuildAShr(builder, input, bits_val, ""); - } - - /* - * Type conversion - */ - - if (type.floating) { - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), ""); - if (format_desc->channel[chan].normalized) { - double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1); - LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); - input = LLVMBuildFMul(builder, input, scale_val, ""); - /* the formula above will produce value below -1.0 for most negative - * value but everything seems happy with that hence disable for now */ - if (0) - input = lp_build_max(&bld, input, - lp_build_const_vec(gallivm, type, -1.0f)); - } - } - else if (format_desc->channel[chan].pure_integer) { - /* Nothing to do */ - } else { - /* FIXME */ - assert(0); - } - - break; - - case UTIL_FORMAT_TYPE_FLOAT: - if (type.floating) { - if (format_desc->channel[chan].size == 16) { - struct lp_type f16i_type = type; - f16i_type.width /= 2; - f16i_type.floating = 0; - if (start) { - input = LLVMBuildLShr(builder, input, - lp_build_const_int_vec(gallivm, type, start), ""); - } - input = LLVMBuildTrunc(builder, input, - lp_build_vec_type(gallivm, f16i_type), ""); - input = lp_build_half_to_float(gallivm, input); - } else { - assert(start == 0); - assert(stop == 32); - assert(type.width == 32); - } - input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), ""); - 
} - else { - /* FIXME */ - assert(0); - input = lp_build_undef(gallivm, type); - } - break; - - case UTIL_FORMAT_TYPE_FIXED: - if (type.floating) { - double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1); - LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), ""); - input = LLVMBuildFMul(builder, input, scale_val, ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(gallivm, type); - } - break; - - default: - assert(0); - input = lp_build_undef(gallivm, type); - break; + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && + format_desc->swizzle[3] != chan) { + srgb_chan = TRUE; } - inputs[chan] = input; + inputs[chan] = lp_build_extract_soa_chan(&bld, + format_desc->block.bits, + srgb_chan, + chan_desc, + packed); } lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out); @@ -450,6 +467,210 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, return; } + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits > type.width && + ((format_desc->block.bits <= type.width * type.length && + format_desc->channel[0].size <= type.width) || + (format_desc->channel[0].size == 64 && + format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + type.floating))) + { + /* + * Similar to above, but the packed pixel is larger than what fits + * into an element of the destination format. The packed pixels will be + * shuffled into SoA vectors appropriately, and then the extraction will + * be done in parallel as much as possible. + * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so + * the gathered vectors can be shuffled easily (even with avx). + * 64xn float -> 32xn float is handled too but it's a bit special as + * it does the conversion pre-shuffle. 
+ */ + + LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32]; + struct lp_type fetch_type, gather_type = type; + unsigned num_gather, fetch_width, i, j; + struct lp_build_context bld; + boolean fp64 = format_desc->channel[0].size == 64; + + lp_build_context_init(&bld, gallivm, type); + + assert(type.width == 32); + assert(format_desc->block.bits > type.width); + + /* + * First, figure out fetch order. + */ + fetch_width = util_next_power_of_two(format_desc->block.bits); + num_gather = fetch_width / type.width; + /* + * fp64 are treated like fp32 except we fetch twice wide values + * (as we shuffle after trunc). The shuffles for that work out + * mostly fine (slightly suboptimal for 4-wide, perfect for AVX) + * albeit we miss the potential opportunity for hw gather (as it + * only handles native size). + */ + num_gather = fetch_width / type.width; + gather_type.width *= num_gather; + if (fp64) { + num_gather /= 2; + } + gather_type.length /= num_gather; + + for (i = 0; i < num_gather; i++) { + LLVMValueRef offsetr, shuf_vec; + if(num_gather == 4) { + for (j = 0; j < gather_type.length; j++) { + unsigned idx = i + 4*j; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + shuf_vec = LLVMConstVector(shuffles, gather_type.length); + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); + + } + else if (num_gather == 2) { + assert(num_gather == 2); + for (j = 0; j < gather_type.length; j++) { + unsigned idx = i*2 + (j%2) + (j/2)*4; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + shuf_vec = LLVMConstVector(shuffles, gather_type.length); + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); + } + else { + assert(num_gather == 1); + offsetr = offset; + } + if (gather_type.length == 1) { + LLVMValueRef zero = lp_build_const_int32(gallivm, 0); + offsetr = LLVMBuildExtractElement(builder, offsetr, zero, ""); + } + + /* + * Determine whether to use float or int loads. 
This is mostly + * to outsmart the (stupid) llvm int/float shuffle logic, we + * don't really care much if the data is floats or ints... + * But llvm will refuse to use single float shuffle with int data + * and use 3 int shuffles instead; the code looks atrocious. + * (Note bitcasts often won't help, as llvm is too smart to be + * fooled by that.) + * Nobody cares about simd float<->int domain transition penalties, + * which usually don't even exist for shuffles anyway. + * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is + * going into transpose, which is unpacks, so doesn't really matter + * much). + * With 2x32bit or 4x16bit fetch, we use float vec, since those + * go into the weird channel separation shuffle. With floats, + * this is (with 128bit vectors): + * - 2 movq, 2 movhpd, 2 shufps + * With ints it would be: + * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw + * I've seen texture functions increase in code size by 15% just due + * to that (there's lots of such fetches in them...) + * (We could choose a different gather order to improve this somewhat + * for the int path, but it would basically just drop the blends, + * so the float path with this order really is optimal.) + * Albeit it is tricky: sometimes llvm doesn't ignore the float->int + * casts, so we must avoid them until we're done with the float shuffle... + * 3x16bit formats (the same is also true for 3x8) are pretty bad but + * there's nothing we can do about them (we could overallocate by + * those couple bytes and use unaligned but pot sized load). + * Note that this is very much x86 specific. I don't know if this + * affects other archs at all. + */ + if (num_gather > 1) { + /* + * We always want some float type here (with x86) + * due to shuffles being float ones afterwards (albeit for + * the num_gather == 4 case int should work fine too + * (unless there are some problems with avx but not avx2)).
+ */ + if (format_desc->channel[0].size == 64) { + fetch_type = lp_type_float_vec(64, gather_type.width); + } else { + fetch_type = lp_type_int_vec(32, gather_type.width); + } + } + else { + /* type doesn't matter much */ + if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + (format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 64)) { + fetch_type = lp_type_float(gather_type.width); + } else { + fetch_type = lp_type_uint(gather_type.width); + } + } + + /* Now finally gather the values */ + packed[i] = lp_build_gather(gallivm, gather_type.length, + format_desc->block.bits, + fetch_type, aligned, + base_ptr, offsetr, FALSE); + if (fp64) { + struct lp_type conv_type = type; + conv_type.width *= 2; + packed[i] = LLVMBuildBitCast(builder, packed[i], + lp_build_vec_type(gallivm, conv_type), ""); + packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, ""); + } + } + + /* shuffle the gathered values to SoA */ + if (num_gather == 2) { + for (i = 0; i < num_gather; i++) { + for (j = 0; j < type.length; j++) { + unsigned idx = (j%2)*2 + (j/4)*4 + i; + if ((j/2)%2) + idx += type.length; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1], + LLVMConstVector(shuffles, type.length), ""); + } + } + else if (num_gather == 4) { + lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst); + } + else { + assert(num_gather == 1); + dst[0] = packed[0]; + } + + /* + * And finally unpack exactly as above, except that + * chan shift is adjusted and the right vector selected. 
+ */ + if (!fp64) { + for (i = 0; i < num_gather; i++) { + dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, ""); + } + for (i = 0; i < format_desc->nr_channels; i++) { + struct util_format_channel_description chan_desc = format_desc->channel[i]; + unsigned blockbits = type.width; + unsigned vec_nr = chan_desc.shift / type.width; + chan_desc.shift %= type.width; + + output[i] = lp_build_extract_soa_chan(&bld, + blockbits, + FALSE, + chan_desc, + dst[vec_nr]); + } + } + else { + for (i = 0; i < format_desc->nr_channels; i++) { + output[i] = dst[i]; + } + } + + lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out); + return; + } + if (format == PIPE_FORMAT_R11G11B10_FLOAT || format == PIPE_FORMAT_R9G9B9E5_FLOAT) { /* |