diff options
author | Roland Scheidegger <[email protected]> | 2016-12-21 04:43:07 +0100 |
---|---|---|
committer | Roland Scheidegger <[email protected]> | 2016-12-21 04:48:24 +0100 |
commit | e827d9175675aaa6cfc0b981e2a80685fb7b3a74 (patch) | |
tree | b90389b09b57b6f151e114cca123256795121ed6 | |
parent | cb81460dcc61da0fb5ce066ee435c56840c0aba3 (diff) |
draw: use SoA fetch, not AoS one
Now that there is an SoA fetch path which never falls back, we should always
get results which are better or at least not worse (something like rgba32f
will stay the same).
As an example of a case which gets much better, consider R16_UNORM with 8-wide
vectors: previously this was 8 sign-extend fetches, 8 cvt, 8 muls, followed by
a couple of shuffles to stitch things together (if it is smart enough,
6 unpacks) and then an (8-wide) transpose (not sure if llvm could even
optimize the shuffles + transpose, since the 16bit values were actually
sign-extended to 128bit before being cast to a float vec, so that would be
another 8 unpacks). Now it is just 8 fetches (directly inserted into the
vector, albeit there's one 128bit insert needed), 1 cvt, 1 mul.
v2: ditch the old AoS code instead of just disabling it.
Reviewed-by: Jose Fonseca <[email protected]>
-rw-r--r-- | src/gallium/auxiliary/draw/draw_llvm.c | 71 |
1 files changed, 23 insertions, 48 deletions
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 19b75a5003b..8952dc8d3ba 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -713,39 +713,6 @@ fetch_instanced(struct gallivm_state *gallivm, static void -convert_to_soa(struct gallivm_state *gallivm, - LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32], - LLVMValueRef dst_soa[TGSI_NUM_CHANNELS], - const struct lp_type soa_type) -{ - unsigned j, k; - struct lp_type aos_channel_type = soa_type; - - LLVMValueRef aos_channels[TGSI_NUM_CHANNELS]; - unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS; - - debug_assert(TGSI_NUM_CHANNELS == 4); - debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0); - - aos_channel_type.length >>= 1; - - for (j = 0; j < TGSI_NUM_CHANNELS; ++j) { - LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 }; - - assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH); - - for (k = 0; k < pixels_per_channel; ++k) { - channel[k] = src_aos[j + TGSI_NUM_CHANNELS * k]; - } - - aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel); - } - - lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa); -} - - -static void fetch_vector(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type vs_type, @@ -755,11 +722,10 @@ fetch_vector(struct gallivm_state *gallivm, LLVMValueRef *inputs, LLVMValueRef indices) { - LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)); LLVMBuilderRef builder = gallivm->builder; struct lp_build_context blduivec; + struct lp_type fetch_type = vs_type; LLVMValueRef offset, valid_mask; - LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32]; unsigned i; lp_build_context_init(&blduivec, gallivm, lp_uint_type(vs_type)); @@ -783,28 +749,37 @@ fetch_vector(struct gallivm_state *gallivm, } /* - * Note: we probably really want to use SoA fetch, not AoS one (albeit - * for most formats it 
will amount to the same as this isn't very - * optimized). But looks dangerous since it assumes alignment. + * Unlike fetch_instanced, use SoA fetch instead of multiple AoS fetches. + * This should always produce better code. */ - for (i = 0; i < vs_type.length; i++) { - LLVMValueRef offset1, elem; - elem = lp_build_const_int32(gallivm, i); - offset1 = LLVMBuildExtractElement(builder, offset, elem, ""); - aos_fetch[i] = lp_build_fetch_rgba_aos(gallivm, format_desc, - lp_float32_vec4_type(), - FALSE, map_ptr, offset1, - zero, zero, NULL); + /* The type handling is annoying here... */ + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && + format_desc->channel[0].pure_integer) { + if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) { + fetch_type = lp_type_int_vec(vs_type.width, vs_type.width * vs_type.length); + } + else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) { + fetch_type = lp_type_uint_vec(vs_type.width, vs_type.width * vs_type.length); + } } - convert_to_soa(gallivm, aos_fetch, inputs, vs_type); + + lp_build_fetch_rgba_soa(gallivm, format_desc, + fetch_type, FALSE, map_ptr, offset, + blduivec.zero, blduivec.zero, + NULL, inputs); for (i = 0; i < TGSI_NUM_CHANNELS; i++) { + inputs[i] = LLVMBuildBitCast(builder, inputs[i], + lp_build_vec_type(gallivm, vs_type), ""); + } + + /* out-of-bound fetches return all zeros */ + for (i = 0; i < TGSI_NUM_CHANNELS; i++) { inputs[i] = LLVMBuildBitCast(builder, inputs[i], blduivec.vec_type, ""); inputs[i] = LLVMBuildAnd(builder, inputs[i], valid_mask, ""); inputs[i] = LLVMBuildBitCast(builder, inputs[i], lp_build_vec_type(gallivm, vs_type), ""); - } } |