diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/gallium/auxiliary/Makefile.sources | 2 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_format.h | 6 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_format_aos.c | 5 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_format_cached.c | 374 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c | 2229 | ||||
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 4 | ||||
-rw-r--r-- | src/gallium/auxiliary/meson.build | 2 |
7 files changed, 2239 insertions, 383 deletions
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources index 87a490e555d..50e88088ff8 100644 --- a/src/gallium/auxiliary/Makefile.sources +++ b/src/gallium/auxiliary/Makefile.sources @@ -418,11 +418,11 @@ GALLIVM_SOURCES := \ gallivm/lp_bld_flow.h \ gallivm/lp_bld_format_aos_array.c \ gallivm/lp_bld_format_aos.c \ - gallivm/lp_bld_format_cached.c \ gallivm/lp_bld_format_float.c \ gallivm/lp_bld_format.c \ gallivm/lp_bld_format.h \ gallivm/lp_bld_format_soa.c \ + gallivm/lp_bld_format_s3tc.c \ gallivm/lp_bld_format_srgb.c \ gallivm/lp_bld_format_yuv.c \ gallivm/lp_bld_gather.c \ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h index 6540caaa293..b1e95c4e6db 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h @@ -165,8 +165,12 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef j); +/* + * S3TC + */ + LLVMValueRef -lp_build_fetch_cached_texels(struct gallivm_state *gallivm, +lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm, const struct util_format_description *format_desc, unsigned n, LLVMValueRef base_ptr, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index b52acca1b3e..21680dba74a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -464,6 +464,7 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm, * \param ptr address of the pixel block (or the texel if uncompressed) * \param i, j the sub-block pixel coordinates. For non-compressed formats * these will always be (0, 0). + * \param cache optional value pointing to a lp_build_format_cache structure * \return a 4 element vector with the pixel's RGBA values. 
*/ LLVMValueRef @@ -728,7 +729,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, * s3tc rgb formats */ - if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) { + if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { struct lp_type tmp_type; LLVMValueRef tmp; @@ -737,7 +738,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, tmp_type.length = num_pixels * 4; tmp_type.norm = TRUE; - tmp = lp_build_fetch_cached_texels(gallivm, + tmp = lp_build_fetch_s3tc_rgba_aos(gallivm, format_desc, num_pixels, base_ptr, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c deleted file mode 100644 index e08062dcacd..00000000000 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c +++ /dev/null @@ -1,374 +0,0 @@ -/************************************************************************** - * - * Copyright 2015 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#include "lp_bld_format.h" -#include "lp_bld_type.h" -#include "lp_bld_struct.h" -#include "lp_bld_const.h" -#include "lp_bld_flow.h" -#include "lp_bld_swizzle.h" - -#include "util/u_math.h" - - -/** - * @file - * Complex block-compression based formats are handled here by using a cache, - * so re-decoding of every pixel is not required. - * Especially for bilinear filtering, texel reuse is very high hence even - * a small cache helps. - * The elements in the cache are the decoded blocks - currently things - * are restricted to formats which are 4x4 block based, and the decoded - * texels must fit into 4x8 bits. - * The cache is direct mapped so hitrates aren't all that great and cache - * thrashing could happen. 
- * - * @author Roland Scheidegger <[email protected]> - */ - - -#if LP_BUILD_FORMAT_CACHE_DEBUG -static void -update_cache_access(struct gallivm_state *gallivm, - LLVMValueRef ptr, - unsigned count, - unsigned index) -{ - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef member_ptr, cache_access; - - assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL || - index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); - - member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, ""); - cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access"); - cache_access = LLVMBuildAdd(builder, cache_access, - LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), - count, 0), ""); - LLVMBuildStore(builder, cache_access, member_ptr); -} -#endif - - -static void -store_cached_block(struct gallivm_state *gallivm, - LLVMValueRef *col, - LLVMValueRef tag_value, - LLVMValueRef hash_index, - LLVMValueRef cache) -{ - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef ptr, indices[3]; - LLVMTypeRef type_ptr4x32; - unsigned count; - - type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0); - indices[0] = lp_build_const_int32(gallivm, 0); - indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS); - indices[2] = hash_index; - ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), ""); - LLVMBuildStore(builder, tag_value, ptr); - - indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA); - hash_index = LLVMBuildMul(builder, hash_index, - lp_build_const_int32(gallivm, 16), ""); - for (count = 0; count < 4; count++) { - indices[2] = hash_index; - ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), ""); - ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, ""); - LLVMBuildStore(builder, col[count], ptr); - hash_index = LLVMBuildAdd(builder, hash_index, - lp_build_const_int32(gallivm, 4), ""); - } -} - - -static LLVMValueRef -lookup_cached_pixel(struct gallivm_state 
*gallivm, - LLVMValueRef ptr, - LLVMValueRef index) -{ - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef member_ptr, indices[3]; - - indices[0] = lp_build_const_int32(gallivm, 0); - indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA); - indices[2] = index; - member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), ""); - return LLVMBuildLoad(builder, member_ptr, "cache_data"); -} - - -static LLVMValueRef -lookup_tag_data(struct gallivm_state *gallivm, - LLVMValueRef ptr, - LLVMValueRef index) -{ - LLVMBuilderRef builder = gallivm->builder; - LLVMValueRef member_ptr, indices[3]; - - indices[0] = lp_build_const_int32(gallivm, 0); - indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS); - indices[2] = index; - member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), ""); - return LLVMBuildLoad(builder, member_ptr, "tag_data"); -} - - -static void -update_cached_block(struct gallivm_state *gallivm, - const struct util_format_description *format_desc, - LLVMValueRef ptr_addr, - LLVMValueRef hash_index, - LLVMValueRef cache) - -{ - LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); - LLVMTypeRef pi8t = LLVMPointerType(i8t, 0); - LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); - LLVMTypeRef i32x4 = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); - LLVMValueRef function; - LLVMValueRef tag_value, tmp_ptr; - LLVMValueRef col[4]; - unsigned i, j; - - /* - * Use format_desc->fetch_rgba_8unorm() for each pixel in the block. - * This doesn't actually make any sense whatsoever, someone would need - * to write a function doing this for all pixels in a block (either as - * an external c function or with generated code). Don't ask. 
- */ - - { - /* - * Function to call looks like: - * fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) - */ - LLVMTypeRef ret_type; - LLVMTypeRef arg_types[4]; - LLVMTypeRef function_type; - - assert(format_desc->fetch_rgba_8unorm); - - ret_type = LLVMVoidTypeInContext(gallivm->context); - arg_types[0] = pi8t; - arg_types[1] = pi8t; - arg_types[2] = i32t; - arg_types[3] = i32t; - function_type = LLVMFunctionType(ret_type, arg_types, - ARRAY_SIZE(arg_types), 0); - - /* make const pointer for the C fetch_rgba_8unorm function */ - function = lp_build_const_int_pointer(gallivm, - func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm)); - - /* cast the callee pointer to the function's type */ - function = LLVMBuildBitCast(builder, function, - LLVMPointerType(function_type, 0), - "cast callee"); - } - - tmp_ptr = lp_build_array_alloca(gallivm, i32x4, - lp_build_const_int32(gallivm, 16), - "tmp_decode_store"); - tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, pi8t, ""); - - /* - * Invoke format_desc->fetch_rgba_8unorm() for each pixel. - * This is going to be really really slow. - * Note: the block store format is actually - * x0y0x0y1x0y2x0y3 x1y0x1y1x1y2x1y3 ... - */ - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) { - LLVMValueRef args[4]; - LLVMValueRef dst_offset = lp_build_const_int32(gallivm, (i * 4 + j) * 4); - - /* - * Note we actually supply a pointer to the start of the block, - * not the start of the texture. - */ - args[0] = LLVMBuildGEP(gallivm->builder, tmp_ptr, &dst_offset, 1, ""); - args[1] = ptr_addr; - args[2] = LLVMConstInt(i32t, i, 0); - args[3] = LLVMConstInt(i32t, j, 0); - LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), ""); - } - } - - /* Finally store the block - pointless mem copy + update tag. 
*/ - tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, LLVMPointerType(i32x4, 0), ""); - for (i = 0; i < 4; ++i) { - LLVMValueRef tmp_offset = lp_build_const_int32(gallivm, i); - LLVMValueRef ptr = LLVMBuildGEP(gallivm->builder, tmp_ptr, &tmp_offset, 1, ""); - col[i] = LLVMBuildLoad(builder, ptr, ""); - } - - tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr, - LLVMInt64TypeInContext(gallivm->context), ""); - store_cached_block(gallivm, col, tag_value, hash_index, cache); -} - - -/* - * Do a cached lookup. - * - * Returns (vectors of) 4x8 rgba aos value - */ -LLVMValueRef -lp_build_fetch_cached_texels(struct gallivm_state *gallivm, - const struct util_format_description *format_desc, - unsigned n, - LLVMValueRef base_ptr, - LLVMValueRef offset, - LLVMValueRef i, - LLVMValueRef j, - LLVMValueRef cache) - -{ - LLVMBuilderRef builder = gallivm->builder; - unsigned count, low_bit, log2size; - LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp; - LLVMValueRef ij_index, hash_index, hash_mask, block_index; - LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); - LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); - LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context); - struct lp_type type; - struct lp_build_context bld32; - memset(&type, 0, sizeof type); - type.width = 32; - type.length = n; - - assert(format_desc->block.width == 4); - assert(format_desc->block.height == 4); - - lp_build_context_init(&bld32, gallivm, type); - - /* - * compute hash - we use direct mapped cache, the hash function could - * be better but it needs to be simple - * per-element: - * compare offset with offset stored at tag (hash) - * if not equal decode/store block, update tag - * extract color from cache - * assemble result vector - */ - - /* TODO: not ideal with 32bit pointers... 
*/ - - low_bit = util_logbase2(format_desc->block.bits / 8); - log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE); - addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, ""); - ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, ""); - ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc); - /* For the hash function, first mask off the unused lowest bits. Then just - do some xor with address bits - only use lower 32bits */ - ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, ""); - ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc, - lp_build_const_int_vec(gallivm, type, low_bit), ""); - /* This only really makes sense for size 64,128,256 */ - hash_index = ptr_addrtrunc; - ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc, - lp_build_const_int_vec(gallivm, type, 2*log2size), ""); - hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, ""); - tmp = LLVMBuildLShr(builder, hash_index, - lp_build_const_int_vec(gallivm, type, log2size), ""); - hash_index = LLVMBuildXor(builder, hash_index, tmp, ""); - - hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1); - hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, ""); - ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), ""); - ij_index = LLVMBuildAdd(builder, ij_index, j, ""); - block_index = LLVMBuildShl(builder, hash_index, - lp_build_const_int_vec(gallivm, type, 4), ""); - block_index = LLVMBuildAdd(builder, ij_index, block_index, ""); - - if (n > 1) { - color = LLVMGetUndef(LLVMVectorType(i32t, n)); - for (count = 0; count < n; count++) { - LLVMValueRef index, cond, colorx; - LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx; - struct lp_build_if_state if_ctx; - - index = lp_build_const_int32(gallivm, count); - offsetx = LLVMBuildExtractElement(builder, offset, index, ""); - addrx = LLVMBuildZExt(builder, offsetx, i64t, ""); - addrx = LLVMBuildAdd(builder, addrx, addr, ""); - block_indexx = 
LLVMBuildExtractElement(builder, block_index, index, ""); - hash_indexx = LLVMBuildLShr(builder, block_indexx, - lp_build_const_int32(gallivm, 4), ""); - offset_stored = lookup_tag_data(gallivm, cache, hash_indexx); - cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, ""); - - lp_build_if(&if_ctx, gallivm, cond); - { - ptr_addrx = LLVMBuildIntToPtr(builder, addrx, - LLVMPointerType(i8t, 0), ""); - update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache); -#if LP_BUILD_FORMAT_CACHE_DEBUG - update_cache_access(gallivm, cache, 1, - LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); -#endif - } - lp_build_endif(&if_ctx); - - colorx = lookup_cached_pixel(gallivm, cache, block_indexx); - - color = LLVMBuildInsertElement(builder, color, colorx, - lp_build_const_int32(gallivm, count), ""); - } - } - else { - LLVMValueRef cond; - struct lp_build_if_state if_ctx; - - tmp = LLVMBuildZExt(builder, offset, i64t, ""); - addr = LLVMBuildAdd(builder, tmp, addr, ""); - offset_stored = lookup_tag_data(gallivm, cache, hash_index); - cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, ""); - - lp_build_if(&if_ctx, gallivm, cond); - { - tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), ""); - update_cached_block(gallivm, format_desc, tmp, hash_index, cache); -#if LP_BUILD_FORMAT_CACHE_DEBUG - update_cache_access(gallivm, cache, 1, - LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); -#endif - } - lp_build_endif(&if_ctx); - - color = lookup_cached_pixel(gallivm, cache, block_index); - } -#if LP_BUILD_FORMAT_CACHE_DEBUG - update_cache_access(gallivm, cache, n, - LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL); -#endif - return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), ""); -} - diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c new file mode 100644 index 00000000000..2b143566f24 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c @@ -0,0 +1,2229 @@ 
+/************************************************************************** + * + * Copyright 2010-2018 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +/** + * @file + * s3tc pixel format manipulation. 
+ * + * @author Roland Scheidegger <[email protected]> + */ + + +#include "util/u_format.h" +#include "util/u_math.h" +#include "util/u_string.h" +#include "util/u_cpu_detect.h" +#include "util/u_debug.h" + +#include "lp_bld_arit.h" +#include "lp_bld_type.h" +#include "lp_bld_const.h" +#include "lp_bld_conv.h" +#include "lp_bld_gather.h" +#include "lp_bld_format.h" +#include "lp_bld_logic.h" +#include "lp_bld_pack.h" +#include "lp_bld_flow.h" +#include "lp_bld_printf.h" +#include "lp_bld_struct.h" +#include "lp_bld_swizzle.h" +#include "lp_bld_init.h" +#include "lp_bld_debug.h" +#include "lp_bld_intr.h" + + +/** + * Reverse an interleave2_half + * (ie. pick every second element, independent lower/upper halfs) + * sse2 can only do that with 32bit (shufps) or larger elements + * natively. (Otherwise, and/pack (even) or shift/pack (odd) + * could be used, ideally llvm would do that for us.) + * XXX: Unfortunately, this does NOT translate to a shufps if those + * are int vectors (and casting will not help, llvm needs to recognize it + * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq + * sequence which I'm pretty sure is a lot worse despite domain transition + * penalties with shufps (except maybe on Nehalem). 
+ */ +static LLVMValueRef +lp_build_uninterleave2_half(struct gallivm_state *gallivm, + struct lp_type type, + LLVMValueRef a, + LLVMValueRef b, + unsigned lo_hi) +{ + LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH]; + unsigned i, j; + + assert(type.length <= LP_MAX_VECTOR_LENGTH); + assert(lo_hi < 2); + + if (type.length * type.width == 256) { + assert(type.length >= 4); + for (i = 0, j = 0; i < type.length; ++i) { + if (i == type.length / 4) { + j = type.length; + } else if (i == type.length / 2) { + j = type.length / 2; + } else if (i == 3 * type.length / 4) { + j = 3 * type.length / 4; + } else { + j += 2; + } + elems[i] = lp_build_const_int32(gallivm, j + lo_hi); + } + } else { + for (i = 0; i < type.length; ++i) { + elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi); + } + } + + shuffle = LLVMConstVector(elems, type.length); + + return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, ""); + +} + + +/** + * Build shuffle for extending vectors. + */ +static LLVMValueRef +lp_build_const_extend_shuffle(struct gallivm_state *gallivm, + unsigned n, unsigned length) +{ + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + assert(n <= length); + assert(length <= LP_MAX_VECTOR_LENGTH); + + /* TODO: cache results in a static table */ + + for(i = 0; i < n; i++) { + elems[i] = lp_build_const_int32(gallivm, i); + } + for (i = n; i < length; i++) { + elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + } + + return LLVMConstVector(elems, length); +} + +static LLVMValueRef +lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n) +{ + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i, j; + + assert(n <= LP_MAX_VECTOR_LENGTH); + + /* TODO: cache results in a static table */ + + for(i = 0, j = 0; i < n; i += 2, ++j) { + elems[i + 0] = lp_build_const_int32(gallivm, 0 + j); + elems[i + 1] = lp_build_const_int32(gallivm, n + j); + elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j); + elems[n + i + 1] = 
lp_build_const_int32(gallivm, n + n/2 + j); + } + + return LLVMConstVector(elems, n * 2); +} + +/* + * broadcast 1 element to all elements + */ +static LLVMValueRef +lp_build_const_shuffle1(struct gallivm_state *gallivm, + unsigned index, unsigned n) +{ + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + assert(n <= LP_MAX_VECTOR_LENGTH); + + /* TODO: cache results in a static table */ + + for (i = 0; i < n; i++) { + elems[i] = lp_build_const_int32(gallivm, index); + } + + return LLVMConstVector(elems, n); +} + +/* + * move 1 element to pos 0, rest undef + */ +static LLVMValueRef +lp_build_shuffle1undef(struct gallivm_state *gallivm, + LLVMValueRef a, unsigned index, unsigned n) +{ + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH], shuf; + unsigned i; + + assert(n <= LP_MAX_VECTOR_LENGTH); + + elems[0] = lp_build_const_int32(gallivm, index); + + for (i = 1; i < n; i++) { + elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + } + shuf = LLVMConstVector(elems, n); + + return LLVMBuildShuffleVector(gallivm->builder, a, a, shuf, ""); +} + +static boolean +format_dxt1_variant(enum pipe_format format) +{ + return format == PIPE_FORMAT_DXT1_RGB || + format == PIPE_FORMAT_DXT1_RGBA || + format == PIPE_FORMAT_DXT1_SRGB || + format == PIPE_FORMAT_DXT1_SRGBA; + +} + +/** + * Gather elements from scatter positions in memory into vectors. + * This is customised for fetching texels from s3tc textures. + * For SSE, typical value is length=4. + * + * @param length length of the offsets + * @param colors the stored colors of the blocks will be extracted into this. + * @param codewords the codewords of the blocks will be extracted into this. + * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5 + * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5 + * @param base_ptr base pointer, should be a i8 pointer type. 
+ * @param offsets vector with offsets
+ *
+ * With length == 1 every output is a scalar i32 extracted from the block.
+ * With 64bit blocks there is no separate alpha data: alpha_lo/alpha_hi are
+ * set to undef for length == 1 and are left unwritten for length > 1.
+ */
+static void
+lp_build_gather_s3tc(struct gallivm_state *gallivm,
+                     unsigned length,
+                     const struct util_format_description *format_desc,
+                     LLVMValueRef *colors,
+                     LLVMValueRef *codewords,
+                     LLVMValueRef *alpha_lo,
+                     LLVMValueRef *alpha_hi,
+                     LLVMValueRef base_ptr,
+                     LLVMValueRef offsets)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned block_bits = format_desc->block.bits;
+   unsigned i;
+   LLVMValueRef elems[8];
+   LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
+   LLVMTypeRef type32dxt;
+   struct lp_type lp_type32dxt;
+
+   /* one whole block viewed as i32 elements: 2 (64bit) or 4 (128bit) */
+   memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
+   lp_type32dxt.width = 32;
+   lp_type32dxt.length = block_bits / 32;
+   type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
+
+   assert(block_bits == 64 || block_bits == 128);
+   assert(length == 1 || length == 4 || length == 8);
+
+   /* fetch one complete block per offset */
+   for (i = 0; i < length; ++i) {
+      elems[i] = lp_build_gather_elem(gallivm, length,
+                                      block_bits, block_bits, TRUE,
+                                      base_ptr, offsets, i, FALSE);
+      elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
+   }
+   if (length == 1) {
+      LLVMValueRef elem = elems[0];
+      if (block_bits == 128) {
+         /* 128bit block layout: alpha_lo, alpha_hi, colors, codewords */
+         *alpha_lo = LLVMBuildExtractElement(builder, elem,
+                                             lp_build_const_int32(gallivm, 0), "");
+         *alpha_hi = LLVMBuildExtractElement(builder, elem,
+                                             lp_build_const_int32(gallivm, 1), "");
+         *colors = LLVMBuildExtractElement(builder, elem,
+                                           lp_build_const_int32(gallivm, 2), "");
+         *codewords = LLVMBuildExtractElement(builder, elem,
+                                              lp_build_const_int32(gallivm, 3), "");
+      }
+      else {
+         /* 64bit block layout: colors, codewords */
+         *alpha_lo = LLVMGetUndef(type32);
+         *alpha_hi = LLVMGetUndef(type32);
+         *colors = LLVMBuildExtractElement(builder, elem,
+                                           lp_build_const_int32(gallivm, 0), "");
+         *codewords = LLVMBuildExtractElement(builder, elem,
+                                              lp_build_const_int32(gallivm, 1), "");
+      }
+   }
+   else {
+      LLVMValueRef tmp[4], cc01, cc23;
+      /*
+       * lp_type32dxt must not be redeclared in this scope: a shadowing copy
+       * would be uninitialized when handed to lp_build_concat() below.
+       */
+      struct lp_type lp_type32, lp_type64;
+      memset(&lp_type32, 0, sizeof lp_type32);
+      lp_type32.width = 32;
+      lp_type32.length = length;
+      memset(&lp_type64, 0, sizeof lp_type64);
+      lp_type64.width = 64;
+      lp_type64.length = length/2;
+
+      if (block_bits == 128) {
+         if (length == 8) {
+            /* pair up block i and block i+4 into one 8-wide vector */
+            for (i = 0; i < 4; ++i) {
+               tmp[0] = elems[i];
+               tmp[1] = elems[i+4];
+               elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
+            }
+         }
+         lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
+         *colors = tmp[2];
+         *codewords = tmp[3];
+         *alpha_lo = tmp[0];
+         *alpha_hi = tmp[1];
+      } else {
+         LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
+         LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
+
+         for (i = 0; i < length; ++i) {
+            /* widen <2 x i32> to <4 x i32>, upper two elements undef */
+            elems[i] = LLVMBuildShuffleVector(builder, elems[i],
+                                              LLVMGetUndef(type32dxt),
+                                              lp_build_const_extend_shuffle(gallivm, 2, 4), "");
+         }
+         if (length == 8) {
+            /*
+             * The operands are <4 x i32> at this point, so the source type
+             * given to lp_build_concat() has to be 4 wide (same as in the
+             * 128bit path), not the 8 wide lp_type32.
+             */
+            struct lp_type lp_type32_4;
+            memset(&lp_type32_4, 0, sizeof lp_type32_4);
+            lp_type32_4.width = 32;
+            lp_type32_4.length = 4;
+            for (i = 0; i < 4; ++i) {
+               tmp[0] = elems[i];
+               tmp[1] = elems[i+4];
+               elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
+            }
+         }
+         cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
+         cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
+         cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
+         cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
+         *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
+         *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
+         *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
+         *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
+      }
+   }
+}
+
+/** Convert from <n x i32> containing 2 x n rgb565 colors
+ * to 2 <n x i32> rgba8888 colors
+ * This is the most optimized version I can think of
+ * should be nearly as fast as decoding only one color
+ * NOTE: alpha channel will be set to 0
+ * @param colors is a <n x i32> vector containing the rgb565 colors
+ */
+static void
+color_expand2_565_to_8888(struct gallivm_state *gallivm,
+                          unsigned n,
+ 
LLVMValueRef colors, + LLVMValueRef *color0, + LLVMValueRef *color1) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef r, g, b, rblo, glo; + LLVMValueRef rgblomask, rb, rgb0, rgb1; + struct lp_type type, type16, type8; + + assert(n > 1); + + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + memset(&type16, 0, sizeof type16); + type16.width = 16; + type16.length = 2 * n; + + memset(&type8, 0, sizeof type8); + type8.width = 8; + type8.length = 4 * n; + + rgblomask = lp_build_const_int_vec(gallivm, type16, 0x0707); + colors = LLVMBuildBitCast(builder, colors, + lp_build_vec_type(gallivm, type16), ""); + /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits) + * make sure low bits of r are zero - could use AND but requires constant */ + r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), ""); + r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type16, 3), ""); + b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), ""); + rb = LLVMBuildOr(builder, r, b, ""); + rblo = LLVMBuildLShr(builder, rb, lp_build_const_int_vec(gallivm, type16, 5), ""); + /* don't have byte shift hence need mask */ + rblo = LLVMBuildAnd(builder, rblo, rgblomask, ""); + rb = LLVMBuildOr(builder, rb, rblo, ""); + + /* make sure low bits of g are zero */ + g = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type16, 0x07e0), ""); + g = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 3), ""); + glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 6), ""); + g = LLVMBuildOr(builder, g, glo, ""); + + rb = LLVMBuildBitCast(builder, rb, lp_build_vec_type(gallivm, type8), ""); + g = LLVMBuildBitCast(builder, g, lp_build_vec_type(gallivm, type8), ""); + rgb0 = lp_build_interleave2_half(gallivm, type8, rb, g, 0); + rgb1 = lp_build_interleave2_half(gallivm, type8, rb, g, 1); + + rgb0 = LLVMBuildBitCast(builder, rgb0, 
lp_build_vec_type(gallivm, type), ""); + rgb1 = LLVMBuildBitCast(builder, rgb1, lp_build_vec_type(gallivm, type), ""); + + /* rgb0 is rgb00, rgb01, rgb10, rgb11 + * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle + * on x86 this _should_ just generate one shufps... + */ + *color0 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 0); + *color1 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 1); +} + + +/** Convert from <n x i32> containing rgb565 colors + * (in first 16 bits) to <n x i32> rgba8888 colors + * bits 16-31 of each input element MBZ (must be zero) + * NOTE: alpha channel will be set to 0 + * @param colors is a <n x i32> vector containing the rgb565 colors + */ +static LLVMValueRef +color_expand_565_to_8888(struct gallivm_state *gallivm, + unsigned n, + LLVMValueRef colors) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef rgba, r, g, b, rgblo, glo; + LLVMValueRef rbhimask, g6mask, rgblomask; + struct lp_type type; + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + /* color expansion: + * first extract and shift colors into their final locations + * (high bits - low bits zero at this point) + * then replicate highest bits to the lowest bits + * note rb replication can be done in parallel but not g + * (different shift) + * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f + * rhigh = 8, ghigh = 5, bhigh = 19 + * rblow = 5, glow = 6 + * rgblowmask = 0x00070307 + * rbhimask = 0x00f800f8 + * r = colors >> rhigh + * b = colors << bhigh + * g = (colors & g6mask) << ghigh + * rb = (r | b) & rbhimask + * rbtmp = rb >> rblow + * gtmp = g >> glow + * rbtmp = rbtmp | gtmp + * rbtmp = rbtmp & rgblowmask + * rgb = rb | g | rbtmp + */ + g6mask = lp_build_const_int_vec(gallivm, type, 0x07e0); + rbhimask = lp_build_const_int_vec(gallivm, type, 0x00f800f8); + rgblomask = lp_build_const_int_vec(gallivm, type, 0x00070307); + + r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 8), ""); + b = LLVMBuildShl(builder, colors, 
lp_build_const_int_vec(gallivm, type, 19), ""); + g = LLVMBuildAnd(builder, colors, g6mask, ""); + g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 5), ""); + rgba = LLVMBuildOr(builder, r, b, ""); + rgba = LLVMBuildAnd(builder, rgba, rbhimask, ""); + rgblo = LLVMBuildLShr(builder, rgba, lp_build_const_int_vec(gallivm, type, 5), ""); + glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type, 6), ""); + rgblo = LLVMBuildOr(builder, rgblo, glo, ""); + rgblo = LLVMBuildAnd(builder, rgblo, rgblomask, ""); + rgba = LLVMBuildOr(builder, rgba, g, ""); + rgba = LLVMBuildOr(builder, rgba, rgblo, ""); + + return rgba; +} + + +/** + * Calculate 1/3(v1-v0) + v0 + * and 2*1/3(v1-v0) + v0 + */ +static void +lp_build_lerp23(struct lp_build_context *bld, + LLVMValueRef v0, + LLVMValueRef v1, + LLVMValueRef *res0, + LLVMValueRef *res1) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMValueRef x, x_lo, x_hi, delta_lo, delta_hi; + LLVMValueRef mul_lo, mul_hi, v0_lo, v0_hi, v1_lo, v1_hi, tmp; + const struct lp_type type = bld->type; + LLVMBuilderRef builder = bld->gallivm->builder; + struct lp_type i16_type = lp_wider_type(type); + struct lp_build_context bld2; + + assert(lp_check_value(type, v0)); + assert(lp_check_value(type, v1)); + assert(!type.floating && !type.fixed && !type.norm && type.width == 8); + + lp_build_context_init(&bld2, gallivm, i16_type); + bld2.type.sign = TRUE; + x = lp_build_const_int_vec(gallivm, bld->type, 255*1/3); + + /* FIXME: use native avx256 unpack/pack */ + lp_build_unpack2(gallivm, type, i16_type, x, &x_lo, &x_hi); + lp_build_unpack2(gallivm, type, i16_type, v0, &v0_lo, &v0_hi); + lp_build_unpack2(gallivm, type, i16_type, v1, &v1_lo, &v1_hi); + delta_lo = lp_build_sub(&bld2, v1_lo, v0_lo); + delta_hi = lp_build_sub(&bld2, v1_hi, v0_hi); + + mul_lo = LLVMBuildMul(builder, x_lo, delta_lo, ""); + mul_hi = LLVMBuildMul(builder, x_hi, delta_hi, ""); + + x_lo = LLVMBuildLShr(builder, mul_lo, 
lp_build_const_int_vec(gallivm, i16_type, 8), ""); + x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 8), ""); + /* lerp optimization: pack now, do add afterwards */ + tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi); + *res0 = lp_build_add(bld, tmp, v0); + + x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 7), ""); + x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 7), ""); + /* unlike above still need mask (but add still afterwards). */ + x_lo = LLVMBuildAnd(builder, x_lo, lp_build_const_int_vec(gallivm, i16_type, 0xff), ""); + x_hi = LLVMBuildAnd(builder, x_hi, lp_build_const_int_vec(gallivm, i16_type, 0xff), ""); + tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi); + *res1 = lp_build_add(bld, tmp, v0); +} + +/** + * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS + * @param colors is a <n x i32> vector with n x 2x16bit colors + * @param codewords is a <n x i32> vector containing the codewords + * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3) + * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3) + */ +static LLVMValueRef +s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm, + unsigned n, + enum pipe_format format, + LLVMValueRef colors, + LLVMValueRef codewords, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef color0, color1, color2, color3, color2_2, color3_2; + LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2; + LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices; + struct lp_type type, type8; + struct lp_build_context bld8, bld32; + boolean is_dxt1_variant = format_dxt1_variant(format); + + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + memset(&type8, 0, sizeof type8); + type8.width = 8; + type8.length = 4*n; + + assert(lp_check_value(type, i)); + assert(lp_check_value(type, j)); + + a = 
lp_build_const_int_vec(gallivm, type, 0xff000000); + + lp_build_context_init(&bld32, gallivm, type); + lp_build_context_init(&bld8, gallivm, type8); + + /* + * works as follows: + * - expand color0/color1 to rgba8888 + * - calculate color2/3 (interpolation) according to color0 < color1 rules + * - calculate color2/3 according to color0 >= color1 rules + * - do selection of color2/3 according to comparison of color0/1 + * - extract indices (vector shift). + * - use compare/select to select the correct color. Since we have 2bit + * indices (and 4 colors), needs at least three compare/selects. + */ + /* + * expand the two colors + */ + col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), ""); + col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), ""); + if (n > 1) { + color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1); + } + else { + color0 = color_expand_565_to_8888(gallivm, n, col0); + color1 = color_expand_565_to_8888(gallivm, n, col1); + } + + /* + * interpolate colors + * color2_1 is 2/3 color0 + 1/3 color1 + * color3_1 is 1/3 color0 + 2/3 color1 + * color2_2 is 1/2 color0 + 1/2 color1 + * color3_2 is 0 + */ + + colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, ""); + colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, ""); + /* can combine 2 lerps into one mostly - still looks expensive enough. 
*/ + lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3); + color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, ""); + color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, ""); + + /* dxt3/5 always use 4-color encoding */ + if (is_dxt1_variant) { + /* fix up alpha */ + if (format == PIPE_FORMAT_DXT1_RGBA || + format == PIPE_FORMAT_DXT1_SRGBA) { + color0 = LLVMBuildOr(builder, color0, a, ""); + color1 = LLVMBuildOr(builder, color1, a, ""); + color3 = LLVMBuildOr(builder, color3, a, ""); + } + /* + * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1. + * Much cheaper (but we don't care that much if n == 1). + */ + if ((util_cpu_caps.has_sse2 && n == 4) || + (util_cpu_caps.has_avx2 && n == 8)) { + LLVMValueRef intrargs[2]; + char *intr_name = n == 8 ? "llvm.x86.avx2.pavg.b" : + "llvm.x86.sse2.pavg.b"; + intrargs[0] = colors0; + intrargs[1] = colors1; + color2_2 = lp_build_intrinsic(builder, intr_name, + bld8.vec_type, intrargs, 2, 0); + color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, ""); + } + else { + struct lp_type i16_type = lp_wider_type(type8); + struct lp_build_context bld2; + LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi; + + lp_build_context_init(&bld2, gallivm, i16_type); + bld2.type.sign = TRUE; + + /* + * This isn't as expensive as it looks (the unpack is the same as + * for lerp23), with correct rounding. + * (Note that while rounding is correct, this will always round down, + * whereas pavgb will always round up.) 
+ */ + /* FIXME: use native avx256 unpack/pack */ + lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi); + lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi); + + addlo = lp_build_add(&bld2, v0_lo, v1_lo); + addhi = lp_build_add(&bld2, v0_hi, v1_hi); + addlo = LLVMBuildLShr(builder, addlo, + lp_build_const_int_vec(gallivm, i16_type, 1), ""); + addhi = LLVMBuildLShr(builder, addhi, + lp_build_const_int_vec(gallivm, i16_type, 1), ""); + color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi); + color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, ""); + } + color3_2 = lp_build_const_int_vec(gallivm, type, 0); + + /* select between colors2/3 */ + /* signed compare is faster saves some xors */ + type.sign = TRUE; + sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1); + color2 = lp_build_select(&bld32, sel_mask, color2, color2_2); + color3 = lp_build_select(&bld32, sel_mask, color3, color3_2); + type.sign = FALSE; + + if (format == PIPE_FORMAT_DXT1_RGBA || + format == PIPE_FORMAT_DXT1_SRGBA) { + color2 = LLVMBuildOr(builder, color2, a, ""); + } + } + + const2 = lp_build_const_int_vec(gallivm, type, 2); + /* extract 2-bit index values */ + bit_pos = LLVMBuildShl(builder, j, const2, ""); + bit_pos = LLVMBuildAdd(builder, bit_pos, i, ""); + bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, ""); + /* + * NOTE: This innocent looking shift is very expensive with x86/ssex. + * Shifts with per-element shift count get roughly translated to + * extract (count), extract (value), shift, move (back to xmm), unpack + * per element! + * So about 20 instructions here for 4xi32. + * Newer llvm versions (3.7+) will not do extract/insert but use + * a couple constant count vector shifts plus shuffles. About same + * amount of instructions unfortunately... + * Would get much worse with 8xi16 even... + * We could actually do better here: + * - subtract bit_pos from 128+30, shl 23, convert float to int... 
+ * - now do mul with codewords followed by shr 30... + * But requires 32bit->32bit mul, sse41 only (well that's emulatable + * with 2 32bit->64bit muls...) and not exactly cheap + * AVX2, of course, fixes this nonsense. + */ + indices = LLVMBuildLShr(builder, codewords, bit_pos, ""); + + /* finally select the colors */ + sel_lo = LLVMBuildAnd(builder, indices, bld32.one, ""); + sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one); + color0 = lp_build_select(&bld32, sel_lo, color1, color0); + color2 = lp_build_select(&bld32, sel_lo, color3, color2); + sel_hi = LLVMBuildAnd(builder, indices, const2, ""); + sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2); + rgba = lp_build_select(&bld32, sel_hi, color2, color0); + + /* fix up alpha */ + if (format == PIPE_FORMAT_DXT1_RGB || + format == PIPE_FORMAT_DXT1_SRGB) { + rgba = LLVMBuildOr(builder, rgba, a, ""); + } + return LLVMBuildBitCast(builder, rgba, bld8.vec_type, ""); +} + + +static LLVMValueRef +s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm, + unsigned n, + enum pipe_format format, + LLVMValueRef colors, + LLVMValueRef codewords, + LLVMValueRef i, + LLVMValueRef j) +{ + return s3tc_dxt1_full_to_rgba_aos(gallivm, n, format, + colors, codewords, i, j); +} + + +/** + * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS + * @param colors is a <n x i32> vector with n x 2x16bit colors + * @param codewords is a <n x i32> vector containing the codewords + * @param alpha_low, alpha_hi are <n x i32> vectors containing the low and high 32 bits of the alpha values + * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3) + * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3) + */ +static LLVMValueRef +s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm, + unsigned n, + enum pipe_format format, + LLVMValueRef colors, + LLVMValueRef codewords, + LLVMValueRef alpha_low, + LLVMValueRef alpha_hi, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMBuilderRef builder = gallivm->builder; + 
LLVMValueRef rgba, tmp, tmp2; + LLVMValueRef bit_pos, sel_mask; + struct lp_type type, type8; + struct lp_build_context bld; + + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + memset(&type8, 0, sizeof type8); + type8.width = 8; + type8.length = n*4; + + assert(lp_check_value(type, i)); + assert(lp_check_value(type, j)); + + lp_build_context_init(&bld, gallivm, type); + + rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format, + colors, codewords, i, j); + + rgba = LLVMBuildBitCast(builder, rgba, bld.vec_type, ""); + + /* + * Extract alpha values. Since we now need to select from + * which 32bit vector values are fetched, construct selection + * mask from highest bit of bit_pos, and use select, then shift + * according to the bit_pos (without the highest bit). + * Note this is pointless for n == 1 case. Could just + * directly use 64bit arithmetic if we'd extract 64bit + * alpha value instead of 2x32... + */ + /* pos = 4*(4j+i) */ + bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), ""); + bit_pos = LLVMBuildAdd(builder, bit_pos, i, ""); + bit_pos = LLVMBuildShl(builder, bit_pos, + lp_build_const_int_vec(gallivm, type, 2), ""); + sel_mask = LLVMBuildLShr(builder, bit_pos, + lp_build_const_int_vec(gallivm, type, 5), ""); + sel_mask = LLVMBuildSub(builder, sel_mask, bld.one, ""); + tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi); + bit_pos = LLVMBuildAnd(builder, bit_pos, + lp_build_const_int_vec(gallivm, type, 0xffffffdf), ""); + /* Warning: slow shift with per element count */ + /* + * Could do pshufb here as well - just use appropriate 2 bits in bit_pos + * to select the right byte with pshufb. Then for the remaining one bit + * just do shift/select. 
+ */ + tmp = LLVMBuildLShr(builder, tmp, bit_pos, ""); + + /* combined expand from a4 to a8 and shift into position */ + tmp = LLVMBuildShl(builder, tmp, lp_build_const_int_vec(gallivm, type, 28), ""); + tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 4), ""); + tmp = LLVMBuildOr(builder, tmp, tmp2, ""); + + rgba = LLVMBuildOr(builder, tmp, rgba, ""); + + return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), ""); +} + +static LLVMValueRef +lp_build_lerpdxta(struct gallivm_state *gallivm, + LLVMValueRef alpha0, + LLVMValueRef alpha1, + LLVMValueRef code, + LLVMValueRef sel_mask, + unsigned n) +{ + /* + * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41 + * (plus pmullw is actually faster...) + * we just pretend our 32bit values (which are really only 8bit) are 16bits. + * Note that this is obviously a disaster for the scalar case. + */ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef delta, ainterp; + LLVMValueRef weight5, weight7, weight; + struct lp_type type32, type16, type8; + struct lp_build_context bld16; + + memset(&type32, 0, sizeof type32); + type32.width = 32; + type32.length = n; + memset(&type16, 0, sizeof type16); + type16.width = 16; + type16.length = 2*n; + type16.sign = TRUE; + memset(&type8, 0, sizeof type8); + type8.width = 8; + type8.length = 4*n; + + lp_build_context_init(&bld16, gallivm, type16); + /* 255/7 is a bit off - increase accuracy at the expense of shift later */ + sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, ""); + weight5 = lp_build_const_int_vec(gallivm, type16, 255*64/5); + weight7 = lp_build_const_int_vec(gallivm, type16, 255*64/7); + weight = lp_build_select(&bld16, sel_mask, weight7, weight5); + + alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, ""); + alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, ""); + code = LLVMBuildBitCast(builder, code, bld16.vec_type, ""); + /* we'll get garbage in the elements which 
had code 0 (or larger than 5 or 7) + but we don't care */ + code = LLVMBuildSub(builder, code, bld16.one, ""); + + weight = LLVMBuildMul(builder, weight, code, ""); + weight = LLVMBuildLShr(builder, weight, + lp_build_const_int_vec(gallivm, type16, 6), ""); + + delta = LLVMBuildSub(builder, alpha1, alpha0, ""); + + ainterp = LLVMBuildMul(builder, delta, weight, ""); + ainterp = LLVMBuildLShr(builder, ainterp, + lp_build_const_int_vec(gallivm, type16, 8), ""); + + ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type8), ""); + alpha0 = LLVMBuildBitCast(builder, alpha0, lp_build_vec_type(gallivm, type8), ""); + ainterp = LLVMBuildAdd(builder, alpha0, ainterp, ""); + ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type32), ""); + + return ainterp; +} + +/** + * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS + * @param colors is a <n x i32> vector with n x 2x16bit colors + * @param codewords is a <n x i32> vector containing the codewords + * @param alphas is a <n x i64> vector containing the alpha values + * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3) + * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3) + */ +static LLVMValueRef +s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm, + unsigned n, + enum pipe_format format, + LLVMValueRef colors, + LLVMValueRef codewords, + LLVMValueRef alpha_lo, + LLVMValueRef alpha_hi, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef rgba, tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift; + LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s; + LLVMValueRef mask6, mask7, ainterp; + LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context); + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + struct lp_type type, type8; + struct lp_build_context bld32; + + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + memset(&type8, 0, sizeof type8); + 
type8.width = 8; + type8.length = n*4; + + assert(lp_check_value(type, i)); + assert(lp_check_value(type, j)); + + lp_build_context_init(&bld32, gallivm, type); + + assert(lp_check_value(type, i)); + assert(lp_check_value(type, j)); + + rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format, + colors, codewords, i, j); + + rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, ""); + + /* this looks pretty complex for vectorization: + * extract a0/a1 values + * extract code + * select weights for interpolation depending on a0 > a1 + * mul weights by code - 1 + * lerp a0/a1/weights + * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0 + */ + + alpha0 = LLVMBuildAnd(builder, alpha_lo, + lp_build_const_int_vec(gallivm, type, 0xff), ""); + alpha1 = LLVMBuildLShr(builder, alpha_lo, + lp_build_const_int_vec(gallivm, type, 8), ""); + alpha1 = LLVMBuildAnd(builder, alpha1, + lp_build_const_int_vec(gallivm, type, 0xff), ""); + + /* pos = 3*(4j+i) */ + bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), ""); + bit_pos = LLVMBuildAdd(builder, bit_pos, i, ""); + tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, ""); + bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, ""); + /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */ + bit_pos = LLVMBuildAdd(builder, bit_pos, + lp_build_const_int_vec(gallivm, type, 16), ""); + + if (n == 1) { + struct lp_type type64; + memset(&type64, 0, sizeof type64); + type64.width = 64; + type64.length = 1; + /* This is pretty pointless could avoid by just directly extracting + 64bit in the first place but makes it more complicated elsewhere */ + alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, ""); + alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, ""); + alphac0 = LLVMBuildShl(builder, alpha_hi, + lp_build_const_int_vec(gallivm, type64, 32), ""); + alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, ""); + + shift = LLVMBuildZExt(builder, bit_pos, i64t, ""); + alphac0 = LLVMBuildLShr(builder, alphac0, 
shift, ""); + alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, ""); + alphac = LLVMBuildAnd(builder, alphac0, + lp_build_const_int_vec(gallivm, type, 0x7), ""); + } + else { + /* + * Using non-native vector length here (actually, with avx2 and + * n == 4 llvm will indeed expand to ymm regs...) + * At least newer llvm versions handle that ok. + * llvm 3.7+ will even handle the emulated 64bit shift with variable + * shift count without extraction (and it's actually easier to + * emulate than the 32bit one). + */ + alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi, + lp_build_const_unpackx2_shuffle(gallivm, n), ""); + + alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), ""); + shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), ""); + alphac = LLVMBuildLShr(builder, alpha64, shift, ""); + alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, ""); + + alphac = LLVMBuildAnd(builder, alphac, + lp_build_const_int_vec(gallivm, type, 0x7), ""); + } + + /* signed compare is faster saves some xors */ + type.sign = TRUE; + /* alpha0 > alpha1 selection */ + sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, + alpha0, alpha1); + ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n); + + /* + * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise. + * else we select a0 for case 0, a1 for case 1, + * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7 + * a = (c == 0) ? a0 : a1 + * a = (c > 1) ? ainterp : a + * Finally handle case 6/7 for !(a0 > a1) + * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask) + * a = (!(a0 > a1) && c == 7) ? 
0xffffffff : a (or with mask) + */ + tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, + alphac, bld32.zero); + alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1); + tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, + alphac, bld32.one); + alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha); + + code_s = LLVMBuildAnd(builder, alphac, + LLVMBuildNot(builder, sel_mask, ""), ""); + mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, + code_s, lp_build_const_int_vec(gallivm, type, 6)); + mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, + code_s, lp_build_const_int_vec(gallivm, type, 7)); + alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), ""); + alpha = LLVMBuildOr(builder, alpha, mask7, ""); + + alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), ""); + rgba = LLVMBuildOr(builder, alpha, rgba, ""); + + return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), ""); +} + + +static void +lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + LLVMValueRef *dxt_block, + LLVMValueRef ptr) +{ + LLVMBuilderRef builder = gallivm->builder; + unsigned block_bits = format_desc->block.bits; + LLVMValueRef elem, shuf; + LLVMTypeRef type32 = LLVMIntTypeInContext(gallivm->context, 32); + LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, block_bits); + LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); + LLVMTypeRef type32_4 = LLVMVectorType(type32, 4); + + assert(block_bits == 64 || block_bits == 128); + + ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, ""); + elem = LLVMBuildLoad(builder, ptr, ""); + + if (block_bits == 128) { + /* just return block as is */ + *dxt_block = LLVMBuildBitCast(builder, elem, type32_4, ""); + } + else { + LLVMTypeRef type32_2 = LLVMVectorType(type32, 2); + shuf = lp_build_const_extend_shuffle(gallivm, 2, 4); + elem = LLVMBuildBitCast(builder, elem, type32_2, 
""); + *dxt_block = LLVMBuildShuffleVector(builder, elem, + LLVMGetUndef(type32_2), shuf, ""); + } +} + + +static void +s3tc_store_cached_block(struct gallivm_state *gallivm, + LLVMValueRef *col, + LLVMValueRef tag_value, + LLVMValueRef hash_index, + LLVMValueRef cache) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef ptr, indices[3]; + LLVMTypeRef type_ptr4x32; + unsigned count; + + type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0); + indices[0] = lp_build_const_int32(gallivm, 0); + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS); + indices[2] = hash_index; + ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), ""); + LLVMBuildStore(builder, tag_value, ptr); + + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA); + hash_index = LLVMBuildMul(builder, hash_index, + lp_build_const_int32(gallivm, 16), ""); + for (count = 0; count < 4; count++) { + indices[2] = hash_index; + ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), ""); + ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, ""); + LLVMBuildStore(builder, col[count], ptr); + hash_index = LLVMBuildAdd(builder, hash_index, + lp_build_const_int32(gallivm, 4), ""); + } +} + +static LLVMValueRef +s3tc_lookup_cached_pixel(struct gallivm_state *gallivm, + LLVMValueRef ptr, + LLVMValueRef index) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef member_ptr, indices[3]; + + indices[0] = lp_build_const_int32(gallivm, 0); + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA); + indices[2] = index; + member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), ""); + return LLVMBuildLoad(builder, member_ptr, "cache_data"); +} + +static LLVMValueRef +s3tc_lookup_tag_data(struct gallivm_state *gallivm, + LLVMValueRef ptr, + LLVMValueRef index) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef member_ptr, indices[3]; + + 
indices[0] = lp_build_const_int32(gallivm, 0); + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS); + indices[2] = index; + member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), ""); + return LLVMBuildLoad(builder, member_ptr, "tag_data"); +} + +#if LP_BUILD_FORMAT_CACHE_DEBUG +static void +s3tc_update_cache_access(struct gallivm_state *gallivm, + LLVMValueRef ptr, + unsigned count, + unsigned index) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef member_ptr, cache_access; + + assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL || + index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); + + member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, ""); + cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access"); + cache_access = LLVMBuildAdd(builder, cache_access, + LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), + count, 0), ""); + LLVMBuildStore(builder, cache_access, member_ptr); +} +#endif + +/** + * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0. + * The lerp is performed between the first 2 32bit colors + * in the source vector, both results are returned packed in result vector. 
+ */ +static LLVMValueRef +lp_build_lerp23_single(struct lp_build_context *bld, + LLVMValueRef v01) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMValueRef x, mul, delta, res, v0, v1, elems[8]; + const struct lp_type type = bld->type; + LLVMBuilderRef builder = bld->gallivm->builder; + struct lp_type i16_type = lp_wider_type(type); + struct lp_type i32_type = lp_wider_type(i16_type); + struct lp_build_context bld2; + + assert(!type.floating && !type.fixed && !type.norm && type.width == 8); + + lp_build_context_init(&bld2, gallivm, i16_type); + bld2.type.sign = TRUE; + + /* weights 256/3, 256*2/3, with correct rounding */ + elems[0] = elems[1] = elems[2] = elems[3] = + lp_build_const_elem(gallivm, i16_type, 255*1/3); + elems[4] = elems[5] = elems[6] = elems[7] = + lp_build_const_elem(gallivm, i16_type, 171); + x = LLVMConstVector(elems, 8); + + /* + * v01 has col0 in 32bit elem 0, col1 in elem 1. + * Interleave/unpack will give us separate v0/v1 vectors. + */ + v01 = lp_build_interleave2(gallivm, i32_type, v01, v01, 0); + v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, ""); + + lp_build_unpack2(gallivm, type, i16_type, v01, &v0, &v1); + delta = lp_build_sub(&bld2, v1, v0); + + mul = LLVMBuildMul(builder, x, delta, ""); + + mul = LLVMBuildLShr(builder, mul, lp_build_const_int_vec(gallivm, i16_type, 8), ""); + /* lerp optimization: pack now, do add afterwards */ + res = lp_build_pack2(gallivm, i16_type, type, mul, bld2.undef); + /* only lower 2 elems are valid - for these v0 is really v0 */ + return lp_build_add(bld, res, v01); +} + +/* + * decode one dxt1 block. 
+ */ +static void +s3tc_decode_block_dxt1(struct gallivm_state *gallivm, + enum pipe_format format, + LLVMValueRef dxt_block, + LLVMValueRef *col) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef color01, color23, color01_16, color0123; + LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2; + struct lp_type type8, type32, type16, type64; + struct lp_build_context bld8, bld32, bld16, bld64; + unsigned i; + boolean is_dxt1_variant = format_dxt1_variant(format); + + memset(&type32, 0, sizeof type32); + type32.width = 32; + type32.length = 4; + type32.sign = TRUE; + + memset(&type8, 0, sizeof type8); + type8.width = 8; + type8.length = 16; + + memset(&type16, 0, sizeof type16); + type16.width = 16; + type16.length = 8; + + memset(&type64, 0, sizeof type64); + type64.width = 64; + type64.length = 2; + + a = lp_build_const_int_vec(gallivm, type32, 0xff000000); + const2 = lp_build_const_int_vec(gallivm, type32, 2); + + lp_build_context_init(&bld32, gallivm, type32); + lp_build_context_init(&bld16, gallivm, type16); + lp_build_context_init(&bld8, gallivm, type8); + lp_build_context_init(&bld64, gallivm, type64); + + if (is_dxt1_variant) { + color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4); + code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4); + } else { + color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4); + code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4); + } + code = LLVMBuildBitCast(builder, code, bld8.vec_type, ""); + /* expand bytes to dwords */ + code = lp_build_interleave2(gallivm, type8, code, code, 0); + code = lp_build_interleave2(gallivm, type8, code, code, 0); + + + /* + * works as follows: + * - expand color0/color1 to rgba8888 + * - calculate color2/3 (interpolation) according to color0 < color1 rules + * - calculate color2/3 according to color0 >= color1 rules + * - do selection of color2/3 according to comparison of color0/1 + * - extract indices. + * - use compare/select to select the correct color. 
Since we have 2bit + * indices (and 4 colors), needs at least three compare/selects. + */ + + /* + * expand the two colors + */ + color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, ""); + color01 = lp_build_interleave2(gallivm, type16, color01, + bld16.zero, 0); + color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, ""); + color01 = color_expand_565_to_8888(gallivm, 4, color01_16); + + /* + * interpolate colors + * color2_1 is 2/3 color0 + 1/3 color1 + * color3_1 is 1/3 color0 + 2/3 color1 + * color2_2 is 1/2 color0 + 1/2 color1 + * color3_2 is 0 + */ + + /* TODO: since this is now always scalar, should + * probably just use control flow here instead of calculating + * both cases and then selection + */ + if (format == PIPE_FORMAT_DXT1_RGBA || + format == PIPE_FORMAT_DXT1_SRGBA) { + color01 = LLVMBuildOr(builder, color01, a, ""); + } + /* can combine 2 lerps into one mostly */ + color23 = lp_build_lerp23_single(&bld8, color01); + color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, ""); + + /* dxt3/5 always use 4-color encoding */ + if (is_dxt1_variant) { + LLVMValueRef color23_2, color2_2; + + if (util_cpu_caps.has_sse2) { + LLVMValueRef intrargs[2]; + intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, ""); + /* same interleave as for lerp23 - correct result in 2nd element */ + intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0); + intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, ""); + color2_2 = lp_build_intrinsic(builder, "llvm.x86.sse2.pavg.b", + bld8.vec_type, intrargs, 2, 0); + } + else { + LLVMValueRef v01, v0, v1, vhalf; + /* + * This isn't as expensive as it looks (the unpack is the same as + * for lerp23, which is the reason why we do the pointless + * interleave2 too), with correct rounding (the two lower elements + * will be the same). 
+ */ + v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0); + v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, ""); + lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1); + vhalf = lp_build_add(&bld16, v0, v1); + vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, ""); + color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef); + } + /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */ + color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, ""); + color23_2 = LLVMBuildLShr(builder, color23_2, + lp_build_const_int_vec(gallivm, type64, 32), ""); + color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, ""); + + tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, ""); + tmp = LLVMBuildLShr(builder, tmp, + lp_build_const_int_vec(gallivm, type64, 32), ""); + tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, ""); + sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER, + color01_16, tmp); + sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0); + color23 = lp_build_select(&bld32, sel_mask, color23, color23_2); + } + + if (util_cpu_caps.has_ssse3) { + /* + * Use pshufb as mini-lut. (Only doable with intrinsics as the + * final shuffles are non-constant. pshufb is awesome!) + */ + LLVMValueRef shuf[16], low2mask; + LLVMValueRef intrargs[2], lut_ind, lut_adj; + + color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, ""); + color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, ""); + color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0); + color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, ""); + + if (format == PIPE_FORMAT_DXT1_RGB || + format == PIPE_FORMAT_DXT1_SRGB) { + color0123 = LLVMBuildOr(builder, color0123, a, ""); + } + + /* shuffle as r0r1r2r3g0g1... 
*/ + for (i = 0; i < 4; i++) { + shuf[4*i] = lp_build_const_int32(gallivm, 0 + i); + shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i); + shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i); + shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i); + } + color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, ""); + color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef, + LLVMConstVector(shuf, 16), ""); + + /* lowest 2 bits of each 8 bit value contain index into "LUT" */ + low2mask = lp_build_const_int_vec(gallivm, type8, 3); + /* add 0/4/8/12 for r/g/b/a */ + lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400); + lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, ""); + intrargs[0] = color0123; + for (i = 0; i < 4; i++) { + lut_ind = LLVMBuildAnd(builder, code, low2mask, ""); + lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, ""); + intrargs[1] = lut_ind; + col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128", + bld8.vec_type, intrargs, 2, 0); + col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, ""); + code = LLVMBuildBitCast(builder, code, bld32.vec_type, ""); + code = LLVMBuildLShr(builder, code, const2, ""); + code = LLVMBuildBitCast(builder, code, bld8.vec_type, ""); + } + } + else { + /* Thanks to vectorization can do 4 texels in parallel */ + LLVMValueRef color0, color1, color2, color3; + if (format == PIPE_FORMAT_DXT1_RGB || + format == PIPE_FORMAT_DXT1_SRGB) { + color01 = LLVMBuildOr(builder, color01, a, ""); + color23 = LLVMBuildOr(builder, color23, a, ""); + } + color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef, + lp_build_const_shuffle1(gallivm, 0, 4), ""); + color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef, + lp_build_const_shuffle1(gallivm, 1, 4), ""); + color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef, + lp_build_const_shuffle1(gallivm, 0, 4), ""); + color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef, + lp_build_const_shuffle1(gallivm, 1, 4), 
""); + code = LLVMBuildBitCast(builder, code, bld32.vec_type, ""); + + for (i = 0; i < 4; i++) { + /* select the colors */ + LLVMValueRef selmasklo, rgba01, rgba23, bitlo; + bitlo = bld32.one; + indices = LLVMBuildAnd(builder, code, bitlo, ""); + selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL, + indices, bitlo); + rgba01 = lp_build_select(&bld32, selmasklo, color1, color0); + + LLVMValueRef selmaskhi; + indices = LLVMBuildAnd(builder, code, const2, ""); + selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL, + indices, const2); + rgba23 = lp_build_select(&bld32, selmasklo, color3, color2); + rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01); + + /* + * Note that this will give "wrong" order. + * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ... + * This would be easily fixable by using different shuffle, bitlo/hi + * vectors above (and different shift), but seems slightly easier to + * deal with for dxt3/dxt5 alpha too. So instead change lookup. + */ + col[i] = rgba; + code = LLVMBuildLShr(builder, code, const2, ""); + } + } +} + +/* + * decode one dxt3 block. 
+ */ +static void +s3tc_decode_block_dxt3(struct gallivm_state *gallivm, + enum pipe_format format, + LLVMValueRef dxt_block, + LLVMValueRef *col) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef alpha, alphas0, alphas1, shift4_16, a[4], mask8hi; + struct lp_type type32, type8, type16; + unsigned i; + + memset(&type32, 0, sizeof type32); + type32.width = 32; + type32.length = 4; + + memset(&type8, 0, sizeof type8); + type8.width = 8; + type8.length = 16; + + memset(&type16, 0, sizeof type16); + type16.width = 16; + type16.length = 8; + + s3tc_decode_block_dxt1(gallivm, format, dxt_block, col); + + shift4_16 = lp_build_const_int_vec(gallivm, type16, 4); + mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000); + + alpha = LLVMBuildBitCast(builder, dxt_block, + lp_build_vec_type(gallivm, type8), ""); + alpha = lp_build_interleave2(gallivm, type8, alpha, alpha, 0); + alpha = LLVMBuildBitCast(builder, alpha, + lp_build_vec_type(gallivm, type16), ""); + alpha = LLVMBuildAnd(builder, alpha, + lp_build_const_int_vec(gallivm, type16, 0xf00f), ""); + alphas0 = LLVMBuildLShr(builder, alpha, shift4_16, ""); + alphas1 = LLVMBuildShl(builder, alpha, shift4_16, ""); + alpha = LLVMBuildOr(builder, alphas0, alpha, ""); + alpha = LLVMBuildOr(builder, alphas1, alpha, ""); + alpha = LLVMBuildBitCast(builder, alpha, + lp_build_vec_type(gallivm, type32), ""); + /* + * alpha now contains elems 0,1,2,3,... (ubytes) + * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which + * is just as easy as "natural" order - 3 shift/and instead of 6 unpack). 
+ */ + a[0] = LLVMBuildShl(builder, alpha, + lp_build_const_int_vec(gallivm, type32, 24), ""); + a[1] = LLVMBuildShl(builder, alpha, + lp_build_const_int_vec(gallivm, type32, 16), ""); + a[1] = LLVMBuildAnd(builder, a[1], mask8hi, ""); + a[2] = LLVMBuildShl(builder, alpha, + lp_build_const_int_vec(gallivm, type32, 8), ""); + a[2] = LLVMBuildAnd(builder, a[2], mask8hi, ""); + a[3] = LLVMBuildAnd(builder, alpha, mask8hi, ""); + + for (i = 0; i < 4; i++) { + col[i] = LLVMBuildOr(builder, col[i], a[i], ""); + } +} + + +static LLVMValueRef +lp_build_lerpdxta_block(struct gallivm_state *gallivm, + LLVMValueRef alpha0, + LLVMValueRef alpha1, + LLVMValueRef code, + LLVMValueRef sel_mask) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef delta, ainterp; + LLVMValueRef weight5, weight7, weight; + struct lp_type type16; + struct lp_build_context bld; + + memset(&type16, 0, sizeof type16); + type16.width = 16; + type16.length = 8; + type16.sign = TRUE; + + lp_build_context_init(&bld, gallivm, type16); + /* + * 256/7 is only 36.57 so we'd lose quite some precision. Since it would + * actually be desirable to do this here with even higher accuracy than + * even 8 bit (more or less required for rgtc, albeit that's not handled + * here right now), shift the weights after multiplication by code. + */ + weight5 = lp_build_const_int_vec(gallivm, type16, 256*64/5); + weight7 = lp_build_const_int_vec(gallivm, type16, 256*64/7); + weight = lp_build_select(&bld, sel_mask, weight7, weight5); + + /* + * we'll get garbage in the elements which had code 0 (or larger than + * 5 or 7) but we don't care (or rather, need to fix up anyway). 
/*
 * decode one dxt5 block.
 *
 * The colors are decoded with s3tc_decode_block_dxt1(). The interpolated
 * alpha values are then computed into a[0..3] (alpha in the top byte of each
 * dword) and ORed into col[0..3].
 *
 * Two code paths exist, selected at code generation time by
 * util_cpu_caps.has_ssse3: a pshufb based one and a fallback which uses
 * lp_build_lerpdxta_block() plus compare/select fixups.
 */
static void
s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
                       enum pipe_format format,
                       LLVMValueRef dxt_block,
                       LLVMValueRef *col)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef alpha, alpha0, alpha1, ares;
   LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
   LLVMValueRef a[4], acode, tmp0, tmp1;
   LLVMTypeRef i64t, i32t;
   struct lp_type type32, type64, type8, type16;
   struct lp_build_context bld16, bld8;
   unsigned i;

   memset(&type32, 0, sizeof type32);
   type32.width = 32;
   type32.length = 4;

   memset(&type64, 0, sizeof type64);
   type64.width = 64;
   type64.length = 2;

   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 16;

   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 8;

   lp_build_context_init(&bld16, gallivm, type16);
   lp_build_context_init(&bld8, gallivm, type8);

   i64t = lp_build_vec_type(gallivm, type64);
   i32t = lp_build_vec_type(gallivm, type32);

   /* color part is decoded exactly like dxt1 */
   s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);

   /*
    * three possible strategies for vectorizing alpha:
    * 1) compute all 8 values then use scalar extraction
    *    (i.e. have all 8 alpha values packed in one 64bit scalar
    *    and do something like ax = vals >> (codex * 8) followed
    *    by inserting these values back into color)
    * 2) same as 1) but just use pshufb as a mini-LUT for selection.
    *    (without pshufb would need boatloads of cmp/selects trying to
    *    keep things vectorized for essentially scalar selection).
    * 3) do something similar to the uncached case
    *    needs more calculations (need to calc 16 values instead of 8 though
    *    that's only an issue for the lerp which we need to do twice otherwise
    *    everything still fits into 128bit) but keeps things vectorized mostly.
    * Trying 3) here though not sure it's really faster...
    * With pshufb, we try 2) (cheaper and more accurate)
    */

   /*
    * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
    * help since code crosses 8bit boundaries). But variable shifts are
    * AVX2 only, and even then only dword/quadword (intel _really_ hates
    * shifts!). Instead, emulate by 16bit muls.
    * Also, the required byte shuffles are essentially non-emulatable, so
    * require ssse3 (albeit other archs might do them fine).
    * This is not directly tied to ssse3 - just need sane byte shuffles.
    * But ordering is going to be different below so use same condition.
    */


   /* vectorize alpha */
   /*
    * alpha0 is byte 0 and alpha1 byte 1 of the block; both get broadcast to
    * all eight 16 bit lanes by the shuffles below.
    */
   alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
   alpha0 = LLVMBuildAnd(builder, alpha,
                         lp_build_const_int_vec(gallivm, type64, 0xff), "");
   alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
   alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
   alpha1 = LLVMBuildLShr(builder, alpha,
                          lp_build_const_int_vec(gallivm, type16, 8), "");
   alpha = LLVMBuildBitCast(builder, alpha,  i64t, "");
   shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
   /* XXX this shuffle broken with LLVM 2.8 */
   alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
   alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");

   /*
    * sel_mask is set where alpha0 > alpha1. The compare is done as signed
    * 16 bit (sign is toggled only for this call), the result is kept as a
    * byte vector.
    */
   type16.sign = TRUE;
   sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
                               alpha0, alpha1);
   type16.sign = FALSE;
   sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");

   if (!util_cpu_caps.has_ssse3) {
      LLVMValueRef acodeg, mask1, acode0, acode1;

      /* extraction of the 3 bit values into something more useful is HARD */
      /* first steps are actually scalar */
      acode = LLVMBuildLShr(builder, alpha,
                            lp_build_const_int_vec(gallivm, type64, 16), "");
      tmp0 = LLVMBuildAnd(builder, acode,
                          lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
      tmp1 = LLVMBuildLShr(builder, acode,
                           lp_build_const_int_vec(gallivm, type64, 24), "");
      tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
      tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
      acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
      /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
      tmp0 = LLVMBuildAnd(builder, acode,
                          lp_build_const_int_vec(gallivm, type32, 0xfff), "");
      tmp1 = LLVMBuildLShr(builder, acode,
                           lp_build_const_int_vec(gallivm, type32, 12), "");
      acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
      /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
      tmp0 = LLVMBuildAnd(builder, acode,
                          lp_build_const_int_vec(gallivm, type32, 0x3f), "");
      tmp1 = LLVMBuildLShr(builder, acode,
                           lp_build_const_int_vec(gallivm, type32, 6), "");
      /* use signed pack doesn't matter and otherwise need sse41 */
      type32.sign = type16.sign = TRUE;
      acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
      type32.sign = type16.sign = FALSE;
      /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
      acode0 = LLVMBuildAnd(builder, acode,
                            lp_build_const_int_vec(gallivm, type16, 0x7), "");
      acode1 = LLVMBuildLShr(builder, acode,
                             lp_build_const_int_vec(gallivm, type16, 3), "");
      acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
      /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */

      /* acodeg: codes kept only where alpha0 <= alpha1, zero elsewhere */
      acodeg = LLVMBuildAnd(builder, acode,
                            LLVMBuildNot(builder, sel_mask, ""), "");
      mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
                               acode, bld8.one);

      /* the lerp helper works on 16 bit lanes, so view the mask as such */
      sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
      ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
      ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
      sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
      ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
      alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
      alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
      ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
      /* Fix up val01 */
      sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
                                   acode, bld8.zero);
      ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
      ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);

      /* fix up val67 if a0 <= a1 */
      /* code 6 -> 0 (AND with the inverted mask), code 7 -> 0xff (OR mask) */
      sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
                                   acodeg, lp_build_const_int_vec(gallivm, type8, 6));
      ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
      sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
                                   acodeg, lp_build_const_int_vec(gallivm, type8, 7));
      ares = LLVMBuildOr(builder, ares, sel_mask2, "");

      /* unpack in right order (0,4,8,12,1,5,..) */
      /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
      tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
      tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
      tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
      tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");

      a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
      a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
      a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
      a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
   }
   else {
      LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
      LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
      /* NOTE: this i shadows the function-scope i for the rest of the branch */
      unsigned i, j;
      /*
       * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
       * help since code crosses 8bit boundaries). But variable shifts are
       * AVX2 only, and even then only dword/quadword (intel _really_ hates
       * shifts!). Instead, emulate by 16bit muls.
       * Also, the required byte shuffles are essentially non-emulatable, so
       * require ssse3 (albeit other archs might do them fine, but the
       * complete path is ssse3 only for now).
       */
      /*
       * Byte shuffle indices: bytes 2,2,2,3,3,4,4,4 for the first eight
       * output bytes and 5,5,5,6,6,7,7,7 for the second eight.
       */
      for (i = 0, j = 0; i < 16; i += 8, j += 3) {
         elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
         elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
         elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
      }
      shufa = LLVMConstVector(elems, 16);
      alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
      acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
      acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
      /*
       * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
       * Do the same for 1/3/5/7 (albeit still need mask there - ideally
       * we'd place them into bits 4-7 so could save shift but impossible.)
       */
      for (i = 0; i < 8; i += 4) {
         elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
         elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
         elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
         elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
      }
      mulclo = LLVMConstVector(elems, 8);
      for (i = 0; i < 8; i += 4) {
         elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
         elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
         elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
         elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
      }
      mulchi = LLVMConstVector(elems, 8);

      /*
       * The multiplies act as per-lane left shifts; afterwards the even
       * codes sit in the low byte and the odd codes (masked with 0x700) in
       * the high byte of each 16 bit lane.
       */
      tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
      tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
      tmp0 = LLVMBuildLShr(builder, tmp0,
                           lp_build_const_int_vec(gallivm, type16, 13), "");
      tmp1 = LLVMBuildLShr(builder, tmp1,
                           lp_build_const_int_vec(gallivm, type16, 5), "");
      tmp1 = LLVMBuildAnd(builder, tmp1,
                          lp_build_const_int_vec(gallivm, type16, 0x700), "");
      acode = LLVMBuildOr(builder, tmp0, tmp1, "");
      acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");

      /*
       * Note that ordering is different here to non-ssse3 path:
       * 0/1/2/3/4/5...
       */

      LLVMValueRef weight0, weight1, weight, delta;
      LLVMValueRef constff_elem7, const0_elem6;
      /* weights, correctly rounded (round(256*x/7)) */
      /* indexed by code: code 0 selects alpha0 (256), code 1 alpha1 (0) */
      elems[0] = LLVMConstInt(type16s, 256, 0);
      elems[1] = LLVMConstInt(type16s, 0, 0);
      elems[2] = LLVMConstInt(type16s, 219, 0);
      elems[3] = LLVMConstInt(type16s, 183, 0);
      elems[4] = LLVMConstInt(type16s, 146, 0);
      elems[5] = LLVMConstInt(type16s, 110, 0);
      elems[6] = LLVMConstInt(type16s, 73, 0);
      elems[7] = LLVMConstInt(type16s, 37, 0);
      weight0 = LLVMConstVector(elems, 8);

      /* same for the 6-value mode (round(256*x/5)); codes 6/7 fixed below */
      elems[0] = LLVMConstInt(type16s, 256, 0);
      elems[1] = LLVMConstInt(type16s, 0, 0);
      elems[2] = LLVMConstInt(type16s, 205, 0);
      elems[3] = LLVMConstInt(type16s, 154, 0);
      elems[4] = LLVMConstInt(type16s, 102, 0);
      elems[5] = LLVMConstInt(type16s, 51, 0);
      elems[6] = LLVMConstInt(type16s, 0, 0);
      elems[7] = LLVMConstInt(type16s, 0, 0);
      weight1 = LLVMConstVector(elems, 8);

      /* sel_mask is a byte vector, so select on the byte view of the weights */
      weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
      weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
      weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
      weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");

      /* all zero except byte 7 = 0xff */
      for (i = 0; i < 16; i++) {
         elems[i] = LLVMConstNull(type8s);
      }
      elems[7] = LLVMConstInt(type8s, 255, 0);
      constff_elem7 = LLVMConstVector(elems, 16);

      /* all 0xff except byte 6 = 0 */
      for (i = 0; i < 16; i++) {
         elems[i] = LLVMConstInt(type8s, 255, 0);
      }
      elems[6] = LLVMConstInt(type8s, 0, 0);
      const0_elem6 = LLVMConstVector(elems, 16);

      /* standard simple lerp - but the version we need isn't available */
      delta = LLVMBuildSub(builder, alpha0, alpha1, "");
      ainterp = LLVMBuildMul(builder, delta, weight, "");
      ainterp = LLVMBuildLShr(builder, ainterp,
                              lp_build_const_int_vec(gallivm, type16, 8), "");
      /* the add is done on bytes so it wraps at 8 bits */
      ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
      alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
      ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
      ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
      ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);

      /* fixing 0/0xff case is slightly more complex */
      /* only where sel_mask is clear: entry 7 becomes 0xff, entry 6 becomes 0 */
      constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
                                   LLVMBuildNot(builder, sel_mask, ""), "");
      const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
      ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
      ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");

      /* now pick all 16 elements at once! */
      intrargs[0] = ainterp;
      intrargs[1] = acode;
      ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
                                bld8.vec_type, intrargs, 2, 0);

      /* move byte k of every dword into the top (alpha) byte for a[k] */
      ares = LLVMBuildBitCast(builder, ares, i32t, "");
      mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
      a[0] = LLVMBuildShl(builder, ares,
                          lp_build_const_int_vec(gallivm, type32, 24), "");
      a[1] = LLVMBuildShl(builder, ares,
                          lp_build_const_int_vec(gallivm, type32, 16), "");
      a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
      a[2] = LLVMBuildShl(builder, ares,
                          lp_build_const_int_vec(gallivm, type32, 8), "");
      a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
      a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
   }

   /* merge alpha into the decoded colors */
   for (i = 0; i < 4; i++) {
      a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
      col[i] = LLVMBuildOr(builder, col[i], a[i], "");
   }
}
LLVMBuildBitCast(builder, ainterp, bld16.vec_type, ""); + ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef); + + /* fixing 0/0xff case is slightly more complex */ + constff_elem7 = LLVMBuildAnd(builder, constff_elem7, + LLVMBuildNot(builder, sel_mask, ""), ""); + const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, ""); + ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, ""); + ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, ""); + + /* now pick all 16 elements at once! */ + intrargs[0] = ainterp; + intrargs[1] = acode; + ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128", + bld8.vec_type, intrargs, 2, 0); + + ares = LLVMBuildBitCast(builder, ares, i32t, ""); + mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000); + a[0] = LLVMBuildShl(builder, ares, + lp_build_const_int_vec(gallivm, type32, 24), ""); + a[1] = LLVMBuildShl(builder, ares, + lp_build_const_int_vec(gallivm, type32, 16), ""); + a[1] = LLVMBuildAnd(builder, a[1], mask8hi, ""); + a[2] = LLVMBuildShl(builder, ares, + lp_build_const_int_vec(gallivm, type32, 8), ""); + a[2] = LLVMBuildAnd(builder, a[2], mask8hi, ""); + a[3] = LLVMBuildAnd(builder, ares, mask8hi, ""); + } + + for (i = 0; i < 4; i++) { + a[i] = LLVMBuildBitCast(builder, a[i], i32t, ""); + col[i] = LLVMBuildOr(builder, col[i], a[i], ""); + } +} + + +static void +generate_update_cache_one_block(struct gallivm_state *gallivm, + LLVMValueRef function, + const struct util_format_description *format_desc) +{ + LLVMBasicBlockRef block; + LLVMBuilderRef old_builder; + LLVMValueRef ptr_addr; + LLVMValueRef hash_index; + LLVMValueRef cache; + LLVMValueRef dxt_block, tag_value; + LLVMValueRef col[LP_MAX_VECTOR_LENGTH]; + + ptr_addr = LLVMGetParam(function, 0); + hash_index = LLVMGetParam(function, 1); + cache = LLVMGetParam(function, 2); + + lp_build_name(ptr_addr, "ptr_addr" ); + lp_build_name(hash_index, "hash_index"); + lp_build_name(cache, "cache_addr"); + + /* + * Function body + */ + + 
old_builder = gallivm->builder; + block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry"); + gallivm->builder = LLVMCreateBuilderInContext(gallivm->context); + LLVMPositionBuilderAtEnd(gallivm->builder, block); + + lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &dxt_block, + ptr_addr); + + switch (format_desc->format) { + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT1_SRGB: + case PIPE_FORMAT_DXT1_SRGBA: + s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col); + break; + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT3_SRGBA: + s3tc_decode_block_dxt3(gallivm, format_desc->format, dxt_block, col); + break; + case PIPE_FORMAT_DXT5_RGBA: + case PIPE_FORMAT_DXT5_SRGBA: + s3tc_decode_block_dxt5(gallivm, format_desc->format, dxt_block, col); + break; + default: + assert(0); + s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col); + break; + } + + tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr, + LLVMInt64TypeInContext(gallivm->context), ""); + s3tc_store_cached_block(gallivm, col, tag_value, hash_index, cache); + + LLVMBuildRetVoid(gallivm->builder); + + LLVMDisposeBuilder(gallivm->builder); + gallivm->builder = old_builder; + + gallivm_verify_function(gallivm, function); +} + + +static void +update_cached_block(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + LLVMValueRef ptr_addr, + LLVMValueRef hash_index, + LLVMValueRef cache) + +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMModuleRef module = gallivm->module; + char name[256]; + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); + LLVMTypeRef pi8t = LLVMPointerType(i8t, 0); + LLVMValueRef function, inst; + LLVMBasicBlockRef bb; + LLVMValueRef args[3]; + + util_snprintf(name, sizeof name, "%s_update_cache_one_block", + format_desc->short_name); + function = LLVMGetNamedFunction(module, name); + + if (!function) { + LLVMTypeRef ret_type; + LLVMTypeRef 
arg_types[3]; + LLVMTypeRef function_type; + unsigned arg; + + /* + * Generate the function prototype. + */ + + ret_type = LLVMVoidTypeInContext(gallivm->context); + arg_types[0] = pi8t; + arg_types[1] = LLVMInt32TypeInContext(gallivm->context); + arg_types[2] = LLVMTypeOf(cache); // XXX: put right type here + function_type = LLVMFunctionType(ret_type, arg_types, ARRAY_SIZE(arg_types), 0); + function = LLVMAddFunction(module, name, function_type); + + for (arg = 0; arg < ARRAY_SIZE(arg_types); ++arg) + if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind) + lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS); + + LLVMSetFunctionCallConv(function, LLVMFastCallConv); + LLVMSetVisibility(function, LLVMHiddenVisibility); + generate_update_cache_one_block(gallivm, function, format_desc); + } + + args[0] = ptr_addr; + args[1] = hash_index; + args[2] = cache; + + LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), ""); + bb = LLVMGetInsertBlock(builder); + inst = LLVMGetLastInstruction(bb); + LLVMSetInstructionCallConv(inst, LLVMFastCallConv); +} + +/* + * cached lookup + */ +static LLVMValueRef +compressed_fetch_cached(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + unsigned n, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef i, + LLVMValueRef j, + LLVMValueRef cache) + +{ + LLVMBuilderRef builder = gallivm->builder; + unsigned count, low_bit, log2size; + LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp; + LLVMValueRef ij_index, hash_index, hash_mask, block_index; + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context); + struct lp_type type; + struct lp_build_context bld32; + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + lp_build_context_init(&bld32, gallivm, type); + + /* + * compute hash - we use direct mapped cache, the hash 
function could + * be better but it needs to be simple + * per-element: + * compare offset with offset stored at tag (hash) + * if not equal extract block, store block, update tag + * extract color from cache + * assemble colors + */ + + low_bit = util_logbase2(format_desc->block.bits / 8); + log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE); + addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, ""); + ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, ""); + ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc); + /* For the hash function, first mask off the unused lowest bits. Then just + do some xor with address bits - only use lower 32bits */ + ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, ""); + ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc, + lp_build_const_int_vec(gallivm, type, low_bit), ""); + /* This only really makes sense for size 64,128,256 */ + hash_index = ptr_addrtrunc; + ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc, + lp_build_const_int_vec(gallivm, type, 2*log2size), ""); + hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, ""); + tmp = LLVMBuildLShr(builder, hash_index, + lp_build_const_int_vec(gallivm, type, log2size), ""); + hash_index = LLVMBuildXor(builder, hash_index, tmp, ""); + + hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1); + hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, ""); + ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), ""); + ij_index = LLVMBuildAdd(builder, ij_index, j, ""); + block_index = LLVMBuildShl(builder, hash_index, + lp_build_const_int_vec(gallivm, type, 4), ""); + block_index = LLVMBuildAdd(builder, ij_index, block_index, ""); + + if (n > 1) { + color = bld32.undef; + for (count = 0; count < n; count++) { + LLVMValueRef index, cond, colorx; + LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx; + struct lp_build_if_state if_ctx; + + index = lp_build_const_int32(gallivm, 
count); + offsetx = LLVMBuildExtractElement(builder, offset, index, ""); + addrx = LLVMBuildZExt(builder, offsetx, i64t, ""); + addrx = LLVMBuildAdd(builder, addrx, addr, ""); + block_indexx = LLVMBuildExtractElement(builder, block_index, index, ""); + hash_indexx = LLVMBuildLShr(builder, block_indexx, + lp_build_const_int32(gallivm, 4), ""); + offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx); + cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, ""); + + lp_build_if(&if_ctx, gallivm, cond); + { + ptr_addrx = LLVMBuildIntToPtr(builder, addrx, + LLVMPointerType(i8t, 0), ""); + update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache); +#if LP_BUILD_FORMAT_CACHE_DEBUG + s3tc_update_cache_access(gallivm, cache, 1, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); +#endif + } + lp_build_endif(&if_ctx); + + colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx); + + color = LLVMBuildInsertElement(builder, color, colorx, + lp_build_const_int32(gallivm, count), ""); + } + } + else { + LLVMValueRef cond; + struct lp_build_if_state if_ctx; + + tmp = LLVMBuildZExt(builder, offset, i64t, ""); + addr = LLVMBuildAdd(builder, tmp, addr, ""); + offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index); + cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, ""); + + lp_build_if(&if_ctx, gallivm, cond); + { + tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), ""); + update_cached_block(gallivm, format_desc, tmp, hash_index, cache); +#if LP_BUILD_FORMAT_CACHE_DEBUG + s3tc_update_cache_access(gallivm, cache, 1, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); +#endif + } + lp_build_endif(&if_ctx); + + color = s3tc_lookup_cached_pixel(gallivm, cache, block_index); + } +#if LP_BUILD_FORMAT_CACHE_DEBUG + s3tc_update_cache_access(gallivm, cache, n, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL); +#endif + return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), ""); +} + + +static LLVMValueRef 
+s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm, + unsigned n, + enum pipe_format format, + LLVMValueRef colors, + LLVMValueRef codewords, + LLVMValueRef alpha_lo, + LLVMValueRef alpha_hi, + LLVMValueRef i, + LLVMValueRef j) +{ + return s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors, + codewords, alpha_lo, alpha_hi, i, j); +} + + +/** + * @param n number of pixels processed (usually n=4, but it should also work with n=1 + * and multiples of 4) + * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture) + * @param offset <n x i32> vector with the relative offsets of the S3TC blocks + * @param i is a <n x i32> vector with the x subpixel coordinate (0..3) + * @param j is a <n x i32> vector with the y subpixel coordinate (0..3) + * @return a <4*n x i8> vector with the pixel RGBA values in AoS + */ +LLVMValueRef +lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + unsigned n, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef i, + LLVMValueRef j, + LLVMValueRef cache) +{ + LLVMValueRef rgba; + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); + LLVMBuilderRef builder = gallivm->builder; + + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC); + assert(format_desc->block.width == 4); + assert(format_desc->block.height == 4); + + assert((n == 1) || (n % 4 == 0)); + +/* debug_printf("format = %d\n", format_desc->format);*/ + if (cache) { + rgba = compressed_fetch_cached(gallivm, format_desc, n, + base_ptr, offset, i, j, cache); + return rgba; + } + + if (n > 4) { + unsigned count; + LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n); + LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128); + LLVMTypeRef i128_vectype = LLVMVectorType(i128_type, n / 4); + LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext( + gallivm->context), 4); + LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16]; + struct lp_type 
lp_324_vectype = lp_type_uint_vec(32, 128); + + assert(n / 4 <= ARRAY_SIZE(rgba4)); + + rgba = LLVMGetUndef(i128_vectype); + + for (count = 0; count < n / 4; count++) { + LLVMValueRef colors, codewords, alpha_lo, alpha_hi; + + i4 = lp_build_extract_range(gallivm, i, count * 4, 4); + j4 = lp_build_extract_range(gallivm, j, count * 4, 4); + offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4); + + lp_build_gather_s3tc(gallivm, 4, format_desc, &colors, &codewords, + &alpha_lo, &alpha_hi, base_ptr, offset4); + + switch (format_desc->format) { + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT1_SRGB: + case PIPE_FORMAT_DXT1_SRGBA: + rgba4[count] = s3tc_dxt1_to_rgba_aos(gallivm, 4, format_desc->format, + colors, codewords, i4, j4); + break; + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT3_SRGBA: + rgba4[count] = s3tc_dxt3_to_rgba_aos(gallivm, 4, format_desc->format, colors, + codewords, alpha_lo, alpha_hi, i4, j4); + break; + case PIPE_FORMAT_DXT5_RGBA: + case PIPE_FORMAT_DXT5_SRGBA: + rgba4[count] = s3tc_dxt5_to_rgba_aos(gallivm, 4, format_desc->format, colors, + codewords, alpha_lo, alpha_hi, i4, j4); + break; + default: + assert(0); + rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4)); + break; + } + /* shuffles typically give best results with dword elements...*/ + rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, ""); + } + rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4); + rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, ""); + } + else { + LLVMValueRef colors, codewords, alpha_lo, alpha_hi; + + lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords, + &alpha_lo, &alpha_hi, base_ptr, offset); + + switch (format_desc->format) { + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT1_SRGB: + case PIPE_FORMAT_DXT1_SRGBA: + rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format, + colors, codewords, i, j); + break; + case PIPE_FORMAT_DXT3_RGBA: 
+ case PIPE_FORMAT_DXT3_SRGBA: + rgba = s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors, + codewords, alpha_lo, alpha_hi, i, j); + break; + case PIPE_FORMAT_DXT5_RGBA: + case PIPE_FORMAT_DXT5_SRGBA: + rgba = s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors, + codewords, alpha_lo, alpha_hi, i, j); + break; + default: + assert(0); + rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n)); + break; + } + } + + /* always return just decompressed values - srgb conversion is done later */ + + return rgba; +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 018cca8f9df..a6662c5e01b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -3549,10 +3549,6 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, const struct util_format_description *format_desc; format_desc = util_format_description(static_texture_state->format); if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { - /* - * This is not 100% correct, if we have cache but the - * util_format_s3tc_prefer is true the cache won't get used - * regardless (could hook up the block decode there...) */ need_cache = TRUE; } } diff --git a/src/gallium/auxiliary/meson.build b/src/gallium/auxiliary/meson.build index a4dbcf7b4ca..57f7e69050f 100644 --- a/src/gallium/auxiliary/meson.build +++ b/src/gallium/auxiliary/meson.build @@ -389,8 +389,8 @@ if with_llvm 'gallivm/lp_bld_flow.h', 'gallivm/lp_bld_format_aos_array.c', 'gallivm/lp_bld_format_aos.c', - 'gallivm/lp_bld_format_cached.c', 'gallivm/lp_bld_format_float.c', + 'gallivm/lp_bld_format_s3tc.c', 'gallivm/lp_bld_format.c', 'gallivm/lp_bld_format.h', 'gallivm/lp_bld_format_soa.c', |