author     Oded Gabbay <[email protected]>    2015-12-03 09:11:13 +0200
committer  Oded Gabbay <[email protected]>    2016-01-06 14:54:16 +0200
commit     e99555ef0bf1b786a1bf1e93f3304507dbb6e939 (patch)
tree       52f453f225000a16e93846adce45cbd1ec66df97 /src/gallium/auxiliary
parent     afe88f66a8a9cf3c6bf6ea5d3e00589c22219c30 (diff)
llvmpipe: add POWER8 portability file - u_pwr8.h
This file provides a portability layer that will make it easier to convert
SSE-based functions to VMX/VSX-based functions.
All the functions implemented in this file are prefixed with "vec_".
Therefore, when converting an SSE-based function, one simply needs to
replace the "_mm_" prefix of each SSE function being called with "vec_".
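As an illustrative sketch (not part of the patch; sum and diff are
hypothetical names), a converted call site changes only in the intrinsic
prefix:

   /* SSE2 original:
    *    __m128i sum  = _mm_add_epi32 (a, b);
    *    __m128i diff = _mm_sub_epi32 (a, b);
    * VMX/VSX conversion via u_pwr8.h -- same arguments, same __m128i
    * type (as typedef'ed in this header), only the prefix changes:
    */
   __m128i sum  = vec_add_epi32 (a, b);
   __m128i diff = vec_sub_epi32 (a, b);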
Having said that, not all functions could be converted as such, due to
differences between the architectures. So, where such a conversion would
hurt performance, I preferred to implement a more ad-hoc solution. For
example, _mm_shuffle_epi32 had to be converted using ad-hoc masks instead
of a generic function.
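For instance (an illustrative sketch, not code from the patch;
vec_splat_lane0 is a hypothetical name), a fixed shuffle such as
_mm_shuffle_epi32 (v, _MM_SHUFFLE (0, 0, 0, 0)) can be expressed as a
vec_perm with an explicit byte mask, assuming the little-endian element
order this file targets:

static inline __m128i
vec_splat_lane0 (__m128i v)
{
   /* bytes 0..3 hold 32-bit element 0 on a little-endian machine;
    * repeating them four times broadcasts that element to all lanes */
   static const __m128i mask =
      { 0, 1, 2, 3,  0, 1, 2, 3,  0, 1, 2, 3,  0, 1, 2, 3 };

   return vec_perm (v, v, mask);
}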
All the functions in this file support both little-endian and big-endian,
but currently the file is built only on POWER8 LE machines.
All of the functions are implemented using Altivec/VMX intrinsics, except
one where I needed to use inline assembly (due to a missing intrinsic).
v2:
- Use vec_vgbbd instead of __builtin_vec_vgbbd
- Add an aligned load function
- Don't use typeof()
- Make file build only on POWER8 LE machine
Signed-off-by: Oded Gabbay <[email protected]>
Reviewed-by: Roland Scheidegger <[email protected]>
Diffstat (limited to 'src/gallium/auxiliary')
-rw-r--r--   src/gallium/auxiliary/util/u_pwr8.h   310
1 file changed, 310 insertions, 0 deletions
diff --git a/src/gallium/auxiliary/util/u_pwr8.h b/src/gallium/auxiliary/util/u_pwr8.h
new file mode 100644
index 00000000000..1eca6d6df2c
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_pwr8.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2015 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Oded Gabbay <[email protected]>
+ */
+
+/**
+ * @file
+ * POWER8 intrinsics portability header.
+ *
+ */
+
+#ifndef U_PWR8_H_
+#define U_PWR8_H_
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16)))
+
+typedef VECTOR_ALIGN_16 vector unsigned char __m128i;
+
+typedef VECTOR_ALIGN_16 union m128i {
+   __m128i m128i;
+   vector signed int m128si;
+   vector unsigned int m128ui;
+   ubyte ub[16];
+   ushort us[8];
+   int i[4];
+   uint ui[4];
+} __m128i_union;
+
+static inline __m128i
+vec_set_epi32 (int i3, int i2, int i1, int i0)
+{
+   __m128i_union vdst;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   vdst.i[0] = i0;
+   vdst.i[1] = i1;
+   vdst.i[2] = i2;
+   vdst.i[3] = i3;
+#else
+   vdst.i[3] = i0;
+   vdst.i[2] = i1;
+   vdst.i[1] = i2;
+   vdst.i[0] = i3;
+#endif
+
+   return (__m128i) vdst.m128si;
+}
+
+static inline __m128i
+vec_setr_epi32 (int i0, int i1, int i2, int i3)
+{
+   return vec_set_epi32 (i3, i2, i1, i0);
+}
+
+static inline __m128i
+vec_unpacklo_epi32 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
+#else
+      {24, 25, 26, 27,  8,  9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpackhi_epi32 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
+#else
+      {16, 17, 18, 19,  0,  1,  2,  3, 20, 21, 22, 23,  4,  5,  6,  7};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpacklo_epi64 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23};
+#else
+      {24, 25, 26, 27, 28, 29, 30, 31,  8,  9, 10, 11, 12, 13, 14, 15};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpackhi_epi64 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+      { 8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+#else
+      {16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_add_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_add ((vector signed int) a, (vector signed int) b);
+}
+
+static inline __m128i
+vec_sub_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_sub ((vector signed int) a, (vector signed int) b);
+}
+
+/* Call this function ONLY on POWER8 and newer platforms */
+static inline __m128i
+vec_mullo_epi32 (__m128i a, __m128i b)
+{
+   __m128i v;
+
+   __asm__(
+      "vmuluwm %0, %1, %2   \n"
+      : "=v" (v)
+      : "v" (a), "v" (b)
+   );
+
+   return v;
+}
+
+static inline void
+transpose4_epi32(const __m128i * restrict a,
+                 const __m128i * restrict b,
+                 const __m128i * restrict c,
+                 const __m128i * restrict d,
+                 __m128i * restrict o,
+                 __m128i * restrict p,
+                 __m128i * restrict q,
+                 __m128i * restrict r)
+{
+   __m128i t0 = vec_unpacklo_epi32(*a, *b);
+   __m128i t1 = vec_unpacklo_epi32(*c, *d);
+   __m128i t2 = vec_unpackhi_epi32(*a, *b);
+   __m128i t3 = vec_unpackhi_epi32(*c, *d);
+
+   *o = vec_unpacklo_epi64(t0, t1);
+   *p = vec_unpackhi_epi64(t0, t1);
+   *q = vec_unpacklo_epi64(t2, t3);
+   *r = vec_unpackhi_epi64(t2, t3);
+}
+
+static inline __m128i
+vec_slli_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sl ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_srli_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sr ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_srai_epi32 (__m128i vsrc, unsigned int count)
+{
+   __m128i_union vec_count;
+
+   if (count >= 32)
+      return (__m128i) vec_splats (0);
+   else if (count == 0)
+      return vsrc;
+
+   /* In VMX, all shift count fields must contain the same value */
+   vec_count.m128si = (vector signed int) vec_splats (count);
+   return (__m128i) vec_sra ((vector signed int) vsrc, vec_count.m128ui);
+}
+
+static inline __m128i
+vec_cmpeq_epi32 (__m128i a, __m128i b)
+{
+   return (__m128i) vec_cmpeq ((vector signed int) a, (vector signed int) b);
+}
+
+static inline __m128i
+vec_loadu_si128 (const uint32_t* src)
+{
+   __m128i_union vsrc;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+
+   vsrc.m128ui = *((vector unsigned int *) src);
+
+#else
+
+   __m128i vmask, tmp1, tmp2;
+
+   vmask = vec_lvsl(0, src);
+
+   tmp1 = (__m128i) vec_ld (0, src);
+   tmp2 = (__m128i) vec_ld (15, src);
+   vsrc.m128ui = (vector unsigned int) vec_perm (tmp1, tmp2, vmask);
+
+#endif
+
+   return vsrc.m128i;
+}
+
+static inline __m128i
+vec_load_si128 (const uint32_t* src)
+{
+   __m128i_union vsrc;
+
+   vsrc.m128ui = *((vector unsigned int *) src);
+
+   return vsrc.m128i;
+}
+
+static inline void
+vec_store_si128 (uint32_t* dest, __m128i vdata)
+{
+   vec_st ((vector unsigned int) vdata, 0, dest);
+}
+
+/* Call this function ONLY on POWER8 and newer platforms */
+static inline int
+vec_movemask_epi8 (__m128i vsrc)
+{
+   __m128i_union vtemp;
+   int result;
+
+   vtemp.m128i = vec_vgbbd(vsrc);
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   result = vtemp.ub[15] << 8 | vtemp.ub[7];
+#else
+   result = vtemp.ub[0] << 8 | vtemp.ub[8];
+#endif
+
+   return result;
+}
+
+static inline __m128i
+vec_packs_epi16 (__m128i a, __m128i b)
+{
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   return (__m128i) vec_packs ((vector signed short) a,
+                               (vector signed short) b);
+#else
+   return (__m128i) vec_packs ((vector signed short) b,
+                               (vector signed short) a);
+#endif
+}
+
+static inline __m128i
+vec_packs_epi32 (__m128i a, __m128i b)
+{
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   return (__m128i) vec_packs ((vector signed int) a, (vector signed int) b);
+#else
+   return (__m128i) vec_packs ((vector signed int) b, (vector signed int) a);
+#endif
+}
+
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
+#endif /* U_PWR8_H_ */
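As a usage sketch only (not part of this commit; transpose_block is a
hypothetical caller, and both buffers are assumed 16-byte aligned, as the
aligned load/store helpers require), converted code would drive these
helpers like so:

#include "util/u_pwr8.h"

static void
transpose_block (const uint32_t in[16], uint32_t out[16])
{
   /* aligned loads of the four input rows */
   __m128i r0 = vec_load_si128 (&in[0]);
   __m128i r1 = vec_load_si128 (&in[4]);
   __m128i r2 = vec_load_si128 (&in[8]);
   __m128i r3 = vec_load_si128 (&in[12]);
   __m128i c0, c1, c2, c3;

   /* 4x4 transpose of 32-bit elements, mirroring the SSE
    * unpacklo/unpackhi sequence the helper was converted from */
   transpose4_epi32 (&r0, &r1, &r2, &r3, &c0, &c1, &c2, &c3);

   vec_store_si128 (&out[0],  c0);
   vec_store_si128 (&out[4],  c1);
   vec_store_si128 (&out[8],  c2);
   vec_store_si128 (&out[12], c3);
}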