diff options
author | Oded Gabbay <[email protected]> | 2015-12-29 18:12:34 +0200 |
---|---|---|
committer | Oded Gabbay <[email protected]> | 2016-01-06 14:54:16 +0200 |
commit | 925c46cfc48042ec0bc5a83df962d2d7dd038394 (patch) | |
tree | 5e9b575a9b45c334d59125bff921d9c91851e21c /src | |
parent | 3bbe16ea79bb5738109df36780cc99119a006d91 (diff) |
llvmpipe: Optimize BUILD_MASK(_LINEAR) for POWER8
This patch converts the SSE-optimized build_mask_32() and
build_mask_linear_32() to VMX/VSX.
I measured the results on POWER8 machine with 32 cores at 3.4GHz and
16GB of RAM.
FPS/Score
Name Before After Delta
------------------------------------------------
glmark2 (score) 139.8 142.7 2.07%
openarena and xonotic didn't show a significant (more than 1%)
difference.
v2: Make sure code is build only on POWER8 LE machine
Signed-off-by: Oded Gabbay <[email protected]>
Reviewed-by: Roland Scheidegger <[email protected]>
Diffstat (limited to 'src')
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_rast_tri.c | 150 |
1 files changed, 110 insertions, 40 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index c9b9221d87c..09a182ac84a 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -133,36 +133,8 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task, lp_rast_triangle_4(task, arg2); } -#if !defined(PIPE_ARCH_SSE) +#if defined(PIPE_ARCH_SSE) -void -lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - union lp_rast_cmd_arg arg2; - arg2.triangle.tri = arg.triangle.tri; - arg2.triangle.plane_mask = (1<<3)-1; - lp_rast_triangle_32_3(task, arg2); -} - -void -lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - union lp_rast_cmd_arg arg2; - arg2.triangle.tri = arg.triangle.tri; - arg2.triangle.plane_mask = (1<<4)-1; - lp_rast_triangle_32_4(task, arg2); -} - -void -lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - lp_rast_triangle_32_3_16(task, arg); -} - -#else #include <emmintrin.h> #include "util/u_sse.h" @@ -265,12 +237,6 @@ sign_bits4(const __m128i *cstep, int cdiff) #define NR_PLANES 3 - - - - - - void lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) @@ -381,10 +347,6 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, 0xffff & ~out[i].mask); } - - - - void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) @@ -471,6 +433,114 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, } #undef NR_PLANES + +#else + +#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN) + +#include <altivec.h> +#include "util/u_pwr8.h" + +static inline void +build_masks_32(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) +{ + __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = (__m128i) vec_splats(dcdy); + + /* Get values across the quad + */ + __m128i cstep1 = vec_add_epi32(cstep0, xdcdy); + __m128i cstep2 = vec_add_epi32(cstep1, xdcdy); + __m128i cstep3 = vec_add_epi32(cstep2, xdcdy); + + { + __m128i cstep01, cstep23, result; + + cstep01 = vec_packs_epi32(cstep0, cstep1); + cstep23 = vec_packs_epi32(cstep2, cstep3); + result = vec_packs_epi16(cstep01, cstep23); + + *outmask |= vec_movemask_epi8(result); + } + + + { + __m128i cio4 = (__m128i) vec_splats(cdiff); + __m128i cstep01, cstep23, result; + + cstep0 = vec_add_epi32(cstep0, cio4); + cstep1 = vec_add_epi32(cstep1, cio4); + cstep2 = vec_add_epi32(cstep2, cio4); + cstep3 = vec_add_epi32(cstep3, cio4); + + cstep01 = vec_packs_epi32(cstep0, cstep1); + cstep23 = vec_packs_epi32(cstep2, cstep3); + result = vec_packs_epi16(cstep01, cstep23); + + *partmask |= vec_movemask_epi8(result); + } +} + +static inline unsigned +build_mask_linear_32(int c, int dcdx, int dcdy) +{ + __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = (__m128i) vec_splats(dcdy); + + /* Get values across the quad + */ + __m128i cstep1 = vec_add_epi32(cstep0, xdcdy); + __m128i cstep2 = vec_add_epi32(cstep1, xdcdy); + __m128i cstep3 = vec_add_epi32(cstep2, xdcdy); + + /* pack pairs of results into epi16 + */ + __m128i cstep01 = vec_packs_epi32(cstep0, cstep1); + __m128i cstep23 = vec_packs_epi32(cstep2, cstep3); + + /* pack into epi8, preserving sign bits + */ + __m128i result = vec_packs_epi16(cstep01, cstep23); + + /* extract sign bits to create mask + */ + return vec_movemask_epi8(result); +} + +#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */ + +void +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<3)-1; + lp_rast_triangle_32_3(task, arg2); +} + +void +lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<4)-1; + lp_rast_triangle_32_4(task, arg2); +} + +void +lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + lp_rast_triangle_32_3_16(task, arg); +} + #endif @@ -512,7 +582,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, #define NR_PLANES 8 #include "lp_rast_tri_tmp.h" -#ifdef PIPE_ARCH_SSE +#if defined(PIPE_ARCH_SSE) || (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)) #undef BUILD_MASKS #undef BUILD_MASK_LINEAR #define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) |