summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorOded Gabbay <[email protected]>2015-12-29 18:12:34 +0200
committerOded Gabbay <[email protected]>2016-01-06 14:54:16 +0200
commit925c46cfc48042ec0bc5a83df962d2d7dd038394 (patch)
tree5e9b575a9b45c334d59125bff921d9c91851e21c /src
parent3bbe16ea79bb5738109df36780cc99119a006d91 (diff)
llvmpipe: Optimize BUILD_MASK(_LINEAR) for POWER8
This patch converts the SSE-optimized build_mask_32() and build_mask_linear_32() to VMX/VSX. I measured the results on POWER8 machine with 32 cores at 3.4GHz and 16GB of RAM. FPS/Score Name Before After Delta ------------------------------------------------ glmark2 (score) 139.8 142.7 2.07% openarena and xonotic didn't show a significant (more than 1%) difference. v2: Make sure code is build only on POWER8 LE machine Signed-off-by: Oded Gabbay <[email protected]> Reviewed-by: Roland Scheidegger <[email protected]>
Diffstat (limited to 'src')
-rw-r--r--src/gallium/drivers/llvmpipe/lp_rast_tri.c150
1 files changed, 110 insertions, 40 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index c9b9221d87c..09a182ac84a 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -133,36 +133,8 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
lp_rast_triangle_4(task, arg2);
}
-#if !defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_SSE)
-void
-lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
- const union lp_rast_cmd_arg arg)
-{
- union lp_rast_cmd_arg arg2;
- arg2.triangle.tri = arg.triangle.tri;
- arg2.triangle.plane_mask = (1<<3)-1;
- lp_rast_triangle_32_3(task, arg2);
-}
-
-void
-lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
- const union lp_rast_cmd_arg arg)
-{
- union lp_rast_cmd_arg arg2;
- arg2.triangle.tri = arg.triangle.tri;
- arg2.triangle.plane_mask = (1<<4)-1;
- lp_rast_triangle_32_4(task, arg2);
-}
-
-void
-lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
- const union lp_rast_cmd_arg arg)
-{
- lp_rast_triangle_32_3_16(task, arg);
-}
-
-#else
#include <emmintrin.h>
#include "util/u_sse.h"
@@ -265,12 +237,6 @@ sign_bits4(const __m128i *cstep, int cdiff)
#define NR_PLANES 3
-
-
-
-
-
-
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
@@ -381,10 +347,6 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
0xffff & ~out[i].mask);
}
-
-
-
-
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
@@ -471,6 +433,114 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
}
#undef NR_PLANES
+
+#else
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#include <altivec.h>
+#include "util/u_pwr8.h"
+
+static inline void
+build_masks_32(int c,
+ int cdiff,
+ int dcdx,
+ int dcdy,
+ unsigned *outmask,
+ unsigned *partmask)
+{
+ __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+ __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+ /* Get values across the quad
+ */
+ __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+ __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+ __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+ {
+ __m128i cstep01, cstep23, result;
+
+ cstep01 = vec_packs_epi32(cstep0, cstep1);
+ cstep23 = vec_packs_epi32(cstep2, cstep3);
+ result = vec_packs_epi16(cstep01, cstep23);
+
+ *outmask |= vec_movemask_epi8(result);
+ }
+
+
+ {
+ __m128i cio4 = (__m128i) vec_splats(cdiff);
+ __m128i cstep01, cstep23, result;
+
+ cstep0 = vec_add_epi32(cstep0, cio4);
+ cstep1 = vec_add_epi32(cstep1, cio4);
+ cstep2 = vec_add_epi32(cstep2, cio4);
+ cstep3 = vec_add_epi32(cstep3, cio4);
+
+ cstep01 = vec_packs_epi32(cstep0, cstep1);
+ cstep23 = vec_packs_epi32(cstep2, cstep3);
+ result = vec_packs_epi16(cstep01, cstep23);
+
+ *partmask |= vec_movemask_epi8(result);
+ }
+}
+
+static inline unsigned
+build_mask_linear_32(int c, int dcdx, int dcdy)
+{
+ __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+ __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+ /* Get values across the quad
+ */
+ __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+ __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+ __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+ /* pack pairs of results into epi16
+ */
+ __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
+ __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);
+
+ /* pack into epi8, preserving sign bits
+ */
+ __m128i result = vec_packs_epi16(cstep01, cstep23);
+
+ /* extract sign bits to create mask
+ */
+ return vec_movemask_epi8(result);
+}
+
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ union lp_rast_cmd_arg arg2;
+ arg2.triangle.tri = arg.triangle.tri;
+ arg2.triangle.plane_mask = (1<<3)-1;
+ lp_rast_triangle_32_3(task, arg2);
+}
+
+void
+lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ union lp_rast_cmd_arg arg2;
+ arg2.triangle.tri = arg.triangle.tri;
+ arg2.triangle.plane_mask = (1<<4)-1;
+ lp_rast_triangle_32_4(task, arg2);
+}
+
+void
+lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+ lp_rast_triangle_32_3_16(task, arg);
+}
+
#endif
@@ -512,7 +582,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"
-#ifdef PIPE_ARCH_SSE
+#if defined(PIPE_ARCH_SSE) || (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
#undef BUILD_MASKS
#undef BUILD_MASK_LINEAR
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)