summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/llvmpipe
diff options
context:
space:
mode:
authorOded Gabbay <[email protected]>2015-12-13 17:49:32 +0200
committerOded Gabbay <[email protected]>2016-01-06 14:54:16 +0200
commit3bbe16ea79bb5738109df36780cc99119a006d91 (patch)
treea986f3612cd79a73e1c0b5d25d56e1508daef92f /src/gallium/drivers/llvmpipe
parente99555ef0bf1b786a1bf1e93f3304507dbb6e939 (diff)
llvmpipe: Optimize do_triangle_ccw for POWER8
This patch converts the SSE optimization done in do_triangle_ccw to VMX/VSX.

I measured the results on a POWER8 machine with 32 cores at 3.4GHz and 16GB of RAM.

                  FPS/Score
Name              Before   After    Delta
------------------------------------------------
glmark2 (score)   136.6    139.8    2.34%
openarena         16.14    16.35    1.30%
xonotic           4.655    4.707    1.11%

v2:
- Convert loads to use aligned loads
- Make sure the code is built only on POWER8 LE machines

Signed-off-by: Oded Gabbay <[email protected]>
Reviewed-by: Roland Scheidegger <[email protected]>
Diffstat (limited to 'src/gallium/drivers/llvmpipe')
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup_tri.c100
1 files changed, 100 insertions, 0 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index b1671dd0ae2..0ff10a2027d 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -46,6 +46,9 @@
#if defined(PIPE_ARCH_SSE)
#include <emmintrin.h>
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+#include <altivec.h>
+#include "util/u_pwr8.h"
#endif
static inline int
@@ -462,6 +465,103 @@ do_triangle_ccw(struct lp_setup_context *setup,
STORE_PLANE(plane[2], p2);
#undef STORE_PLANE
} else
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+ /*
+ * XXX this code is effectively disabled for all practical purposes,
+ * as the allowed fb size is tiny if FIXED_ORDER is 8.
+ */
+ if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
+ setup->fb.height <= MAX_FIXED_LENGTH32 &&
+ (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
+ (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
+ unsigned int bottom_edge;
+ __m128i vertx, verty;
+ __m128i shufx, shufy;
+ __m128i dcdx, dcdy, c;
+ __m128i unused;
+ __m128i dcdx_neg_mask;
+ __m128i dcdy_neg_mask;
+ __m128i dcdx_zero_mask;
+ __m128i top_left_flag;
+ __m128i c_inc_mask, c_inc;
+ __m128i eo, p0, p1, p2;
+ __m128i_union vshuf_mask;
+ __m128i zero = vec_splats((unsigned char) 0);
+ PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ vshuf_mask.i[0] = 0x07060504;
+ vshuf_mask.i[1] = 0x0B0A0908;
+ vshuf_mask.i[2] = 0x03020100;
+ vshuf_mask.i[3] = 0x0F0E0D0C;
+#else
+ vshuf_mask.i[0] = 0x00010203;
+ vshuf_mask.i[1] = 0x0C0D0E0F;
+ vshuf_mask.i[2] = 0x04050607;
+ vshuf_mask.i[3] = 0x08090A0B;
+#endif
+
+ /* vertex x coords */
+ vertx = vec_load_si128((const uint32_t *) position->x);
+ /* vertex y coords */
+ verty = vec_load_si128((const uint32_t *) position->y);
+
+ shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
+ shufy = vec_perm (verty, verty, vshuf_mask.m128i);
+
+ dcdx = vec_sub_epi32(verty, shufy);
+ dcdy = vec_sub_epi32(vertx, shufx);
+
+ dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
+ dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
+ dcdy_neg_mask = vec_srai_epi32(dcdy, 31);
+
+ bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
+ top_left_flag = (__m128i) vec_splats(bottom_edge);
+
+ c_inc_mask = vec_or(dcdx_neg_mask,
+ vec_and(dcdx_zero_mask,
+ vec_xor(dcdy_neg_mask,
+ top_left_flag)));
+
+ c_inc = vec_srli_epi32(c_inc_mask, 31);
+
+ c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
+ vec_mullo_epi32(dcdy, verty));
+
+ c = vec_add_epi32(c, c_inc);
+
+ /* Scale up to match c:
+ */
+ dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
+ dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);
+
+ /* Calculate trivial reject values:
+ */
+ eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
+ vec_and(dcdx_neg_mask, dcdx));
+
+ /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
+ /* Pointless transpose which gets undone immediately in
+ * rasterization:
+ */
+ transpose4_epi32(&c, &dcdx, &dcdy, &eo,
+ &p0, &p1, &p2, &unused);
+
+#define STORE_PLANE(plane, vec) do { \
+ vec_store_si128((uint32_t *)&temp_vec, vec); \
+ plane.c = (int64_t)temp_vec[0]; \
+ plane.dcdx = temp_vec[1]; \
+ plane.dcdy = temp_vec[2]; \
+ plane.eo = temp_vec[3]; \
+ } while(0)
+
+ STORE_PLANE(plane[0], p0);
+ STORE_PLANE(plane[1], p1);
+ STORE_PLANE(plane[2], p2);
+#undef STORE_PLANE
+ } else
#endif
{
int i;