llvmpipe: use vector loads for (optimized) tri raster funcs

When we switched to 64bit rasterization, we could no longer use straight aligned loads for loading the plane data. However, what the code actually does for loading 3 planes, is 12 scalar loads + 9 unpacks, and then there's another 8 unpacks for the transpose we need (!). It would be possible to do the (scalar) loads of course already transposed (at least saving the additional unpacks), however instead just use (un)aligned vector loads, and recalculate the eo values, which is much less instructions (note in case of the triangle_32_3_4 case, the eo values are not even used, making the scalar loads + unpacks for them all the more pointless). This drops execution time of the triangle_32_3_4 function considerably, albeit it doesn't really make a measurable difference (for small tris we're essentially limited by vertex throughput in any case), for triangle_32_3_16 it's essentially noise (the loop is more costly than the initial code there). (I'm thinking about just ditching storing the eo values in the plane data, so could switch back to using aligned planes, however right now they are still used in the other raster functions dealing with planes with scalar code. Also not touching the ppc code, might not be that bad there in any case.) Reviewed-by: Brian Paul <[email protected]>
author: Roland Scheidegger <[email protected]> 2016-01-31 01:26:59 +0100
committer: Roland Scheidegger <[email protected]> 2016-02-02 05:58:19 +0100
commit: 8aa168eb8f0f1f634d0fc6733812c70d3a597518 (patch)
tree: 42ef9e82982f03834d226c766ad7a6ea0c34687d /src/gallium
parent: ab30426e335116e29473faaafe8b57ec760516ee (diff)
2 files changed, 24 insertions, 37 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index db45cbbb057..34008e1c01e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -308,17 +308,4 @@ void
 lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
 
 
-#ifdef PIPE_ARCH_SSE
-#include <emmintrin.h>
-#include "util/u_sse.h"
-
-static inline __m128i
-lp_plane_to_m128i(const struct lp_rast_plane *plane)
-{
-   return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
-                         (int32_t)plane->dcdy, (int32_t)plane->eo);
-}
-
-#endif
-
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 0ae6ec28d35..f4a2f0268f0 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -239,7 +239,7 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 void
 lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
-                      const union lp_rast_cmd_arg arg)
+                         const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
    const struct lp_rast_plane *plane = GET_PLANES(tri);
@@ -250,26 +250,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
    struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
    unsigned nr = 0;
 
-   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
-   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
-   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
+   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
+   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
+   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
    __m128i zero = _mm_setzero_si128();
 
-   __m128i c;
-   __m128i dcdx;
-   __m128i dcdy;
-   __m128i rej4;
-
-   __m128i dcdx2;
-   __m128i dcdx3;
+   __m128i c, dcdx, dcdy, rej4;
+   __m128i dcdx_neg_mask, dcdy_neg_mask;
+   __m128i dcdx2, dcdx3;
    
    __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
    __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
    __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
    __m128i unused;
-   
+
    transpose4_epi32(&p0, &p1, &p2, &zero,
-                    &c, &dcdx, &dcdy, &rej4);
+                    &c, &unused, &dcdx, &dcdy);
+
+   /* recalc eo - easier than trying to load as scalars / shuffle... */
+   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
+   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
+   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
+                        _mm_and_si128(dcdx_neg_mask, dcdx));
 
    /* Adjust dcdx;
     */
@@ -349,32 +352,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
 
 void
 lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
-                     const union lp_rast_cmd_arg arg)
+                        const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
    const struct lp_rast_plane *plane = GET_PLANES(tri);
    unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
    unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
 
-   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
-   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
-   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
+   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
+   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
+   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
    __m128i zero = _mm_setzero_si128();
 
-   __m128i c;
-   __m128i dcdx;
-   __m128i dcdy;
+   __m128i c, dcdx, dcdy;
+   __m128i dcdx2, dcdx3;
 
-   __m128i dcdx2;
-   __m128i dcdx3;
-   
    __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
    __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
    __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
    __m128i unused;
 
    transpose4_epi32(&p0, &p1, &p2, &zero,
-                    &c, &dcdx, &dcdy, &unused);
+                    &c, &unused, &dcdx, &dcdy);
 
    /* Adjust dcdx;
     */
author	Roland Scheidegger <[email protected]>	2016-01-31 01:26:59 +0100
committer	Roland Scheidegger <[email protected]>	2016-02-02 05:58:19 +0100
commit	8aa168eb8f0f1f634d0fc6733812c70d3a597518 (patch)
tree	42ef9e82982f03834d226c766ad7a6ea0c34687d /src/gallium
parent	ab30426e335116e29473faaafe8b57ec760516ee (diff)