author     Alyssa Rosenzweig <[email protected]>  2020-01-15 13:15:01 -0500
committer  Alyssa Rosenzweig <[email protected]>  2020-01-21 08:35:19 -0500
commit     2091d311c9d063138d5c84bbf4afe99ca864e597 (patch)
tree       18617a10f291bf006810930d2967705538da27db /src/panfrost
parent     f2d876b2b2335397661d2bd7663a7b353a1b839e (diff)
panfrost: Rework linear<--->tiled conversions
There's a lot going on here (it's a ton of commits squashed together
since otherwise this would be impossible to review...)
1. We have a fast path for linear->tiled conversion of whole (aligned) tiles,
but we have to use a slow path for unaligned accesses. We can get a pretty
major win for partial updates by using the slow path only on the borders of
the update region, and then hitting the fast path for the tile-aligned
interior (see the first sketch after this list). This does require some
shuffling.
2. Mark the LUTs constant, which allows the compiler to inline them. This
pairs well with loop unrolling: the table lookups collapse into immediates
(which are not as immediate on aarch64 as I'd like). The second sketch after
this list illustrates the index math these LUTs encode.
3. Add fast paths for bpp1/2/8/16. These use the same algorithm, and we
have native types for them, so we may as well get the fast path.
4. Drop the generic path for bpp != 1/2/4/8/16, since these formats are
generally awful, there's no way to tile them efficiently, and honestly
there's no good reason to either. Lima doesn't support any of these formats;
Panfrost can make the opinionated choice to keep them linear.
5. Specialize the unaligned routines. They don't have to be fully generic;
they just can't assume alignment. So now they should be nearly as fast as the
aligned versions (which get some extra tricks to be even faster, though the
difference may be negligible on some workloads).
6. Specialize also for the size of the tile, to allow 4x4 tiling as well
as 16x16 tiling. This allows compressed textures to be tiled efficiently
with the same routines (so we add support for tiling ASTC/ETC textures
while we're at it).
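
As a concrete sketch of the decomposition in point 1: the update region is
peeled into four unaligned border strips handled by the slow path, leaving a
tile-aligned interior for the fast path. This is a minimal standalone version,
not the patch's code; store_tiled_generic and store_tiled_aligned are
hypothetical stand-ins for the generic and specialized routines, and TILE_DIM
corresponds to TILE_WIDTH/TILE_HEIGHT (16).

    #define TILE_DIM 16
    #define MIN2(a, b) ((a) < (b) ? (a) : (b))

    /* Hypothetical stand-ins: the generic path tolerates any alignment; the
     * aligned path requires tile-aligned x, y, w, h. */
    void store_tiled_generic(unsigned x, unsigned y, unsigned w, unsigned h);
    void store_tiled_aligned(unsigned x, unsigned y, unsigned w, unsigned h);

    static void
    store_tiled_region(unsigned x, unsigned y, unsigned w, unsigned h)
    {
       /* Top strip: rows above the first tile-aligned row */
       if (y % TILE_DIM) {
          unsigned dist = MIN2(TILE_DIM - (y % TILE_DIM), h);
          store_tiled_generic(x, y, w, dist);
          if (dist == h)
             return;
          y += dist;
          h -= dist;
       }

       /* Bottom strip: rows below the last tile-aligned row */
       unsigned y_end = ((y + h) / TILE_DIM) * TILE_DIM;
       if (y_end != y + h) {
          store_tiled_generic(x, y_end, w, (y + h) - y_end);
          h = y_end - y;
          if (!h)
             return;
       }

       /* Left strip: columns left of the first tile-aligned column */
       if (x % TILE_DIM) {
          unsigned dist = MIN2(TILE_DIM - (x % TILE_DIM), w);
          store_tiled_generic(x, y, dist, h);
          if (dist == w)
             return;
          x += dist;
          w -= dist;
       }

       /* Right strip: columns right of the last tile-aligned column */
       unsigned x_end = ((x + w) / TILE_DIM) * TILE_DIM;
       if (x_end != x + w) {
          store_tiled_generic(x_end, y, (x + w) - x_end, h);
          w = x_end - x;
          if (!w)
             return;
       }

       /* Whatever remains is tile-aligned on all four sides */
       store_tiled_aligned(x, y, w, h);
    }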
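
And a self-contained sketch of the in-tile index math behind the LUTs in
point 2, as documented in the diff below: within a 16x16 tile, the even bits
of a pixel's index carry x XOR y and the odd bits carry y. space_nibble and
duplicate_nibble are illustrative reconstructions of what the space_4 and
bit_duplication tables store; XORing them yields the index because the Y bits
in the odd slots XOR with zero.

    #include <assert.h>
    #include <stdio.h>

    /* Spread the bits of a nibble onto the even bit positions (what the
     * space_4 table stores). */
    static unsigned
    space_nibble(unsigned v)
    {
       unsigned out = 0;
       for (unsigned i = 0; i < 4; ++i)
          out |= ((v >> i) & 1) << (2 * i);
       return out;
    }

    /* Duplicate each bit of a nibble into both slots of a pair (what the
     * bit_duplication table stores): [y3 y3 y2 y2 y1 y1 y0 y0]. */
    static unsigned
    duplicate_nibble(unsigned v)
    {
       unsigned s = space_nibble(v);
       return s | (s << 1);
    }

    /* Bit-by-bit reference: even index bits are x^y, odd index bits are y. */
    static unsigned
    tile_index_reference(unsigned x, unsigned y)
    {
       unsigned index = 0;
       for (unsigned i = 0; i < 4; ++i) {
          unsigned xi = (x >> i) & 1, yi = (y >> i) & 1;
          index |= (xi ^ yi) << (2 * i);
          index |= yi << (2 * i + 1);
       }
       return index;
    }

    int
    main(void)
    {
       for (unsigned y = 0; y < 16; ++y)
          for (unsigned x = 0; x < 16; ++x)
             /* The LUT XOR (expanded_y ^ space_4[x]) matches the reference */
             assert((duplicate_nibble(y) ^ space_nibble(x)) ==
                    tile_index_reference(x, y));

       printf("in-tile indexing verified for all 16x16 positions\n");
       return 0;
    }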
Signed-off-by: Alyssa Rosenzweig <[email protected]>
Reviewed-by: Vasily Khoruzhick <[email protected]>
Tested-by: Vasily Khoruzhick <[email protected]> #lima on Mali400
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3414>
Diffstat (limited to 'src/panfrost')
-rw-r--r--  src/panfrost/shared/pan_tiling.c  345
-rw-r--r--  src/panfrost/shared/pan_tiling.h    5
2 files changed, 203 insertions, 147 deletions
diff --git a/src/panfrost/shared/pan_tiling.c b/src/panfrost/shared/pan_tiling.c
index 158fde9718a..01cd4ca6657 100644
--- a/src/panfrost/shared/pan_tiling.c
+++ b/src/panfrost/shared/pan_tiling.c
@@ -27,7 +27,7 @@
 
 #include "pan_tiling.h"
 #include <stdbool.h>
-#include <assert.h>
+#include "util/macros.h"
 
 /* This file implements software encode/decode of the tiling format used for
  * textures and framebuffers primarily on Utgard GPUs. Names for this format
@@ -83,7 +83,7 @@
  * 0b11001100. The idea is that for the bits in the solely Y place, we
  * get a Y place, and the bits in the XOR place *also* get a Y. */
 
-uint32_t bit_duplication[16] = {
+const uint32_t bit_duplication[16] = {
    0b00000000,
    0b00000011,
    0b00001100,
@@ -104,7 +104,7 @@ uint32_t bit_duplication[16] = {
 
 /* Space the bits out of a 4-bit nibble */
 
-unsigned space_4[16] = {
+const unsigned space_4[16] = {
    0b0000000,
    0b0000001,
    0b0000100,
@@ -129,69 +129,114 @@ unsigned space_4[16] = {
 #define TILE_HEIGHT 16
 #define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)
 
-/* An optimized routine to tile an aligned (w & 0xF == 0) bpp4 texture */
-
-static void
-panfrost_store_tiled_image_bpp4(void *dst, const void *src,
-                                unsigned sx, unsigned sy,
-                                unsigned w, unsigned h,
-                                uint32_t dst_stride,
-                                uint32_t src_stride)
-{
-   /* Precompute the offset to the beginning of the first horizontal tile we're
-    * writing to, knowing that x is 16-aligned. Tiles themselves are
-    * stored linearly, so we get the X tile number by shifting and then
-    * multiply by the bytes per tile */
-
-   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * 4);
-
-   /* Iterate across the pixels we're trying to store in source-order */
-
-   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) {
-      /* For each pixel in the destination image, figure out the part
-       * corresponding to the 16x16 block index */
-
-      int block_y = y & ~0x0f;
-
-      /* In pixel coordinates (where the origin is the top-left), (block_y, 0)
-       * is the top-left corner of the leftmost tile in this row. While pixels
-       * are reordered within a block, the blocks themselves are stored
-       * linearly, so multiplying block_y by the pixel stride of the
-       * destination image equals the byte offset of that top-left corner of
-       * the block this row is in */
-
-      uint32_t *dest = (uint32_t *) (dest_start + (block_y * dst_stride));
-
-      /* The source is actually linear, so compute the byte offset to the start
-       * and end of this row in the source */
-
-      const uint32_t *source = src + (src_y * src_stride);
-      const uint32_t *source_end = source + w;
-
-      /* We want to duplicate the bits of the bottom nibble of Y */
-      unsigned expanded_y = bit_duplication[y & 0xF];
-
-      /* Iterate the row in source order. In the outer loop, we iterate 16
-       * bytes tiles. After each tile, we increment dest to include the size of
-       * that tile in pixels. */
-
-      for (; source < source_end; dest += PIXELS_PER_TILE) {
-         /* Within each tile, we iterate each of the 16 pixels in the row of
-          * the tile. This loop should be unrolled. */
-
-         for (int i = 0; i < 16; ++i) {
-            /* We have the X component spaced out in space_x and we have the Y
-             * component duplicated. So we just XOR them together. The X bits
-             * get the XOR like the pattern needs. The Y bits are XORing with
-             * zero so this is a no-op */
+/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
+ * must only support copies and sizeof, so emulating with a packed structure
+ * works well enough, but if there's a native 128-bit type we may as well
+ * prefer that. */
+
+#ifdef __SIZEOF_INT128__
+typedef __uint128_t pan_uint128_t;
+#else
+typedef struct {
+   uint64_t lo;
+   uint64_t hi;
+} __attribute__((packed)) pan_uint128_t;
+#endif
+
+/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
+ *
+ * dest_start precomputes the offset to the beginning of the first horizontal
+ * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
+ * stored linearly, so we get the X tile number by shifting and then multiply
+ * by the bytes per tile.
+ *
+ * We iterate across the pixels we're trying to store in source-order. For
+ * each row in the destination image, we figure out which row of 16x16 blocks
+ * we're in, by slicing off the lower 4 bits (block_y).
+ *
+ * dest then precomputes the location of the top-left corner of the block the
+ * row starts in. In pixel coordinates (where the origin is the top-left),
+ * (block_y, 0) is the top-left corner of the leftmost tile in this row. While
+ * pixels are reordered within a block, the blocks themselves are stored
+ * linearly, so multiplying block_y by the pixel stride of the destination
+ * image equals the byte offset of that top-left corner of the block this row
+ * is in.
+ *
+ * On the other hand, the source is linear, so we compute the locations of the
+ * start and end of the row in the source by simple linear addressing.
+ *
+ * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1
+ * y0 y0] value. Since this is constant across a row, we look it up per-row
+ * and store it in expanded_y.
+ *
+ * Finally, we iterate each row in source order. In the outer loop, we iterate
+ * each 16 pixel tile. Within each tile, we iterate the 16 pixels (this should
+ * be unrolled), calculating the index within the tile and writing. */
 
-            unsigned index = expanded_y ^ space_4[i];
+#define TILED_STORE_TYPE(pixel_t, shift) \
+static void \
+panfrost_store_tiled_image_##pixel_t \
+                              (void *dst, const void *src, \
+                               uint16_t sx, uint16_t sy, \
+                               uint16_t w, uint16_t h, \
+                               uint32_t dst_stride, \
+                               uint32_t src_stride) \
+{ \
+   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
+   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
+      uint16_t block_y = y & ~0x0f; \
+      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
+      const pixel_t *source = src + (src_y * src_stride); \
+      const pixel_t *source_end = source + w; \
+      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
+      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
+         for (uint8_t i = 0; i < 16; ++i) { \
+            unsigned index = expanded_y ^ (space_4[i] << shift); \
+            *((pixel_t *) (dest + index)) = *(source++); \
+         } \
+      } \
+   } \
+} \
+
+TILED_STORE_TYPE(uint8_t, 0);
+TILED_STORE_TYPE(uint16_t, 1);
+TILED_STORE_TYPE(uint32_t, 2);
+TILED_STORE_TYPE(uint64_t, 3);
+TILED_STORE_TYPE(pan_uint128_t, 4);
+
+#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
+   const unsigned mask = (1 << tile_shift) - 1; \
+   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
+      unsigned block_y = y & ~mask; \
+      unsigned block_start_s = block_y * dst_stride; \
+      unsigned source_start = src_y * src_stride; \
+      unsigned expanded_y = bit_duplication[y & mask]; \
+ \
+      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
+         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
+         unsigned index = expanded_y ^ space_4[x & mask]; \
+         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
+         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
+ \
+         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
+         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
+         *outp = *inp; \
+      } \
+   } \
+}
 
-            /* Copy over the pixel */
-            dest[index] = *(source++);
-         }
-      }
-   }
+#define TILED_UNALIGNED_TYPES(store, shift) { \
+   if (bpp == 8) \
+      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
+   else if (bpp == 16) \
+      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
+   else if (bpp == 32) \
+      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
+   else if (bpp == 64) \
+      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
+   else if (bpp == 128) \
+      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
 }
 
 static void
@@ -200,109 +245,118 @@ panfrost_access_tiled_image_generic(void *dst, void *src,
                                     unsigned w, unsigned h,
                                     uint32_t dst_stride,
                                     uint32_t src_stride,
-                                    uint32_t bpp,
-                                    bool is_store)
+                                    const struct util_format_description *desc,
+                                    bool _is_store)
 {
-   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) {
-      int block_y = y & ~0x0f;
-      int block_start_s = block_y * dst_stride;
-      int source_start = src_y * src_stride;
+   unsigned bpp = desc->block.bits;
+
+   if (desc->block.width > 1) {
+      w = DIV_ROUND_UP(w, desc->block.width);
+      h = DIV_ROUND_UP(h, desc->block.height);
+
+      if (_is_store)
+         TILED_UNALIGNED_TYPES(true, 2)
+      else
+         TILED_UNALIGNED_TYPES(false, 2)
+   } else {
+      if (_is_store)
+         TILED_UNALIGNED_TYPES(true, 4)
+      else
+         TILED_UNALIGNED_TYPES(false, 4)
+   }
+}
 
-      unsigned expanded_y = bit_duplication[y & 0xF];
+#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))
 
-      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) {
-         int block_x_s = (x >> 4) * 256;
+void
+panfrost_store_tiled_image(void *dst, const void *src,
+                           unsigned x, unsigned y,
+                           unsigned w, unsigned h,
+                           uint32_t dst_stride,
+                           uint32_t src_stride,
+                           enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
 
-         unsigned index = expanded_y ^ space_4[x & 0xF];
+   if (desc->block.width > 1) {
+      panfrost_access_tiled_image_generic(dst, (void *) src,
+                                          x, y, w, h,
+                                          dst_stride, src_stride, desc, true);
 
-         uint8_t *src8 = src;
-         uint8_t *source = &src8[source_start + bpp * src_x];
-         uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index);
+      return;
+   }
 
-         uint8_t *out = is_store ? dest : source;
-         uint8_t *in = is_store ? source : dest;
+   unsigned bpp = desc->block.bits;
+   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
+   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
+   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
+   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;
 
-         uint16_t *out16 = (uint16_t *) out;
-         uint16_t *in16 = (uint16_t *) in;
+   /* First, tile the top portion */
 
-         uint32_t *out32 = (uint32_t *) out;
-         uint32_t *in32 = (uint32_t *) in;
+   unsigned orig_x = x, orig_y = y;
 
-         uint64_t *out64 = (uint64_t *) out;
-         uint64_t *in64 = (uint64_t *) in;
+   if (first_full_tile_y != y) {
+      unsigned dist = MIN2(first_full_tile_y - y, h);
 
-         /* Write out 1-16 bytes. Written like this rather than a loop so the
-          * compiler can see what's going on */
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
+                                          x, y, w, dist,
+                                          dst_stride, src_stride, desc, true);
 
-         switch (bpp) {
-         case 1:
-            out[0] = in[0];
-            break;
+      if (dist == h)
+         return;
 
-         case 2:
-            out16[0] = in16[0];
-            break;
+      y += dist;
+      h -= dist;
+   }
 
-         case 3:
-            out16[0] = in16[0];
-            out[2] = in[2];
-            break;
+   /* Next, the bottom portion */
+   if (last_full_tile_y != (y + h)) {
+      unsigned dist = (y + h) - last_full_tile_y;
 
-         case 4:
-            out32[0] = in32[0];
-            break;
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
+                                          x, last_full_tile_y, w, dist,
+                                          dst_stride, src_stride, desc, true);
 
-         case 6:
-            out32[0] = in32[0];
-            out16[2] = in16[2];
-            break;
+      h -= dist;
+   }
 
-         case 8:
-            out64[0] = in64[0];
-            break;
+   /* The left portion */
+   if (first_full_tile_x != x) {
+      unsigned dist = MIN2(first_full_tile_x - x, w);
 
-         case 12:
-            out64[0] = in64[0];
-            out32[2] = in32[2];
-            break;
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
+                                          x, y, dist, h,
+                                          dst_stride, src_stride, desc, true);
 
-         case 16:
-            out64[0] = in64[0];
-            out64[1] = in64[1];
-            break;
+      if (dist == w)
+         return;
 
-         default:
-            assert(0); /* Invalid */
-         }
-      }
+      x += dist;
+      w -= dist;
    }
-}
 
-void
-panfrost_store_tiled_image(void *dst, const void *src,
-                           unsigned x, unsigned y,
-                           unsigned w, unsigned h,
-                           uint32_t dst_stride,
-                           uint32_t src_stride,
-                           uint32_t bpp)
-{
-   /* The optimized path is for aligned writes specifically */
-
-   if (x & 0xF || w & 0xF) {
-      panfrost_access_tiled_image_generic(dst, (void *) src, x, y, w, h, dst_stride, src_stride, bpp, true);
-      return;
-   }
+   /* Finally, the right portion */
+   if (last_full_tile_x != (x + w)) {
+      unsigned dist = (x + w) - last_full_tile_x;
 
-   /* Attempt to use an optimized path if we have one */
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
+                                          last_full_tile_x, y, dist, h,
+                                          dst_stride, src_stride, desc, true);
 
-   switch (bpp) {
-   case 4:
-      panfrost_store_tiled_image_bpp4(dst, (void *) src, x, y, w, h, dst_stride, src_stride);
-      break;
-   default:
-      panfrost_access_tiled_image_generic(dst, (void *) src, x, y, w, h, dst_stride, src_stride, bpp, true);
-      break;
+      w -= dist;
    }
+
+   if (bpp == 8)
+      panfrost_store_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 16)
+      panfrost_store_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 32)
+      panfrost_store_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 64)
+      panfrost_store_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 128)
+      panfrost_store_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
 }
 
 void
@@ -311,7 +365,8 @@ panfrost_load_tiled_image(void *dst, const void *src,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
-                          uint32_t bpp)
+                          enum pipe_format format)
 {
-   panfrost_access_tiled_image_generic((void *) src, dst, x, y, w, h, src_stride, dst_stride, bpp, false);
+   const struct util_format_description *desc = util_format_description(format);
+   panfrost_access_tiled_image_generic((void *) src, dst, x, y, w, h, src_stride, dst_stride, desc, false);
 }
diff --git a/src/panfrost/shared/pan_tiling.h b/src/panfrost/shared/pan_tiling.h
index e13d50c41e4..d8591e6dbdd 100644
--- a/src/panfrost/shared/pan_tiling.h
+++ b/src/panfrost/shared/pan_tiling.h
@@ -28,19 +28,20 @@
 #define H_PANFROST_TILING
 
 #include <stdint.h>
+#include <util/format/u_format.h>
 
 void panfrost_load_tiled_image(void *dst, const void *src,
                                unsigned x, unsigned y,
                                unsigned w, unsigned h,
                                uint32_t dst_stride,
                                uint32_t src_stride,
-                               uint32_t bpp);
+                               enum pipe_format format);
 
 void panfrost_store_tiled_image(void *dst, const void *src,
                                 unsigned x, unsigned y,
                                 unsigned w, unsigned h,
                                 uint32_t dst_stride,
                                 uint32_t src_stride,
-                                uint32_t bpp);
+                                enum pipe_format format);
 
 #endif
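
For callers, the visible change is the final parameter: a pipe format replaces
the raw bpp so the tiling code can look up block dimensions itself. A
hypothetical caller might look like this (upload_rgba8 is illustrative, not
part of the patch):

    #include "pan_tiling.h"

    /* Store a linear RGBA8 staging buffer into a tiled texture. Strides are
     * still in bytes; only the format argument is new. */
    static void
    upload_rgba8(void *tiled, const void *linear,
                 unsigned w, unsigned h,
                 uint32_t tiled_stride, uint32_t linear_stride)
    {
       panfrost_store_tiled_image(tiled, linear,
                                  0, 0, w, h,
                                  tiled_stride, linear_stride,
                                  PIPE_FORMAT_R8G8B8A8_UNORM);
    }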