-rw-r--r--   src/gallium/drivers/lima/lima_resource.c    |   4
-rw-r--r--   src/gallium/drivers/panfrost/pan_resource.c |   8
-rw-r--r--   src/panfrost/shared/pan_tiling.c            | 345
-rw-r--r--   src/panfrost/shared/pan_tiling.h            |   5
4 files changed, 210 insertions, 152 deletions
diff --git a/src/gallium/drivers/lima/lima_resource.c b/src/gallium/drivers/lima/lima_resource.c
index 2b86466101e..a0edbe92e01 100644
--- a/src/gallium/drivers/lima/lima_resource.c
+++ b/src/gallium/drivers/lima/lima_resource.c
@@ -636,7 +636,7 @@ lima_transfer_map(struct pipe_context *pctx,
                             ptrans->box.width, ptrans->box.height,
                             ptrans->stride, res->levels[level].stride,
-                            util_format_get_blocksize(pres->format));
+                            pres->format);
    }

    return trans->staging;
@@ -682,7 +682,7 @@ lima_transfer_unmap(struct pipe_context *pctx,
                               ptrans->box.width, ptrans->box.height,
                               res->levels[ptrans->level].stride, ptrans->stride,
-                              util_format_get_blocksize(pres->format));
+                              pres->format);
       }
       free(trans->staging);
    }
diff --git a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c
index a20bd79c1ad..18c6e05ba3b 100644
--- a/src/gallium/drivers/panfrost/pan_resource.c
+++ b/src/gallium/drivers/panfrost/pan_resource.c
@@ -402,10 +402,12 @@ panfrost_resource_create_bo(struct panfrost_screen *screen, struct panfrost_reso
                 PIPE_BIND_SAMPLER_VIEW |
                 PIPE_BIND_DISPLAY_TARGET;

+        unsigned bpp = util_format_get_blocksizebits(res->format);
         bool is_2d = (res->target == PIPE_TEXTURE_2D);
+        bool is_sane_bpp = bpp == 8 || bpp == 16 || bpp == 32 || bpp == 64 || bpp == 128;
         bool should_tile = (res->usage != PIPE_USAGE_STREAM);
         bool must_tile = (res->bind & PIPE_BIND_DEPTH_STENCIL) && (screen->quirks & MIDGARD_SFBD);
-        bool can_tile = is_2d && ((res->bind & ~valid_binding) == 0);
+        bool can_tile = is_2d && is_sane_bpp && ((res->bind & ~valid_binding) == 0);

         /* FBOs we would like to checksum, if at all possible */
         bool can_checksum = !(res->bind & ~valid_binding);
@@ -667,7 +669,7 @@ panfrost_transfer_map(struct pipe_context *pctx,
                                         box->x, box->y, box->width, box->height,
                                         transfer->base.stride,
                                         rsrc->slices[level].stride,
-                                        util_format_get_blocksize(resource->format));
+                                        resource->format);
                 }
         }

@@ -722,7 +724,7 @@ panfrost_transfer_unmap(struct pipe_context *pctx,
                                              transfer->box.width, transfer->box.height,
                                              prsrc->slices[transfer->level].stride,
                                              transfer->stride,
-                                             util_format_get_blocksize(prsrc->base.format));
+                                             prsrc->base.format);
                 }
         }
 }
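With the tiling helpers now taking a pipe_format instead of a precomputed block size, callers such as the transfer_map/unmap paths above simply forward the resource format. A minimal sketch of the store-side call under that assumption; the staging pointer, the strides, and the helper name upload_staging_sketch are illustrative, not part of the patch:

#include "pan_tiling.h"

/* Hypothetical helper: copy a linear staging buffer into a tiled BO. */
static void
upload_staging_sketch(void *tiled_bo_cpu, const void *staging,
                      unsigned x, unsigned y, unsigned w, unsigned h,
                      uint32_t tiled_stride, uint32_t staging_stride,
                      enum pipe_format format)
{
   /* dst is the tiled image, src is the linear staging copy; the block
    * size is now derived from the format inside the helper. */
   panfrost_store_tiled_image(tiled_bo_cpu, staging, x, y, w, h,
                              tiled_stride,    /* destination (tiled) stride */
                              staging_stride,  /* source (linear) stride */
                              format);
}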
diff --git a/src/panfrost/shared/pan_tiling.c b/src/panfrost/shared/pan_tiling.c
index 158fde9718a..01cd4ca6657 100644
--- a/src/panfrost/shared/pan_tiling.c
+++ b/src/panfrost/shared/pan_tiling.c
@@ -27,7 +27,7 @@
 #include "pan_tiling.h"
 #include <stdbool.h>
-#include <assert.h>
+#include "util/macros.h"

 /* This file implements software encode/decode of the tiling format used for
  * textures and framebuffers primarily on Utgard GPUs. Names for this format
@@ -83,7 +83,7 @@
  * 0b11001100. The idea is that for the bits in the solely Y place, we
  * get a Y place, and the bits in the XOR place *also* get a Y.
  */
-uint32_t bit_duplication[16] = {
+const uint32_t bit_duplication[16] = {
    0b00000000,
    0b00000011,
    0b00001100,
@@ -104,7 +104,7 @@ uint32_t bit_duplication[16] = {

 /* Space the bits out of a 4-bit nibble */

-unsigned space_4[16] = {
+const unsigned space_4[16] = {
    0b0000000,
    0b0000001,
    0b0000100,
@@ -129,69 +129,114 @@ unsigned space_4[16] = {
 #define TILE_HEIGHT 16
 #define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

-/* An optimized routine to tile an aligned (w & 0xF == 0) bpp4 texture */
-
-static void
-panfrost_store_tiled_image_bpp4(void *dst, const void *src,
-                                unsigned sx, unsigned sy,
-                                unsigned w, unsigned h,
-                                uint32_t dst_stride,
-                                uint32_t src_stride)
-{
-   /* Precompute the offset to the beginning of the first horizontal tile we're
-    * writing to, knowing that x is 16-aligned. Tiles themselves are
-    * stored linearly, so we get the X tile number by shifting and then
-    * multiply by the bytes per tile */
-
-   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * 4);
-
-   /* Iterate across the pixels we're trying to store in source-order */
-
-   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) {
-      /* For each pixel in the destination image, figure out the part
-       * corresponding to the 16x16 block index */
-
-      int block_y = y & ~0x0f;
-
-      /* In pixel coordinates (where the origin is the top-left), (block_y, 0)
-       * is the top-left corner of the leftmost tile in this row. While pixels
-       * are reordered within a block, the blocks themselves are stored
-       * linearly, so multiplying block_y by the pixel stride of the
-       * destination image equals the byte offset of that top-left corner of
-       * the block this row is in */
-
-      uint32_t *dest = (uint32_t *) (dest_start + (block_y * dst_stride));
-
-      /* The source is actually linear, so compute the byte offset to the start
-       * and end of this row in the source */
-
-      const uint32_t *source = src + (src_y * src_stride);
-      const uint32_t *source_end = source + w;
-
-      /* We want to duplicate the bits of the bottom nibble of Y */
-      unsigned expanded_y = bit_duplication[y & 0xF];
-
-      /* Iterate the row in source order. In the outer loop, we iterate 16
-       * bytes tiles. After each tile, we increment dest to include the size of
-       * that tile in pixels. */
-
-      for (; source < source_end; dest += PIXELS_PER_TILE) {
-         /* Within each tile, we iterate each of the 16 pixels in the row of
-          * the tile. This loop should be unrolled. */
-
-         for (int i = 0; i < 16; ++i) {
-            /* We have the X component spaced out in space_x and we have the Y
-             * component duplicated. So we just XOR them together. The X bits
-             * get the XOR like the pattern needs. The Y bits are XORing with
-             * zero so this is a no-op */
+/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type must
+ * only support copies and sizeof, so emulating with a packed structure works
+ * well enough, but if there's a native 128-bit type we may as well prefer
+ * that. */
+
+#ifdef __SIZEOF_INT128__
+typedef __uint128_t pan_uint128_t;
+#else
+typedef struct {
+  uint64_t lo;
+  uint64_t hi;
+} __attribute__((packed)) pan_uint128_t;
+#endif
+
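The comment above relies on the 128-bit carrier type only ever being copied and measured with sizeof. A small standalone sketch of that assumption (example_uint128 is a stand-in for pan_uint128_t on compilers without __uint128_t):

#include <stdint.h>

typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) example_uint128;

/* The tiling loops only ever do whole-texel assignments like the one below,
 * so the struct emulation and a native __uint128_t are interchangeable. */
_Static_assert(sizeof(example_uint128) == 16,
               "128-bit texels must copy as exactly 16 bytes");

static inline void
copy_one_texel_128(void *dst, const void *src)
{
   *(example_uint128 *) dst = *(const example_uint128 *) src;
}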
+/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
+ *
+ * dest_start precomputes the offset to the beginning of the first horizontal
+ * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
+ * stored linearly, so we get the X tile number by shifting and then multiply
+ * by the bytes per tile.
+ *
+ * We iterate across the pixels we're trying to store in source order. For each
+ * row in the destination image, we figure out which row of 16x16 blocks we're
+ * in, by slicing off the lower 4 bits (block_y).
+ *
+ * dest then precomputes the location of the top-left corner of the block the
+ * row starts in. In pixel coordinates (where the origin is the top-left),
+ * (block_y, 0) is the top-left corner of the leftmost tile in this row. While
+ * pixels are reordered within a block, the blocks themselves are stored
+ * linearly, so multiplying block_y by the pixel stride of the destination
+ * image equals the byte offset of that top-left corner of the block this row
+ * is in.
+ *
+ * On the other hand, the source is linear, so we compute the locations of the
+ * start and end of the row in the source by simple linear addressing.
+ *
+ * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
+ * y0] value. Since this is constant across a row, we look it up per row and
+ * store it in expanded_y.
+ *
+ * Finally, we iterate each row in source order. In the outer loop, we iterate
+ * each 16-pixel tile. Within each tile, we iterate the 16 pixels (this should
+ * be unrolled), calculating the index within the tile and writing.
+ */
-            unsigned index = expanded_y ^ space_4[i];
+#define TILED_STORE_TYPE(pixel_t, shift) \
+static void \
+panfrost_store_tiled_image_##pixel_t \
+                        (void *dst, const void *src, \
+                         uint16_t sx, uint16_t sy, \
+                         uint16_t w, uint16_t h, \
+                         uint32_t dst_stride, \
+                         uint32_t src_stride) \
+{ \
+   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
+   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
+      uint16_t block_y = y & ~0x0f; \
+      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
+      const pixel_t *source = src + (src_y * src_stride); \
+      const pixel_t *source_end = source + w; \
+      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
+      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
+         for (uint8_t i = 0; i < 16; ++i) { \
+            unsigned index = expanded_y ^ (space_4[i] << shift); \
+            *((pixel_t *) (dest + index)) = *(source++); \
+         } \
+      } \
+   } \
+} \
+
+TILED_STORE_TYPE(uint8_t, 0);
+TILED_STORE_TYPE(uint16_t, 1);
+TILED_STORE_TYPE(uint32_t, 2);
+TILED_STORE_TYPE(uint64_t, 3);
+TILED_STORE_TYPE(pan_uint128_t, 4);
+
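To make the explanation above concrete, the byte offset the macro computes for a texel at absolute coordinates (x, y) can be written as a single expression. This sketch assumes the bit_duplication, space_4 and PIXELS_PER_TILE definitions from this file are in scope, with shift = log2(bytes per texel):

/* Sketch only: equivalent to the dest_start / block_y / expanded_y
 * bookkeeping in TILED_STORE_TYPE, folded into one function. */
static inline unsigned
example_tiled_byte_offset(unsigned x, unsigned y,
                          unsigned shift, uint32_t dst_stride)
{
   unsigned block_y = y & ~0x0f;                 /* tile row, in pixel rows   */
   unsigned tile_x  = x >> 4;                    /* tile column               */
   unsigned within  = (bit_duplication[y & 0xF] ^ space_4[x & 0xF]) << shift;

   return block_y * dst_stride                   /* start of this tile row    */
        + tile_x * (PIXELS_PER_TILE << shift)    /* whole tiles to the left   */
        + within;                                /* interleaved index in tile */
}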
+#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
+   const unsigned mask = (1 << tile_shift) - 1; \
+   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
+      unsigned block_y = y & ~mask; \
+      unsigned block_start_s = block_y * dst_stride; \
+      unsigned source_start = src_y * src_stride; \
+      unsigned expanded_y = bit_duplication[y & mask]; \
+ \
+      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
+         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
+         unsigned index = expanded_y ^ space_4[x & mask]; \
+         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
+         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
+ \
+         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
+         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
+         *outp = *inp; \
+      } \
+   } \
+}

-            /* Copy over the pixel */
-            dest[index] = *(source++);
-         }
-      }
-   }
+#define TILED_UNALIGNED_TYPES(store, shift) { \
+   if (bpp == 8) \
+      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
+   else if (bpp == 16) \
+      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
+   else if (bpp == 32) \
+      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
+   else if (bpp == 64) \
+      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
+   else if (bpp == 128) \
+      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
 }

 static void
@@ -200,109 +245,118 @@ panfrost_access_tiled_image_generic(void *dst, void *src,
                                     unsigned w, unsigned h,
                                     uint32_t dst_stride,
                                     uint32_t src_stride,
-                                    uint32_t bpp,
-                                    bool is_store)
+                                    const struct util_format_description *desc,
+                                    bool _is_store)
 {
-   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) {
-      int block_y = y & ~0x0f;
-      int block_start_s = block_y * dst_stride;
-      int source_start = src_y * src_stride;
+   unsigned bpp = desc->block.bits;
+
+   if (desc->block.width > 1) {
+      w = DIV_ROUND_UP(w, desc->block.width);
+      h = DIV_ROUND_UP(h, desc->block.height);
+
+      if (_is_store)
+         TILED_UNALIGNED_TYPES(true, 2)
+      else
+         TILED_UNALIGNED_TYPES(false, 2)
+   } else {
+      if (_is_store)
+         TILED_UNALIGNED_TYPES(true, 4)
+      else
+         TILED_UNALIGNED_TYPES(false, 4)
+   }
+}
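For block-compressed formats the generic path above works in units of blocks rather than pixels, and groups them into 4x4 tiles (tile_shift = 2). A hedged sketch of the extent conversion it performs, using an assumed 4x4-block format; the helper name is hypothetical:

#include "util/format/u_format.h"
#include "util/macros.h"

/* Sketch: a 37x23 pixel region of a 4x4-block format becomes a 10x6
 * region of blocks, which the generic path then tiles in 4x4 groups. */
static void
example_block_space_extent(enum pipe_format format,
                           unsigned w_px, unsigned h_px,
                           unsigned *w_blk, unsigned *h_blk)
{
   const struct util_format_description *desc = util_format_description(format);

   *w_blk = DIV_ROUND_UP(w_px, desc->block.width);
   *h_blk = DIV_ROUND_UP(h_px, desc->block.height);
}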
-      unsigned expanded_y = bit_duplication[y & 0xF];
+#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))

-      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) {
-         int block_x_s = (x >> 4) * 256;
+void
+panfrost_store_tiled_image(void *dst, const void *src,
+                           unsigned x, unsigned y,
+                           unsigned w, unsigned h,
+                           uint32_t dst_stride,
+                           uint32_t src_stride,
+                           enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);

-         unsigned index = expanded_y ^ space_4[x & 0xF];
+   if (desc->block.width > 1) {
+      panfrost_access_tiled_image_generic(dst, (void *) src,
+                                          x, y, w, h,
+                                          dst_stride, src_stride, desc, true);

-         uint8_t *src8 = src;
-         uint8_t *source = &src8[source_start + bpp * src_x];
-         uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index);
+      return;
+   }

-         uint8_t *out = is_store ? dest : source;
-         uint8_t *in = is_store ? source : dest;
+   unsigned bpp = desc->block.bits;
+   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
+   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
+   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
+   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;

-         uint16_t *out16 = (uint16_t *) out;
-         uint16_t *in16 = (uint16_t *) in;
+   /* First, tile the top portion */

-         uint32_t *out32 = (uint32_t *) out;
-         uint32_t *in32 = (uint32_t *) in;
+   unsigned orig_x = x, orig_y = y;

-         uint64_t *out64 = (uint64_t *) out;
-         uint64_t *in64 = (uint64_t *) in;
+   if (first_full_tile_y != y) {
+      unsigned dist = MIN2(first_full_tile_y - y, h);

-         /* Write out 1-16 bytes. Written like this rather than a loop so the
-          * compiler can see what's going on */
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
+                                          x, y, w, dist,
+                                          dst_stride, src_stride, desc, true);

-         switch (bpp) {
-         case 1:
-            out[0] = in[0];
-            break;
+      if (dist == h)
+         return;

-         case 2:
-            out16[0] = in16[0];
-            break;
+      y += dist;
+      h -= dist;
+   }

-         case 3:
-            out16[0] = in16[0];
-            out[2] = in[2];
-            break;
+   /* Next, the bottom portion */
+   if (last_full_tile_y != (y + h)) {
+      unsigned dist = (y + h) - last_full_tile_y;

-         case 4:
-            out32[0] = in32[0];
-            break;
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
+                                          x, last_full_tile_y, w, dist,
+                                          dst_stride, src_stride, desc, true);

-         case 6:
-            out32[0] = in32[0];
-            out16[2] = in16[2];
-            break;
+      h -= dist;
+   }

-         case 8:
-            out64[0] = in64[0];
-            break;
+   /* The left portion */
+   if (first_full_tile_x != x) {
+      unsigned dist = MIN2(first_full_tile_x - x, w);

-         case 12:
-            out64[0] = in64[0];
-            out32[2] = in32[2];
-            break;
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
+                                          x, y, dist, h,
+                                          dst_stride, src_stride, desc, true);

-         case 16:
-            out64[0] = in64[0];
-            out64[1] = in64[1];
-            break;
+      if (dist == w)
+         return;

-         default:
-            assert(0); /* Invalid */
-         }
-      }
+      x += dist;
+      w -= dist;
    }
-}

-void
-panfrost_store_tiled_image(void *dst, const void *src,
-                           unsigned x, unsigned y,
-                           unsigned w, unsigned h,
-                           uint32_t dst_stride,
-                           uint32_t src_stride,
-                           uint32_t bpp)
-{
-   /* The optimized path is for aligned writes specifically */
-
-   if (x & 0xF || w & 0xF) {
-      panfrost_access_tiled_image_generic(dst, (void *) src, x, y, w, h, dst_stride, src_stride, bpp, true);
-      return;
-   }
+   /* Finally, the right portion */
+   if (last_full_tile_x != (x + w)) {
+      unsigned dist = (x + w) - last_full_tile_x;

-   /* Attempt to use an optimized path if we have one */
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
+                                          last_full_tile_x, y, dist, h,
+                                          dst_stride, src_stride, desc, true);

-   switch (bpp) {
-   case 4:
-      panfrost_store_tiled_image_bpp4(dst, (void *) src, x, y, w, h, dst_stride, src_stride);
-      break;
-   default:
-      panfrost_access_tiled_image_generic(dst, (void *) src, x, y, w, h, dst_stride, src_stride, bpp, true);
-      break;
+      w -= dist;
    }
+
+   if (bpp == 8)
+      panfrost_store_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 16)
+      panfrost_store_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 32)
+      panfrost_store_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 64)
+      panfrost_store_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 128)
+      panfrost_store_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
 }

 void
@@ -311,7 +365,8 @@ panfrost_load_tiled_image(void *dst, const void *src,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
-                          uint32_t bpp)
+                          enum pipe_format format)
 {
-   panfrost_access_tiled_image_generic((void *) src, dst, x, y, w, h, src_stride, dst_stride, bpp, false);
+   const struct util_format_description *desc = util_format_description(format);
+   panfrost_access_tiled_image_generic((void *) src, dst, x, y, w, h, src_stride, dst_stride, desc, false);
 }
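The store path above splits an arbitrary rectangle into up to four unaligned borders, handled by the generic per-texel path, plus a 16-aligned interior handled by the fast per-tile routines. A sketch of the same arithmetic in isolation, printing the interior that would remain (EX_TILE and the function name are assumptions for the sketch):

#include <stdio.h>
#include "util/macros.h"

#define EX_TILE 16

/* Sketch: mirrors the first/last full tile computations above. */
static void
example_split_interior(unsigned x, unsigned y, unsigned w, unsigned h)
{
   unsigned x0 = DIV_ROUND_UP(x, EX_TILE) * EX_TILE;   /* first full tile column  */
   unsigned y0 = DIV_ROUND_UP(y, EX_TILE) * EX_TILE;   /* first full tile row     */
   unsigned x1 = ((x + w) / EX_TILE) * EX_TILE;        /* end of last full column */
   unsigned y1 = ((y + h) / EX_TILE) * EX_TILE;        /* end of last full row    */

   /* Borders: rows [y, y0) and [y1, y + h), columns [x, x0) and [x1, x + w)
    * go through panfrost_access_tiled_image_generic; the rest is aligned. */
   if (x1 > x0 && y1 > y0)
      printf("aligned interior: %ux%u at (%u, %u)\n", x1 - x0, y1 - y0, x0, y0);
   else
      printf("no aligned interior; everything takes the generic path\n");
}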
diff --git a/src/panfrost/shared/pan_tiling.h b/src/panfrost/shared/pan_tiling.h
index e13d50c41e4..d8591e6dbdd 100644
--- a/src/panfrost/shared/pan_tiling.h
+++ b/src/panfrost/shared/pan_tiling.h
@@ -28,19 +28,20 @@
 #define H_PANFROST_TILING

 #include <stdint.h>
+#include <util/format/u_format.h>

 void panfrost_load_tiled_image(void *dst, const void *src,
                                unsigned x, unsigned y,
                                unsigned w, unsigned h,
                                uint32_t dst_stride,
                                uint32_t src_stride,
-                               uint32_t bpp);
+                               enum pipe_format format);

 void panfrost_store_tiled_image(void *dst, const void *src,
                                 unsigned x, unsigned y,
                                 unsigned w, unsigned h,
                                 uint32_t dst_stride,
                                 uint32_t src_stride,
-                                uint32_t bpp);
+                                enum pipe_format format);

 #endif
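As a closing illustration of the new header API, a self-contained round-trip check on a small 16-aligned image; the dimensions, strides, and the choice of PIPE_FORMAT_R8G8B8A8_UNORM are assumptions for the sketch, not taken from the patch:

#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "pan_tiling.h"

static void
roundtrip_sketch(void)
{
   const unsigned w = 32, h = 32, bpp = 4;   /* 32x32 RGBA8 */
   const uint32_t stride = w * bpp;          /* same stride linear and tiled */

   uint8_t *linear = malloc(stride * h);
   uint8_t *tiled  = calloc(stride * h, 1);
   uint8_t *back   = calloc(stride * h, 1);

   for (unsigned i = 0; i < stride * h; ++i)
      linear[i] = i & 0xff;

   panfrost_store_tiled_image(tiled, linear, 0, 0, w, h,
                              stride, stride, PIPE_FORMAT_R8G8B8A8_UNORM);
   panfrost_load_tiled_image(back, tiled, 0, 0, w, h,
                             stride, stride, PIPE_FORMAT_R8G8B8A8_UNORM);

   /* Tiling and then detiling the same rectangle must be the identity. */
   assert(memcmp(linear, back, stride * h) == 0);

   free(linear);
   free(tiled);
   free(back);
}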