-rw-r--r--   src/gallium/drivers/lima/lima_resource.c    |   4
-rw-r--r--   src/gallium/drivers/panfrost/pan_resource.c |   8
-rw-r--r--   src/panfrost/shared/pan_tiling.c            | 345
-rw-r--r--   src/panfrost/shared/pan_tiling.h            |   5
4 files changed, 210 insertions, 152 deletions
diff --git a/src/gallium/drivers/lima/lima_resource.c b/src/gallium/drivers/lima/lima_resource.c
index 2b86466101e..a0edbe92e01 100644
--- a/src/gallium/drivers/lima/lima_resource.c
+++ b/src/gallium/drivers/lima/lima_resource.c
@@ -636,7 +636,7 @@ lima_transfer_map(struct pipe_context *pctx,
                             ptrans->box.width, ptrans->box.height,
                             ptrans->stride, res->levels[level].stride,
-                            util_format_get_blocksize(pres->format));
+                            pres->format);
    }

    return trans->staging;
@@ -682,7 +682,7 @@ lima_transfer_unmap(struct pipe_context *pctx,
                               ptrans->box.width, ptrans->box.height,
                               res->levels[ptrans->level].stride, ptrans->stride,
-                              util_format_get_blocksize(pres->format));
+                              pres->format);
       }
       free(trans->staging);
    }
diff --git a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c
index a20bd79c1ad..18c6e05ba3b 100644
--- a/src/gallium/drivers/panfrost/pan_resource.c
+++ b/src/gallium/drivers/panfrost/pan_resource.c
@@ -402,10 +402,12 @@ panfrost_resource_create_bo(struct panfrost_screen *screen, struct panfrost_reso
                 PIPE_BIND_SAMPLER_VIEW |
                 PIPE_BIND_DISPLAY_TARGET;

+        unsigned bpp = util_format_get_blocksizebits(res->format);
         bool is_2d = (res->target == PIPE_TEXTURE_2D);
+        bool is_sane_bpp = bpp == 8 || bpp == 16 || bpp == 32 || bpp == 64 || bpp == 128;
         bool should_tile = (res->usage != PIPE_USAGE_STREAM);
         bool must_tile = (res->bind & PIPE_BIND_DEPTH_STENCIL) && (screen->quirks & MIDGARD_SFBD);
-        bool can_tile = is_2d && ((res->bind & ~valid_binding) == 0);
+        bool can_tile = is_2d && is_sane_bpp && ((res->bind & ~valid_binding) == 0);

         /* FBOs we would like to checksum, if at all possible */
         bool can_checksum = !(res->bind & ~valid_binding);
@@ -667,7 +669,7 @@ panfrost_transfer_map(struct pipe_context *pctx,
                                         box->x, box->y, box->width, box->height,
                                         transfer->base.stride,
                                         rsrc->slices[level].stride,
-                                        util_format_get_blocksize(resource->format));
+                                        resource->format);
                 }
         }

@@ -722,7 +724,7 @@ panfrost_transfer_unmap(struct pipe_context *pctx,
                                              transfer->box.width, transfer->box.height,
                                              prsrc->slices[transfer->level].stride,
                                              transfer->stride,
-                                             util_format_get_blocksize(prsrc->base.format));
+                                             prsrc->base.format);
                 }
         }
 }
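With the tiling helpers now taking a pipe_format instead of a precomputed block size, callers such as the transfer_map/unmap paths above simply forward the resource format. A minimal sketch of the store-side call under that assumption; the staging pointer, the strides, and the helper name upload_staging_sketch are illustrative, not part of the patch:

#include "pan_tiling.h"

/* Hypothetical helper: copy a linear staging buffer into a tiled BO. */
static void
upload_staging_sketch(void *tiled_bo_cpu, const void *staging,
                      unsigned x, unsigned y, unsigned w, unsigned h,
                      uint32_t tiled_stride, uint32_t staging_stride,
                      enum pipe_format format)
{
   /* dst is the tiled image, src is the linear staging copy; the block
    * size is now derived from the format inside the helper. */
   panfrost_store_tiled_image(tiled_bo_cpu, staging, x, y, w, h,
                              tiled_stride,    /* destination (tiled) stride */
                              staging_stride,  /* source (linear) stride */
                              format);
}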
diff --git a/src/panfrost/shared/pan_tiling.c b/src/panfrost/shared/pan_tiling.c
index 158fde9718a..01cd4ca6657 100644
--- a/src/panfrost/shared/pan_tiling.c
+++ b/src/panfrost/shared/pan_tiling.c
@@ -27,7 +27,7 @@
 #include "pan_tiling.h"
 #include <stdbool.h>
-#include <assert.h>
+#include "util/macros.h"

 /* This file implements software encode/decode of the tiling format used for
  * textures and framebuffers primarily on Utgard GPUs. Names for this format
@@ -83,7 +83,7 @@
  * 0b11001100. The idea is that for the bits in the solely Y place, we
  * get a Y place, and the bits in the XOR place *also* get a Y.
  */
-uint32_t bit_duplication[16] = {
+const uint32_t bit_duplication[16] = {
    0b00000000,
    0b00000011,
    0b00001100,
@@ -104,7 +104,7 @@ uint32_t bit_duplication[16] = {

 /* Space the bits out of a 4-bit nibble */

-unsigned space_4[16] = {
+const unsigned space_4[16] = {
    0b0000000,
    0b0000001,
    0b0000100,
@@ -129,69 +129,114 @@ unsigned space_4[16] = {
 #define TILE_HEIGHT 16
 #define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

-/* An optimized routine to tile an aligned (w & 0xF == 0) bpp4 texture */
-
-static void
-panfrost_store_tiled_image_bpp4(void *dst, const void *src,
-                                unsigned sx, unsigned sy,
-                                unsigned w, unsigned h,
-                                uint32_t dst_stride,
-                                uint32_t src_stride)
-{
-   /* Precompute the offset to the beginning of the first horizontal tile we're
-    * writing to, knowing that x is 16-aligned. Tiles themselves are
-    * stored linearly, so we get the X tile number by shifting and then
-    * multiply by the bytes per tile */
-
-   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * 4);
-
-   /* Iterate across the pixels we're trying to store in source-order */
-
-   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) {
-      /* For each pixel in the destination image, figure out the part
-       * corresponding to the 16x16 block index */
-
-      int block_y = y & ~0x0f;
-
-      /* In pixel coordinates (where the origin is the top-left), (block_y, 0)
-       * is the top-left corner of the leftmost tile in this row. While pixels
-       * are reordered within a block, the blocks themselves are stored
-       * linearly, so multiplying block_y by the pixel stride of the
-       * destination image equals the byte offset of that top-left corner of
-       * the block this row is in */
-
-      uint32_t *dest = (uint32_t *) (dest_start + (block_y * dst_stride));
-
-      /* The source is actually linear, so compute the byte offset to the start
-       * and end of this row in the source */
-
-      const uint32_t *source = src + (src_y * src_stride);
-      const uint32_t *source_end = source + w;
-
-      /* We want to duplicate the bits of the bottom nibble of Y */
-      unsigned expanded_y = bit_duplication[y & 0xF];
-
-      /* Iterate the row in source order. In the outer loop, we iterate 16
-       * bytes tiles. After each tile, we increment dest to include the size of
-       * that tile in pixels. */
-
-      for (; source < source_end; dest += PIXELS_PER_TILE) {
-         /* Within each tile, we iterate each of the 16 pixels in the row of
-          * the tile. This loop should be unrolled. */
-
-         for (int i = 0; i < 16; ++i) {
-            /* We have the X component spaced out in space_x and we have the Y
-             * component duplicated. So we just XOR them together. The X bits
-             * get the XOR like the pattern needs. The Y bits are XORing with
-             * zero so this is a no-op */
+/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type must
+ * only support copies and sizeof, so emulating with a packed structure works
+ * well enough, but if there's a native 128-bit type we may as well prefer
+ * that. */
+
+#ifdef __SIZEOF_INT128__
+typedef __uint128_t pan_uint128_t;
+#else
+typedef struct {
+  uint64_t lo;
+  uint64_t hi;
+} __attribute__((packed)) pan_uint128_t;
+#endif
+
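The comment above relies on the 128-bit carrier type only ever being copied and measured with sizeof. A small standalone sketch of that assumption (example_uint128 is a stand-in for pan_uint128_t on compilers without __uint128_t):

#include <stdint.h>

typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) example_uint128;

/* The tiling loops only ever do whole-texel assignments like the one below,
 * so the struct emulation and a native __uint128_t are interchangeable. */
_Static_assert(sizeof(example_uint128) == 16,
               "128-bit texels must copy as exactly 16 bytes");

static inline void
copy_one_texel_128(void *dst, const void *src)
{
   *(example_uint128 *) dst = *(const example_uint128 *) src;
}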
+/* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
+ *
+ * dest_start precomputes the offset to the beginning of the first horizontal
+ * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
+ * stored linearly, so we get the X tile number by shifting and then multiply
+ * by the bytes per tile.
+ *
+ * We iterate across the pixels we're trying to store in source order. For each
+ * row in the destination image, we figure out which row of 16x16 blocks we're
+ * in, by slicing off the lower 4 bits (block_y).
+ *
+ * dest then precomputes the location of the top-left corner of the block the
+ * row starts in. In pixel coordinates (where the origin is the top-left),
+ * (block_y, 0) is the top-left corner of the leftmost tile in this row. While
+ * pixels are reordered within a block, the blocks themselves are stored
+ * linearly, so multiplying block_y by the pixel stride of the destination
+ * image equals the byte offset of that top-left corner of the block this row
+ * is in.
+ *
+ * On the other hand, the source is linear, so we compute the locations of the
+ * start and end of the row in the source by simple linear addressing.
+ *
+ * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
+ * y0] value. Since this is constant across a row, we look it up per row and
+ * store it in expanded_y.
+ *
+ * Finally, we iterate each row in source order. In the outer loop, we iterate
+ * each 16-pixel tile. Within each tile, we iterate the 16 pixels (this should
+ * be unrolled), calculating the index within the tile and writing.
+ */
-            unsigned index = expanded_y ^ space_4[i];
+#define TILED_STORE_TYPE(pixel_t, shift) \
+static void \
+panfrost_store_tiled_image_##pixel_t \
+                        (void *dst, const void *src, \
+                         uint16_t sx, uint16_t sy, \
+                         uint16_t w, uint16_t h, \
+                         uint32_t dst_stride, \
+                         uint32_t src_stride) \
+{ \
+   uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
+   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
+      uint16_t block_y = y & ~0x0f; \
+      uint8_t *dest = (uint8_t *) (dest_start + (block_y * dst_stride)); \
+      const pixel_t *source = src + (src_y * src_stride); \
+      const pixel_t *source_end = source + w; \
+      unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
+      for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
+         for (uint8_t i = 0; i < 16; ++i) { \
+            unsigned index = expanded_y ^ (space_4[i] << shift); \
+            *((pixel_t *) (dest + index)) = *(source++); \
+         } \
+      } \
+   } \
+} \
+
+TILED_STORE_TYPE(uint8_t, 0);
+TILED_STORE_TYPE(uint16_t, 1);
+TILED_STORE_TYPE(uint32_t, 2);
+TILED_STORE_TYPE(uint64_t, 3);
+TILED_STORE_TYPE(pan_uint128_t, 4);
+
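To make the explanation above concrete, the byte offset the macro computes for a texel at absolute coordinates (x, y) can be written as a single expression. This sketch assumes the bit_duplication, space_4 and PIXELS_PER_TILE definitions from this file are in scope, with shift = log2(bytes per texel):

/* Sketch only: equivalent to the dest_start / block_y / expanded_y
 * bookkeeping in TILED_STORE_TYPE, folded into one function. */
static inline unsigned
example_tiled_byte_offset(unsigned x, unsigned y,
                          unsigned shift, uint32_t dst_stride)
{
   unsigned block_y = y & ~0x0f;                 /* tile row, in pixel rows   */
   unsigned tile_x  = x >> 4;                    /* tile column               */
   unsigned within  = (bit_duplication[y & 0xF] ^ space_4[x & 0xF]) << shift;

   return block_y * dst_stride                   /* start of this tile row    */
        + tile_x * (PIXELS_PER_TILE << shift)    /* whole tiles to the left   */
        + within;                                /* interleaved index in tile */
}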
+#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
+   const unsigned mask = (1 << tile_shift) - 1; \
+   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
+      unsigned block_y = y & ~mask; \
+      unsigned block_start_s = block_y * dst_stride; \
+      unsigned source_start = src_y * src_stride; \
+      unsigned expanded_y = bit_duplication[y & mask]; \
+ \
+      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
+         unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
+         unsigned index = expanded_y ^ space_4[x & mask]; \
+         uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
+         uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
+ \
+         pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
+         pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
+         *outp = *inp; \
+      } \
+   } \
+}

-            /* Copy over the pixel */
-            dest[index] = *(source++);
-         }
-      }
-   }
+#define TILED_UNALIGNED_TYPES(store, shift) { \
+   if (bpp == 8) \
+      TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
+   else if (bpp == 16) \
+      TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
+   else if (bpp == 32) \
+      TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
+   else if (bpp == 64) \
+      TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
+   else if (bpp == 128) \
+      TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
 }

 static void
@@ -200,109 +245,118 @@ panfrost_access_tiled_image_generic(void *dst, void *src,
                                     unsigned w, unsigned h,
                                     uint32_t dst_stride,
                                     uint32_t src_stride,
-                                    uint32_t bpp,
-                                    bool is_store)
+                                    const struct util_format_description *desc,
+                                    bool _is_store)
 {
-   for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) {
-      int block_y = y & ~0x0f;
-      int block_start_s = block_y * dst_stride;
-      int source_start = src_y * src_stride;
+   unsigned bpp = desc->block.bits;
+
+   if (desc->block.width > 1) {
+      w = DIV_ROUND_UP(w, desc->block.width);
+      h = DIV_ROUND_UP(h, desc->block.height);
+
+      if (_is_store)
+         TILED_UNALIGNED_TYPES(true, 2)
+      else
+         TILED_UNALIGNED_TYPES(false, 2)
+   } else {
+      if (_is_store)
+         TILED_UNALIGNED_TYPES(true, 4)
+      else
+         TILED_UNALIGNED_TYPES(false, 4)
+   }
+}
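For block-compressed formats the generic path above works in units of blocks rather than pixels, and groups them into 4x4 tiles (tile_shift = 2). A hedged sketch of the extent conversion it performs, using an assumed 4x4-block format; the helper name is hypothetical:

#include "util/format/u_format.h"
#include "util/macros.h"

/* Sketch: a 37x23 pixel region of a 4x4-block format becomes a 10x6
 * region of blocks, which the generic path then tiles in 4x4 groups. */
static void
example_block_space_extent(enum pipe_format format,
                           unsigned w_px, unsigned h_px,
                           unsigned *w_blk, unsigned *h_blk)
{
   const struct util_format_description *desc = util_format_description(format);

   *w_blk = DIV_ROUND_UP(w_px, desc->block.width);
   *h_blk = DIV_ROUND_UP(h_px, desc->block.height);
}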
-      unsigned expanded_y = bit_duplication[y & 0xF];
+#define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))

-      for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) {
-         int block_x_s = (x >> 4) * 256;
+void
+panfrost_store_tiled_image(void *dst, const void *src,
+                           unsigned x, unsigned y,
+                           unsigned w, unsigned h,
+                           uint32_t dst_stride,
+                           uint32_t src_stride,
+                           enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);

-         unsigned index = expanded_y ^ space_4[x & 0xF];
+   if (desc->block.width > 1) {
+      panfrost_access_tiled_image_generic(dst, (void *) src,
+                                          x, y, w, h,
+                                          dst_stride, src_stride, desc, true);

-         uint8_t *src8 = src;
-         uint8_t *source = &src8[source_start + bpp * src_x];
-         uint8_t *dest = dst + block_start_s + bpp * (block_x_s + index);
+      return;
+   }

-         uint8_t *out = is_store ? dest : source;
-         uint8_t *in = is_store ? source : dest;
+   unsigned bpp = desc->block.bits;
+   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
+   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
+   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
+   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;

-         uint16_t *out16 = (uint16_t *) out;
-         uint16_t *in16 = (uint16_t *) in;
+   /* First, tile the top portion */

-         uint32_t *out32 = (uint32_t *) out;
-         uint32_t *in32 = (uint32_t *) in;
+   unsigned orig_x = x, orig_y = y;

-         uint64_t *out64 = (uint64_t *) out;
-         uint64_t *in64 = (uint64_t *) in;
+   if (first_full_tile_y != y) {
+      unsigned dist = MIN2(first_full_tile_y - y, h);

-         /* Write out 1-16 bytes. Written like this rather than a loop so the
-          * compiler can see what's going on */
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
+                                          x, y, w, dist,
+                                          dst_stride, src_stride, desc, true);

-         switch (bpp) {
-         case 1:
-            out[0] = in[0];
-            break;
+      if (dist == h)
+         return;

-         case 2:
-            out16[0] = in16[0];
-            break;
+      y += dist;
+      h -= dist;
+   }

-         case 3:
-            out16[0] = in16[0];
-            out[2] = in[2];
-            break;
+   /* Next, the bottom portion */
+   if (last_full_tile_y != (y + h)) {
+      unsigned dist = (y + h) - last_full_tile_y;

-         case 4:
-            out32[0] = in32[0];
-            break;
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
+                                          x, last_full_tile_y, w, dist,
+                                          dst_stride, src_stride, desc, true);

-         case 6:
-            out32[0] = in32[0];
-            out16[2] = in16[2];
-            break;
+      h -= dist;
+   }

-         case 8:
-            out64[0] = in64[0];
-            break;
+   /* The left portion */
+   if (first_full_tile_x != x) {
+      unsigned dist = MIN2(first_full_tile_x - x, w);

-         case 12:
-            out64[0] = in64[0];
-            out32[2] = in32[2];
-            break;
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
+                                          x, y, dist, h,
+                                          dst_stride, src_stride, desc, true);

-         case 16:
-            out64[0] = in64[0];
-            out64[1] = in64[1];
-            break;
+      if (dist == w)
+         return;

-         default:
-            assert(0); /* Invalid */
-         }
-      }
+      x += dist;
+      w -= dist;
    }
-}

-void
-panfrost_store_tiled_image(void *dst, const void *src,
-                           unsigned x, unsigned y,
-                           unsigned w, unsigned h,
-                           uint32_t dst_stride,
-                           uint32_t src_stride,
-                           uint32_t bpp)
-{
-   /* The optimized path is for aligned writes specifically */
-
-   if (x & 0xF || w & 0xF) {
-      panfrost_access_tiled_image_generic(dst, (void *) src, x, y, w, h, dst_stride, src_stride, bpp, true);
-      return;
-   }
+   /* Finally, the right portion */
+   if (last_full_tile_x != (x + w)) {
+      unsigned dist = (x + w) - last_full_tile_x;

-   /* Attempt to use an optimized path if we have one */
+      panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
+                                          last_full_tile_x, y, dist, h,
+                                          dst_stride, src_stride, desc, true);

-   switch (bpp) {
-   case 4:
-      panfrost_store_tiled_image_bpp4(dst, (void *) src, x, y, w, h, dst_stride, src_stride);
-      break;
-   default:
-      panfrost_access_tiled_image_generic(dst, (void *) src, x, y, w, h, dst_stride, src_stride, bpp, true);
-      break;
+      w -= dist;
    }
+
+   if (bpp == 8)
+      panfrost_store_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 16)
+      panfrost_store_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 32)
+      panfrost_store_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 64)
+      panfrost_store_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
+   else if (bpp == 128)
+      panfrost_store_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride);
 }

 void
@@ -311,7 +365,8 @@ panfrost_load_tiled_image(void *dst, const void *src,
                           unsigned w, unsigned h,
                           uint32_t dst_stride,
                           uint32_t src_stride,
-                          uint32_t bpp)
+                          enum pipe_format format)
 {
-   panfrost_access_tiled_image_generic((void *) src, dst, x, y, w, h, src_stride, dst_stride, bpp, false);
+   const struct util_format_description *desc = util_format_description(format);
+   panfrost_access_tiled_image_generic((void *) src, dst, x, y, w, h, src_stride, dst_stride, desc, false);
 }
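The store path above splits an arbitrary rectangle into up to four unaligned borders, handled by the generic per-texel path, plus a 16-aligned interior handled by the fast per-tile routines. A sketch of the same arithmetic in isolation, printing the interior that would remain (EX_TILE and the function name are assumptions for the sketch):

#include <stdio.h>
#include "util/macros.h"

#define EX_TILE 16

/* Sketch: mirrors the first/last full tile computations above. */
static void
example_split_interior(unsigned x, unsigned y, unsigned w, unsigned h)
{
   unsigned x0 = DIV_ROUND_UP(x, EX_TILE) * EX_TILE;   /* first full tile column  */
   unsigned y0 = DIV_ROUND_UP(y, EX_TILE) * EX_TILE;   /* first full tile row     */
   unsigned x1 = ((x + w) / EX_TILE) * EX_TILE;        /* end of last full column */
   unsigned y1 = ((y + h) / EX_TILE) * EX_TILE;        /* end of last full row    */

   /* Borders: rows [y, y0) and [y1, y + h), columns [x, x0) and [x1, x + w)
    * go through panfrost_access_tiled_image_generic; the rest is aligned. */
   if (x1 > x0 && y1 > y0)
      printf("aligned interior: %ux%u at (%u, %u)\n", x1 - x0, y1 - y0, x0, y0);
   else
      printf("no aligned interior; everything takes the generic path\n");
}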
diff --git a/src/panfrost/shared/pan_tiling.h b/src/panfrost/shared/pan_tiling.h
index e13d50c41e4..d8591e6dbdd 100644
--- a/src/panfrost/shared/pan_tiling.h
+++ b/src/panfrost/shared/pan_tiling.h
@@ -28,19 +28,20 @@
 #define H_PANFROST_TILING

 #include <stdint.h>
+#include <util/format/u_format.h>

 void panfrost_load_tiled_image(void *dst, const void *src,
                                unsigned x, unsigned y,
                                unsigned w, unsigned h,
                                uint32_t dst_stride,
                                uint32_t src_stride,
-                               uint32_t bpp);
+                               enum pipe_format format);

 void panfrost_store_tiled_image(void *dst, const void *src,
                                 unsigned x, unsigned y,
                                 unsigned w, unsigned h,
                                 uint32_t dst_stride,
                                 uint32_t src_stride,
-                                uint32_t bpp);
+                                enum pipe_format format);

 #endif
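As a closing illustration of the new header API, a self-contained round-trip check on a small 16-aligned image; the dimensions, strides, and the choice of PIPE_FORMAT_R8G8B8A8_UNORM are assumptions for the sketch, not taken from the patch:

#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "pan_tiling.h"

static void
roundtrip_sketch(void)
{
   const unsigned w = 32, h = 32, bpp = 4;   /* 32x32 RGBA8 */
   const uint32_t stride = w * bpp;          /* same stride linear and tiled */

   uint8_t *linear = malloc(stride * h);
   uint8_t *tiled  = calloc(stride * h, 1);
   uint8_t *back   = calloc(stride * h, 1);

   for (unsigned i = 0; i < stride * h; ++i)
      linear[i] = i & 0xff;

   panfrost_store_tiled_image(tiled, linear, 0, 0, w, h,
                              stride, stride, PIPE_FORMAT_R8G8B8A8_UNORM);
   panfrost_load_tiled_image(back, tiled, 0, 0, w, h,
                             stride, stride, PIPE_FORMAT_R8G8B8A8_UNORM);

   /* Tiling and then detiling the same rectangle must be the identity. */
   assert(memcmp(linear, back, stride * h) == 0);

   free(linear);
   free(tiled);
   free(back);
}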