From fcfb7f573c6f8e70e5213c5c0d9e1566d45eadd7 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Mon, 17 Dec 2018 12:54:42 -0800
Subject: v3d: Load and store aligned utiles all at once.

This calls the expensive uif offset function once per utile, but it
still gets us a 212.218% +/- 2.41216% (n=10) win on 1024x1024 glTexImage
over calling it on each pixel.
---
 src/gallium/drivers/v3d/v3d_tiling.c | 122 ++++++++++++++++++++++++++++++++---
 1 file changed, 114 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/v3d/v3d_tiling.c b/src/gallium/drivers/v3d/v3d_tiling.c
index bc227be23ac..9fd801c3e95 100644
--- a/src/gallium/drivers/v3d/v3d_tiling.c
+++ b/src/gallium/drivers/v3d/v3d_tiling.c
@@ -31,6 +31,7 @@
 #include "v3d_screen.h"
 #include "v3d_context.h"
 #include "v3d_tiling.h"
+#include "broadcom/common/v3d_cpu_tiling.h"
 
 /** Return the width in pixels of a 64-byte microtile. */
 uint32_t
@@ -211,15 +212,19 @@ v3d_get_uif_no_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
         return v3d_get_uif_pixel_offset(cpp, image_h, x, y, false);
 }
 
+/* Loads/stores non-utile-aligned boxes by walking over the destination
+ * rectangle, computing the address on the GPU, and storing/loading a pixel at
+ * a time.
+ */
 static inline void
-v3d_move_pixels_general_percpp(void *gpu, uint32_t gpu_stride,
-                               void *cpu, uint32_t cpu_stride,
-                               int cpp, uint32_t image_h,
-                               const struct pipe_box *box,
-                               uint32_t (*get_pixel_offset)(uint32_t cpp,
-                                                            uint32_t image_h,
-                                                            uint32_t x, uint32_t y),
-                               bool is_load)
+v3d_move_pixels_unaligned(void *gpu, uint32_t gpu_stride,
+                          void *cpu, uint32_t cpu_stride,
+                          int cpp, uint32_t image_h,
+                          const struct pipe_box *box,
+                          uint32_t (*get_pixel_offset)(uint32_t cpp,
+                                                       uint32_t image_h,
+                                                       uint32_t x, uint32_t y),
+                          bool is_load)
 {
         for (uint32_t y = 0; y < box->height; y++) {
                 void *cpu_row = cpu + y * cpu_stride;
@@ -248,6 +253,107 @@ v3d_move_pixels_general_percpp(void *gpu, uint32_t gpu_stride,
         }
 }
 
+/* Breaks the image down into utiles and calls either the fast whole-utile
+ * load/store functions, or the unaligned fallback case.
+ */
+static inline void
+v3d_move_pixels_general_percpp(void *gpu, uint32_t gpu_stride,
+                               void *cpu, uint32_t cpu_stride,
+                               int cpp, uint32_t image_h,
+                               const struct pipe_box *box,
+                               uint32_t (*get_pixel_offset)(uint32_t cpp,
+                                                            uint32_t image_h,
+                                                            uint32_t x, uint32_t y),
+                               bool is_load)
+{
+        uint32_t utile_w = v3d_utile_width(cpp);
+        uint32_t utile_h = v3d_utile_height(cpp);
+        uint32_t utile_gpu_stride = utile_w * cpp;
+        uint32_t x1 = box->x;
+        uint32_t y1 = box->y;
+        uint32_t x2 = box->x + box->width;
+        uint32_t y2 = box->y + box->height;
+        uint32_t align_x1 = align(x1, utile_w);
+        uint32_t align_y1 = align(y1, utile_h);
+        uint32_t align_x2 = x2 & ~(utile_w - 1);
+        uint32_t align_y2 = y2 & ~(utile_h - 1);
+
+        /* Load/store all the whole utiles first. */
+        for (uint32_t y = align_y1; y < align_y2; y += utile_h) {
+                void *cpu_row = cpu + (y - box->y) * cpu_stride;
+
+                for (uint32_t x = align_x1; x < align_x2; x += utile_w) {
+                        void *utile_gpu = (gpu +
+                                           get_pixel_offset(cpp, image_h, x, y));
+                        void *utile_cpu = cpu_row + (x - box->x) * cpp;
+
+                        if (is_load) {
+                                v3d_load_utile(utile_cpu, cpu_stride,
+                                               utile_gpu, utile_gpu_stride);
+                        } else {
+                                v3d_store_utile(utile_gpu, utile_gpu_stride,
+                                                utile_cpu, cpu_stride);
+                        }
+                }
+        }
+
+        /* If there were no aligned utiles in the middle, load/store the whole
+         * thing unaligned.
+         */
+        if (align_y2 <= align_y1 ||
+            align_x2 <= align_x1) {
+                v3d_move_pixels_unaligned(gpu, gpu_stride,
+                                          cpu, cpu_stride,
+                                          cpp, image_h,
+                                          box,
+                                          get_pixel_offset, is_load);
+                return;
+        }
+
+        /* Load/store the partial utiles. */
+        struct pipe_box partial_boxes[4] = {
+                /* Top */
+                {
+                        .x = x1,
+                        .width = x2 - x1,
+                        .y = y1,
+                        .height = align_y1 - y1,
+                },
+                /* Bottom */
+                {
+                        .x = x1,
+                        .width = x2 - x1,
+                        .y = align_y2,
+                        .height = y2 - align_y2,
+                },
+                /* Left */
+                {
+                        .x = x1,
+                        .width = align_x1 - x1,
+                        .y = align_y1,
+                        .height = align_y2 - align_y1,
+                },
+                /* Right */
+                {
+                        .x = align_x2,
+                        .width = x2 - align_x2,
+                        .y = align_y1,
+                        .height = align_y2 - align_y1,
+                },
+        };
+        for (int i = 0; i < ARRAY_SIZE(partial_boxes); i++) {
+                void *partial_cpu = (cpu +
+                                     (partial_boxes[i].y - y1) * cpu_stride +
+                                     (partial_boxes[i].x - x1) * cpp);
+
+                v3d_move_pixels_unaligned(gpu, gpu_stride,
+                                          partial_cpu, cpu_stride,
+                                          cpp, image_h,
+                                          &partial_boxes[i],
+                                          get_pixel_offset, is_load);
+        }
+}
+
 static inline void
 v3d_move_pixels_general(void *gpu, uint32_t gpu_stride,
                         void *cpu, uint32_t cpu_stride,
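
[Editor's note] A minimal standalone sketch, not part of the patch, of the box
decomposition v3d_move_pixels_general_percpp() performs above: the
utile-aligned interior takes the whole-utile fast path and four edge strips
fall back to the per-pixel path. align_up() stands in for Mesa's align() from
util/u_math.h, the 4x4 utile is the cpp == 4 case of
v3d_utile_width()/v3d_utile_height(), and the box coordinates are arbitrary
example values.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Round v up to a power-of-two alignment a, like util/u_math.h's align(). */
static uint32_t align_up(uint32_t v, uint32_t a)
{
        return (v + a - 1) & ~(a - 1);
}

int main(void)
{
        const uint32_t utile_w = 4, utile_h = 4;  /* 64-byte utile at cpp == 4 */
        const uint32_t x1 = 3, y1 = 2, x2 = 30, y2 = 27;  /* unaligned box */

        /* Same rounding the patch does to find the whole-utile interior. */
        uint32_t align_x1 = align_up(x1, utile_w);   /* 4 */
        uint32_t align_y1 = align_up(y1, utile_h);   /* 4 */
        uint32_t align_x2 = x2 & ~(utile_w - 1);     /* 28 */
        uint32_t align_y2 = y2 & ~(utile_h - 1);     /* 24 */

        uint32_t interior = (align_x2 - align_x1) * (align_y2 - align_y1);
        uint32_t total = (x2 - x1) * (y2 - y1);

        /* The four strips, matching partial_boxes[]: top, bottom, left, right. */
        uint32_t top = (x2 - x1) * (align_y1 - y1);
        uint32_t bottom = (x2 - x1) * (y2 - align_y2);
        uint32_t left = (align_x1 - x1) * (align_y2 - align_y1);
        uint32_t right = (x2 - align_x2) * (align_y2 - align_y1);

        /* Interior plus strips must cover the box exactly once. */
        assert(interior + top + bottom + left + right == total);

        printf("%u of %u pixels take the whole-utile fast path\n",
               interior, total);   /* prints "480 of 675 ..." */
        return 0;
}

For a utile-aligned upload like the 1024x1024 glTexImage in the commit
message, the interior covers the whole box, so get_pixel_offset() runs once
per utile (16 pixels at cpp == 4) instead of once per pixel.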
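[Editor's note] The v3d_load_utile()/v3d_store_utile() fast paths come from
the newly included broadcom/common/v3d_cpu_tiling.h. As a rough sketch of the
idea only (the header also carries NEON variants), a whole-utile load boils
down to one memcpy per utile row, since the 64-byte utile is contiguous in
the tiled buffer; load_utile_ref() is a hypothetical name for illustration,
not the header's API.

#include <stdint.h>
#include <string.h>

/* Copy one 64-byte utile out of the tiled GPU buffer into a linear CPU
 * image. gpu_stride is utile_w * cpp, i.e. the byte length of one utile
 * row, so the utile holds 64 / gpu_stride rows. */
static void load_utile_ref(void *cpu, uint32_t cpu_stride,
                           const void *gpu, uint32_t gpu_stride)
{
        for (uint32_t gpu_offset = 0; gpu_offset < 64;
             gpu_offset += gpu_stride) {
                memcpy(cpu, (const char *)gpu + gpu_offset, gpu_stride);
                cpu = (char *)cpu + cpu_stride;
        }
}

A store is the same loop with the memcpy() source and destination swapped.
The win over v3d_move_pixels_unaligned() is that the expensive
get_pixel_offset() runs once per utile and each row moves in a single
memcpy() rather than as cpp-byte pixel copies.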