From f6a0f4f41e9a4d3f0ea9c3bf82497c384e684843 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 17 Dec 2018 11:10:11 -0800
Subject: vc4: Move the utile load/store functions to a header for reuse by
 v3d.

These implementations of whole-utile load/stores would be the same for
v3d, though the layout of blocks of utiles has changed.
---
 src/gallium/drivers/vc4/vc4_tiling_lt.c      | 211 ++-------------------------
 src/gallium/drivers/vc4/vc4_tiling_lt_neon.c |   2 +-
 2 files changed, 11 insertions(+), 202 deletions(-)

(limited to 'src/gallium/drivers/vc4')

diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c
index ec42a3dc2f7..d2a84bb3540 100644
--- a/src/gallium/drivers/vc4/vc4_tiling_lt.c
+++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -26,7 +26,7 @@
  * Helper functions from vc4_tiling.c that will be compiled for using NEON
  * assembly or not.
  *
- * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon.
+ * If V3D_BUILD_NEON is set, then the functions will be suffixed with _neon.
  * They will only use NEON assembly if __ARM_ARCH is also set, to keep the x86
  * sim build working.
  */
@@ -34,8 +34,9 @@
 #include <string.h>
 #include "pipe/p_state.h"
 #include "vc4_tiling.h"
+#include "broadcom/common/v3d_cpu_tiling.h"
 
-#ifdef VC4_BUILD_NEON
+#ifdef V3D_BUILD_NEON
 #define NEON_TAG(x) x ## _neon
 #else
 #define NEON_TAG(x) x ## _base
@@ -63,201 +64,6 @@ vc4_utile_stride(int cpp)
         }
 }
 
-static void
-vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
-{
-        uint32_t gpu_stride = vc4_utile_stride(cpp);
-#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
-        if (gpu_stride == 8) {
-                __asm__ volatile (
-                        /* Load from the GPU in one shot, no interleave, to
-                         * d0-d7.
-                         */
-                        "vldm %0, {q0, q1, q2, q3}\n"
-                        /* Store each 8-byte line to cpu-side destination,
-                         * incrementing it by the stride each time.
-                         */
-                        "vst1.8 d0, [%1], %2\n"
-                        "vst1.8 d1, [%1], %2\n"
-                        "vst1.8 d2, [%1], %2\n"
-                        "vst1.8 d3, [%1], %2\n"
-                        "vst1.8 d4, [%1], %2\n"
-                        "vst1.8 d5, [%1], %2\n"
-                        "vst1.8 d6, [%1], %2\n"
-                        "vst1.8 d7, [%1]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
-                        : "q0", "q1", "q2", "q3");
-        } else {
-                assert(gpu_stride == 16);
-                __asm__ volatile (
-                        /* Load from the GPU in one shot, no interleave, to
-                         * d0-d7.
-                         */
-                        "vldm %0, {q0, q1, q2, q3};\n"
-                        /* Store each 16-byte line in 2 parts to the cpu-side
-                         * destination. (vld1 can only store one d-register
-                         * at a time).
-                         */
-                        "vst1.8 d0, [%1], %3\n"
-                        "vst1.8 d1, [%2], %3\n"
-                        "vst1.8 d2, [%1], %3\n"
-                        "vst1.8 d3, [%2], %3\n"
-                        "vst1.8 d4, [%1], %3\n"
-                        "vst1.8 d5, [%2], %3\n"
-                        "vst1.8 d6, [%1]\n"
-                        "vst1.8 d7, [%2]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
-                        : "q0", "q1", "q2", "q3");
-        }
-#elif defined (PIPE_ARCH_AARCH64)
-        if (gpu_stride == 8) {
-                __asm__ volatile (
-                        /* Load from the GPU in one shot, no interleave, to
-                         * d0-d7.
-                         */
-                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
-                        /* Store each 8-byte line to cpu-side destination,
-                         * incrementing it by the stride each time.
-                         */
-                        "st1 {v0.D}[0], [%1], %2\n"
-                        "st1 {v0.D}[1], [%1], %2\n"
-                        "st1 {v1.D}[0], [%1], %2\n"
-                        "st1 {v1.D}[1], [%1], %2\n"
-                        "st1 {v2.D}[0], [%1], %2\n"
-                        "st1 {v2.D}[1], [%1], %2\n"
-                        "st1 {v3.D}[0], [%1], %2\n"
-                        "st1 {v3.D}[1], [%1]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
-                        : "v0", "v1", "v2", "v3");
-        } else {
-                assert(gpu_stride == 16);
-                __asm__ volatile (
-                        /* Load from the GPU in one shot, no interleave, to
-                         * d0-d7.
-                         */
-                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
-                        /* Store each 16-byte line in 2 parts to the cpu-side
-                         * destination. (vld1 can only store one d-register
-                         * at a time).
-                         */
-                        "st1 {v0.D}[0], [%1], %3\n"
-                        "st1 {v0.D}[1], [%2], %3\n"
-                        "st1 {v1.D}[0], [%1], %3\n"
-                        "st1 {v1.D}[1], [%2], %3\n"
-                        "st1 {v2.D}[0], [%1], %3\n"
-                        "st1 {v2.D}[1], [%2], %3\n"
-                        "st1 {v3.D}[0], [%1]\n"
-                        "st1 {v3.D}[1], [%2]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
-                        : "v0", "v1", "v2", "v3");
-        }
-#else
-        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
-                memcpy(cpu, gpu + gpu_offset, gpu_stride);
-                cpu += cpu_stride;
-        }
-#endif
-}
-
-static void
-vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
-{
-        uint32_t gpu_stride = vc4_utile_stride(cpp);
-
-#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM)
-        if (gpu_stride == 8) {
-                __asm__ volatile (
-                        /* Load each 8-byte line from cpu-side source,
-                         * incrementing it by the stride each time.
-                         */
-                        "vld1.8 d0, [%1], %2\n"
-                        "vld1.8 d1, [%1], %2\n"
-                        "vld1.8 d2, [%1], %2\n"
-                        "vld1.8 d3, [%1], %2\n"
-                        "vld1.8 d4, [%1], %2\n"
-                        "vld1.8 d5, [%1], %2\n"
-                        "vld1.8 d6, [%1], %2\n"
-                        "vld1.8 d7, [%1]\n"
-                        /* Load from the GPU in one shot, no interleave, to
-                         * d0-d7.
-                         */
-                        "vstm %0, {q0, q1, q2, q3}\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
-                        : "q0", "q1", "q2", "q3");
-        } else {
-                assert(gpu_stride == 16);
-                __asm__ volatile (
-                        /* Load each 16-byte line in 2 parts from the cpu-side
-                         * destination. (vld1 can only store one d-register
-                         * at a time).
-                         */
-                        "vld1.8 d0, [%1], %3\n"
-                        "vld1.8 d1, [%2], %3\n"
-                        "vld1.8 d2, [%1], %3\n"
-                        "vld1.8 d3, [%2], %3\n"
-                        "vld1.8 d4, [%1], %3\n"
-                        "vld1.8 d5, [%2], %3\n"
-                        "vld1.8 d6, [%1]\n"
-                        "vld1.8 d7, [%2]\n"
-                        /* Store to the GPU in one shot, no interleave. */
-                        "vstm %0, {q0, q1, q2, q3}\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
-                        : "q0", "q1", "q2", "q3");
-        }
-#elif defined (PIPE_ARCH_AARCH64)
-        if (gpu_stride == 8) {
-                __asm__ volatile (
-                        /* Load each 8-byte line from cpu-side source,
-                         * incrementing it by the stride each time.
-                         */
-                        "ld1 {v0.D}[0], [%1], %2\n"
-                        "ld1 {v0.D}[1], [%1], %2\n"
-                        "ld1 {v1.D}[0], [%1], %2\n"
-                        "ld1 {v1.D}[1], [%1], %2\n"
-                        "ld1 {v2.D}[0], [%1], %2\n"
-                        "ld1 {v2.D}[1], [%1], %2\n"
-                        "ld1 {v3.D}[0], [%1], %2\n"
-                        "ld1 {v3.D}[1], [%1]\n"
-                        /* Store to the GPU in one shot, no interleave. */
-                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
-                        : "v0", "v1", "v2", "v3");
-        } else {
-                assert(gpu_stride == 16);
-                __asm__ volatile (
-                        /* Load each 16-byte line in 2 parts from the cpu-side
-                         * destination. (vld1 can only store one d-register
-                         * at a time).
-                         */
-                        "ld1 {v0.D}[0], [%1], %3\n"
-                        "ld1 {v0.D}[1], [%2], %3\n"
-                        "ld1 {v1.D}[0], [%1], %3\n"
-                        "ld1 {v1.D}[1], [%2], %3\n"
-                        "ld1 {v2.D}[0], [%1], %3\n"
-                        "ld1 {v2.D}[1], [%2], %3\n"
-                        "ld1 {v3.D}[0], [%1]\n"
-                        "ld1 {v3.D}[1], [%2]\n"
-                        /* Store to the GPU in one shot, no interleave. */
-                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
-                        :
-                        : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
-                        : "v0", "v1", "v2", "v3");
-        }
-#else
-        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
-                memcpy(gpu + gpu_offset, cpu, gpu_stride);
-                cpu += cpu_stride;
-        }
-#endif
-
-}
 
 /**
  * Returns the X value into the address bits for LT tiling.
  *
@@ -333,6 +139,7 @@ vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride,
 {
         uint32_t utile_w = vc4_utile_width(cpp);
         uint32_t utile_h = vc4_utile_height(cpp);
+        uint32_t utile_stride = vc4_utile_stride(cpp);
         uint32_t xstart = box->x;
         uint32_t ystart = box->y;
 
@@ -341,15 +148,17 @@ vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride,
                         void *gpu_tile = gpu + ((ystart + y) * gpu_stride +
                                                 (xstart + x) * 64 / utile_w);
                         if (to_cpu) {
-                                vc4_load_utile(cpu + (cpu_stride * y +
+                                v3d_load_utile(cpu + (cpu_stride * y +
                                                x * cpp),
+                                               cpu_stride,
                                                gpu_tile,
-                                               cpu_stride, cpp);
+                                               utile_stride);
                         } else {
-                                vc4_store_utile(gpu_tile,
+                                v3d_store_utile(gpu_tile,
+                                                utile_stride,
                                                 cpu + (cpu_stride * y +
                                                        x * cpp),
-                                                cpu_stride, cpp);
+                                                cpu_stride);
                         }
                 }
         }
diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c b/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c
index 7ba66ae4cdf..9efec379933 100644
--- a/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c
+++ b/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c
@@ -26,5 +26,5 @@
  * single file.
  */
 
-#define VC4_BUILD_NEON
+#define V3D_BUILD_NEON
 #include "vc4_tiling_lt.c"
-- 
cgit v1.2.3
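Note: the v3d_load_utile()/v3d_store_utile() helpers this patch starts
calling live in src/broadcom/common/v3d_cpu_tiling.h, which falls outside
the path filter above and so does not appear in this diff. As a rough
sketch of the contract the call sites rely on (not the verbatim header
contents), the plain-C fallback path would look like the following; the
signatures are inferred from the call sites in vc4_lt_image_aligned(),
and the NEON fast paths seen in the removed code become accelerated
variants of the same functions.

/* Sketch only: the real header is not part of this path-limited diff.
 * A utile is always 64 bytes of raw GPU memory; only its row stride
 * (8 or 16 bytes) varies with cpp.
 */
#include <stdint.h>
#include <string.h>

static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
        /* Copy one 64-byte utile out of tiled GPU memory, one
         * gpu_stride-wide row at a time, into the linear CPU image.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64;
             gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}

static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
        /* The inverse: gather gpu_stride-wide rows from the linear CPU
         * image into one 64-byte utile of tiled GPU memory.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64;
             gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}

As in the removed fallbacks, the arithmetic on void pointers relies on
the GCC extension this code already used, and passing gpu_stride as a
parameter (rather than deriving it from cpp with vc4_utile_stride()) is
what lets v3d reuse the helpers with its own utile layout.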