diff options
-rw-r--r-- | src/broadcom/Makefile.sources | 1 | ||||
-rw-r--r-- | src/broadcom/common/v3d_cpu_tiling.h | 222 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_tiling_lt.c | 211 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_tiling_lt_neon.c | 2 |
4 files changed, 234 insertions, 202 deletions
diff --git a/src/broadcom/Makefile.sources b/src/broadcom/Makefile.sources index 5955acdefd5..f535447b476 100644 --- a/src/broadcom/Makefile.sources +++ b/src/broadcom/Makefile.sources @@ -17,6 +17,7 @@ BROADCOM_FILES = \ clif/clif_dump.c \ clif/clif_dump.h \ clif/clif_private.h \ + common/v3d_cpu_tiling.h \ common/v3d_debug.c \ common/v3d_debug.h \ common/v3d_device_info.h \ diff --git a/src/broadcom/common/v3d_cpu_tiling.h b/src/broadcom/common/v3d_cpu_tiling.h new file mode 100644 index 00000000000..15678f9e925 --- /dev/null +++ b/src/broadcom/common/v3d_cpu_tiling.h @@ -0,0 +1,222 @@ +/* + * Copyright © 2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file v3d_cpu_tiling.h + * + * Contains load/store functions common to both v3d and vc4. The utile layout + * stayed the same, though the way utiles get laid out has changed. 
+ */ + +static inline void +v3d_load_utile(void *cpu, uint32_t cpu_stride, + void *gpu, uint32_t gpu_stride) +{ +#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM) + if (gpu_stride == 8) { + __asm__ volatile ( + /* Load from the GPU in one shot, no interleave, to + * d0-d7. + */ + "vldm %0, {q0, q1, q2, q3}\n" + /* Store each 8-byte line to cpu-side destination, + * incrementing it by the stride each time. + */ + "vst1.8 d0, [%1], %2\n" + "vst1.8 d1, [%1], %2\n" + "vst1.8 d2, [%1], %2\n" + "vst1.8 d3, [%1], %2\n" + "vst1.8 d4, [%1], %2\n" + "vst1.8 d5, [%1], %2\n" + "vst1.8 d6, [%1], %2\n" + "vst1.8 d7, [%1]\n" + : + : "r"(gpu), "r"(cpu), "r"(cpu_stride) + : "q0", "q1", "q2", "q3"); + } else { + assert(gpu_stride == 16); + __asm__ volatile ( + /* Load from the GPU in one shot, no interleave, to + * d0-d7. + */ + "vldm %0, {q0, q1, q2, q3};\n" + /* Store each 16-byte line in 2 parts to the cpu-side + * destination. (vst1 can only store one d-register + * at a time). + */ + "vst1.8 d0, [%1], %3\n" + "vst1.8 d1, [%2], %3\n" + "vst1.8 d2, [%1], %3\n" + "vst1.8 d3, [%2], %3\n" + "vst1.8 d4, [%1], %3\n" + "vst1.8 d5, [%2], %3\n" + "vst1.8 d6, [%1]\n" + "vst1.8 d7, [%2]\n" + : + : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) + : "q0", "q1", "q2", "q3"); + } +#elif defined (PIPE_ARCH_AARCH64) + if (gpu_stride == 8) { + __asm__ volatile ( + /* Load from the GPU in one shot, no interleave, to + * v0-v3. + */ + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" + /* Store each 8-byte line to cpu-side destination, + * incrementing it by the stride each time. 
+ */ + "st1 {v0.D}[0], [%1], %2\n" + "st1 {v0.D}[1], [%1], %2\n" + "st1 {v1.D}[0], [%1], %2\n" + "st1 {v1.D}[1], [%1], %2\n" + "st1 {v2.D}[0], [%1], %2\n" + "st1 {v2.D}[1], [%1], %2\n" + "st1 {v3.D}[0], [%1], %2\n" + "st1 {v3.D}[1], [%1]\n" + : + : "r"(gpu), "r"(cpu), "r"(cpu_stride) + : "v0", "v1", "v2", "v3"); + } else { + assert(gpu_stride == 16); + __asm__ volatile ( + /* Load from the GPU in one shot, no interleave, to + * v0-v3. + */ + "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" + /* Store each 16-byte line in 2 parts to the cpu-side + * destination. (each st1 here stores one 64-bit lane + * at a time). + */ + "st1 {v0.D}[0], [%1], %3\n" + "st1 {v0.D}[1], [%2], %3\n" + "st1 {v1.D}[0], [%1], %3\n" + "st1 {v1.D}[1], [%2], %3\n" + "st1 {v2.D}[0], [%1], %3\n" + "st1 {v2.D}[1], [%2], %3\n" + "st1 {v3.D}[0], [%1]\n" + "st1 {v3.D}[1], [%2]\n" + : + : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) + : "v0", "v1", "v2", "v3"); + } +#else + for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) { + memcpy(cpu, gpu + gpu_offset, gpu_stride); + cpu += cpu_stride; + } +#endif +} + +static inline void +v3d_store_utile(void *gpu, uint32_t gpu_stride, + void *cpu, uint32_t cpu_stride) +{ +#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM) + if (gpu_stride == 8) { + __asm__ volatile ( + /* Load each 8-byte line from cpu-side source, + * incrementing it by the stride each time. + */ + "vld1.8 d0, [%1], %2\n" + "vld1.8 d1, [%1], %2\n" + "vld1.8 d2, [%1], %2\n" + "vld1.8 d3, [%1], %2\n" + "vld1.8 d4, [%1], %2\n" + "vld1.8 d5, [%1], %2\n" + "vld1.8 d6, [%1], %2\n" + "vld1.8 d7, [%1]\n" + /* Store to the GPU in one shot, no interleave, from + * d0-d7. + */ + "vstm %0, {q0, q1, q2, q3}\n" + : + : "r"(gpu), "r"(cpu), "r"(cpu_stride) + : "q0", "q1", "q2", "q3"); + } else { + assert(gpu_stride == 16); + __asm__ volatile ( + /* Load each 16-byte line in 2 parts from the cpu-side + * source. (vld1 can only load one d-register + * at a time). 
+ */ + "vld1.8 d0, [%1], %3\n" + "vld1.8 d1, [%2], %3\n" + "vld1.8 d2, [%1], %3\n" + "vld1.8 d3, [%2], %3\n" + "vld1.8 d4, [%1], %3\n" + "vld1.8 d5, [%2], %3\n" + "vld1.8 d6, [%1]\n" + "vld1.8 d7, [%2]\n" + /* Store to the GPU in one shot, no interleave. */ + "vstm %0, {q0, q1, q2, q3}\n" + : + : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) + : "q0", "q1", "q2", "q3"); + } +#elif defined (PIPE_ARCH_AARCH64) + if (gpu_stride == 8) { + __asm__ volatile ( + /* Load each 8-byte line from cpu-side source, + * incrementing it by the stride each time. + */ + "ld1 {v0.D}[0], [%1], %2\n" + "ld1 {v0.D}[1], [%1], %2\n" + "ld1 {v1.D}[0], [%1], %2\n" + "ld1 {v1.D}[1], [%1], %2\n" + "ld1 {v2.D}[0], [%1], %2\n" + "ld1 {v2.D}[1], [%1], %2\n" + "ld1 {v3.D}[0], [%1], %2\n" + "ld1 {v3.D}[1], [%1]\n" + /* Store to the GPU in one shot, no interleave. */ + "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" + : + : "r"(gpu), "r"(cpu), "r"(cpu_stride) + : "v0", "v1", "v2", "v3"); + } else { + assert(gpu_stride == 16); + __asm__ volatile ( + /* Load each 16-byte line in 2 parts from the cpu-side + * source. (each ld1 here loads one 64-bit lane + * at a time). + */ + "ld1 {v0.D}[0], [%1], %3\n" + "ld1 {v0.D}[1], [%2], %3\n" + "ld1 {v1.D}[0], [%1], %3\n" + "ld1 {v1.D}[1], [%2], %3\n" + "ld1 {v2.D}[0], [%1], %3\n" + "ld1 {v2.D}[1], [%2], %3\n" + "ld1 {v3.D}[0], [%1]\n" + "ld1 {v3.D}[1], [%2]\n" + /* Store to the GPU in one shot, no interleave. 
*/ + "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" + : + : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) + : "v0", "v1", "v2", "v3"); + } +#else + for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) { + memcpy(gpu + gpu_offset, cpu, gpu_stride); + cpu += cpu_stride; + } +#endif +} diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c index ec42a3dc2f7..d2a84bb3540 100644 --- a/src/gallium/drivers/vc4/vc4_tiling_lt.c +++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c @@ -26,7 +26,7 @@ * Helper functions from vc4_tiling.c that will be compiled for using NEON * assembly or not. * - * If VC4_BUILD_NEON is set, then the functions will be suffixed with _neon. + * If V3D_BUILD_NEON is set, then the functions will be suffixed with _neon. * They will only use NEON assembly if __ARM_ARCH is also set, to keep the x86 * sim build working. */ @@ -34,8 +34,9 @@ #include <string.h> #include "pipe/p_state.h" #include "vc4_tiling.h" +#include "broadcom/common/v3d_cpu_tiling.h" -#ifdef VC4_BUILD_NEON +#ifdef V3D_BUILD_NEON #define NEON_TAG(x) x ## _neon #else #define NEON_TAG(x) x ## _base @@ -63,201 +64,6 @@ vc4_utile_stride(int cpp) } } -static void -vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp) -{ - uint32_t gpu_stride = vc4_utile_stride(cpp); -#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM) - if (gpu_stride == 8) { - __asm__ volatile ( - /* Load from the GPU in one shot, no interleave, to - * d0-d7. - */ - "vldm %0, {q0, q1, q2, q3}\n" - /* Store each 8-byte line to cpu-side destination, - * incrementing it by the stride each time. 
- */ - "vst1.8 d0, [%1], %2\n" - "vst1.8 d1, [%1], %2\n" - "vst1.8 d2, [%1], %2\n" - "vst1.8 d3, [%1], %2\n" - "vst1.8 d4, [%1], %2\n" - "vst1.8 d5, [%1], %2\n" - "vst1.8 d6, [%1], %2\n" - "vst1.8 d7, [%1]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu_stride) - : "q0", "q1", "q2", "q3"); - } else { - assert(gpu_stride == 16); - __asm__ volatile ( - /* Load from the GPU in one shot, no interleave, to - * d0-d7. - */ - "vldm %0, {q0, q1, q2, q3};\n" - /* Store each 16-byte line in 2 parts to the cpu-side - * destination. (vld1 can only store one d-register - * at a time). - */ - "vst1.8 d0, [%1], %3\n" - "vst1.8 d1, [%2], %3\n" - "vst1.8 d2, [%1], %3\n" - "vst1.8 d3, [%2], %3\n" - "vst1.8 d4, [%1], %3\n" - "vst1.8 d5, [%2], %3\n" - "vst1.8 d6, [%1]\n" - "vst1.8 d7, [%2]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) - : "q0", "q1", "q2", "q3"); - } -#elif defined (PIPE_ARCH_AARCH64) - if (gpu_stride == 8) { - __asm__ volatile ( - /* Load from the GPU in one shot, no interleave, to - * d0-d7. - */ - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" - /* Store each 8-byte line to cpu-side destination, - * incrementing it by the stride each time. - */ - "st1 {v0.D}[0], [%1], %2\n" - "st1 {v0.D}[1], [%1], %2\n" - "st1 {v1.D}[0], [%1], %2\n" - "st1 {v1.D}[1], [%1], %2\n" - "st1 {v2.D}[0], [%1], %2\n" - "st1 {v2.D}[1], [%1], %2\n" - "st1 {v3.D}[0], [%1], %2\n" - "st1 {v3.D}[1], [%1]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu_stride) - : "v0", "v1", "v2", "v3"); - } else { - assert(gpu_stride == 16); - __asm__ volatile ( - /* Load from the GPU in one shot, no interleave, to - * d0-d7. - */ - "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" - /* Store each 16-byte line in 2 parts to the cpu-side - * destination. (vld1 can only store one d-register - * at a time). 
- */ - "st1 {v0.D}[0], [%1], %3\n" - "st1 {v0.D}[1], [%2], %3\n" - "st1 {v1.D}[0], [%1], %3\n" - "st1 {v1.D}[1], [%2], %3\n" - "st1 {v2.D}[0], [%1], %3\n" - "st1 {v2.D}[1], [%2], %3\n" - "st1 {v3.D}[0], [%1]\n" - "st1 {v3.D}[1], [%2]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) - : "v0", "v1", "v2", "v3"); - } -#else - for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) { - memcpy(cpu, gpu + gpu_offset, gpu_stride); - cpu += cpu_stride; - } -#endif -} - -static void -vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp) -{ - uint32_t gpu_stride = vc4_utile_stride(cpp); - -#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM) - if (gpu_stride == 8) { - __asm__ volatile ( - /* Load each 8-byte line from cpu-side source, - * incrementing it by the stride each time. - */ - "vld1.8 d0, [%1], %2\n" - "vld1.8 d1, [%1], %2\n" - "vld1.8 d2, [%1], %2\n" - "vld1.8 d3, [%1], %2\n" - "vld1.8 d4, [%1], %2\n" - "vld1.8 d5, [%1], %2\n" - "vld1.8 d6, [%1], %2\n" - "vld1.8 d7, [%1]\n" - /* Load from the GPU in one shot, no interleave, to - * d0-d7. - */ - "vstm %0, {q0, q1, q2, q3}\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu_stride) - : "q0", "q1", "q2", "q3"); - } else { - assert(gpu_stride == 16); - __asm__ volatile ( - /* Load each 16-byte line in 2 parts from the cpu-side - * destination. (vld1 can only store one d-register - * at a time). - */ - "vld1.8 d0, [%1], %3\n" - "vld1.8 d1, [%2], %3\n" - "vld1.8 d2, [%1], %3\n" - "vld1.8 d3, [%2], %3\n" - "vld1.8 d4, [%1], %3\n" - "vld1.8 d5, [%2], %3\n" - "vld1.8 d6, [%1]\n" - "vld1.8 d7, [%2]\n" - /* Store to the GPU in one shot, no interleave. */ - "vstm %0, {q0, q1, q2, q3}\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) - : "q0", "q1", "q2", "q3"); - } -#elif defined (PIPE_ARCH_AARCH64) - if (gpu_stride == 8) { - __asm__ volatile ( - /* Load each 8-byte line from cpu-side source, - * incrementing it by the stride each time. 
- */ - "ld1 {v0.D}[0], [%1], %2\n" - "ld1 {v0.D}[1], [%1], %2\n" - "ld1 {v1.D}[0], [%1], %2\n" - "ld1 {v1.D}[1], [%1], %2\n" - "ld1 {v2.D}[0], [%1], %2\n" - "ld1 {v2.D}[1], [%1], %2\n" - "ld1 {v3.D}[0], [%1], %2\n" - "ld1 {v3.D}[1], [%1]\n" - /* Store to the GPU in one shot, no interleave. */ - "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu_stride) - : "v0", "v1", "v2", "v3"); - } else { - assert(gpu_stride == 16); - __asm__ volatile ( - /* Load each 16-byte line in 2 parts from the cpu-side - * destination. (vld1 can only store one d-register - * at a time). - */ - "ld1 {v0.D}[0], [%1], %3\n" - "ld1 {v0.D}[1], [%2], %3\n" - "ld1 {v1.D}[0], [%1], %3\n" - "ld1 {v1.D}[1], [%2], %3\n" - "ld1 {v2.D}[0], [%1], %3\n" - "ld1 {v2.D}[1], [%2], %3\n" - "ld1 {v3.D}[0], [%1]\n" - "ld1 {v3.D}[1], [%2]\n" - /* Store to the GPU in one shot, no interleave. */ - "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n" - : - : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) - : "v0", "v1", "v2", "v3"); - } -#else - for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) { - memcpy(gpu + gpu_offset, cpu, gpu_stride); - cpu += cpu_stride; - } -#endif - -} /** * Returns the X value into the address bits for LT tiling. 
* @@ -333,6 +139,7 @@ vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride, { uint32_t utile_w = vc4_utile_width(cpp); uint32_t utile_h = vc4_utile_height(cpp); + uint32_t utile_stride = vc4_utile_stride(cpp); uint32_t xstart = box->x; uint32_t ystart = box->y; @@ -341,15 +148,17 @@ vc4_lt_image_aligned(void *gpu, uint32_t gpu_stride, void *gpu_tile = gpu + ((ystart + y) * gpu_stride + (xstart + x) * 64 / utile_w); if (to_cpu) { - vc4_load_utile(cpu + (cpu_stride * y + + v3d_load_utile(cpu + (cpu_stride * y + x * cpp), + cpu_stride, gpu_tile, - cpu_stride, cpp); + utile_stride); } else { - vc4_store_utile(gpu_tile, + v3d_store_utile(gpu_tile, + utile_stride, cpu + (cpu_stride * y + x * cpp), - cpu_stride, cpp); + cpu_stride); } } } diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c b/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c index 7ba66ae4cdf..9efec379933 100644 --- a/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c +++ b/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c @@ -26,5 +26,5 @@ * single file. */ -#define VC4_BUILD_NEON +#define V3D_BUILD_NEON #include "vc4_tiling_lt.c" |