diff options
author | Eric Anholt <[email protected]> | 2017-01-08 14:54:57 -0800 |
---|---|---|
committer | Eric Anholt <[email protected]> | 2017-01-26 12:50:05 -0800 |
commit | 9baf1ff8fc06d8c986e55465f77427d416ecd710 (patch) | |
tree | b23674de2e88918037e75de22fdb66768005d098 /src/gallium/drivers/vc4 | |
parent | 4d30024238efa829cabc72c1601beeee18c3dbf2 (diff) |
vc4: Use NEON to speed up utile stores on Pi2+. (tags: cros-mesa-17.1.0-r2-vanilla, cros-mesa-17.1.0-r1-vanilla, chadv/cros-mesa-17.1.0-r2-vanilla, chadv/cros-mesa-17.1.0-r1-vanilla)
Improves 1024x1024 TexSubImage2D by 41.2371% +/- 3.52799% (n=10).
Diffstat (limited to 'src/gallium/drivers/vc4')
-rw-r--r-- | src/gallium/drivers/vc4/vc4_tiling_lt.c | 55 |
1 file changed, 50 insertions(+), 5 deletions(-)
/* Stores one full 64-byte utile from CPU-side linear memory into the
 * GPU-side tiled layout, using NEON on ARM builds and a plain
 * line-by-line memcpy otherwise.
 *
 * gpu:        pointer to the (contiguous) utile in the tiled buffer.
 * cpu:        pointer to the first line of the source in the linear buffer.
 * cpu_stride: byte stride between successive lines of the linear buffer.
 * cpp:        bytes per pixel; determines the utile line stride (8 or 16).
 */
static void
vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
{
        uint32_t gpu_stride = vc4_utile_stride(cpp);

#if defined(VC4_BUILD_NEON) && defined(__ARM_ARCH)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from the cpu-side source,
                         * incrementing the pointer by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store d0-d7 to the GPU in one shot, no
                         * interleave.
                         */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        /* The asm post-increments cpu, so it must be an
                         * in/out ("+r") operand, not input-only.
                         */
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        /* "memory": the asm reads and writes memory that
                         * is not described by any operand.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
        } else {
                assert(gpu_stride == 16);
                /* Second read pointer, 8 bytes into each 16-byte line:
                 * vld1 can only load one d-register at a time.
                 */
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the
                         * cpu-side source, incrementing both pointers by
                         * the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        /* Both read pointers are post-incremented, so
                         * both are in/out operands.
                         */
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
        }
#else
        /* Portable fallback: copy one utile line at a time. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
#endif
}