summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/vc4/vc4_tiling.h
diff options
context:
space:
mode:
authorEric Anholt <[email protected]>2017-01-05 15:11:30 -0800
committerEric Anholt <[email protected]>2017-01-26 12:48:10 -0800
commit4d30024238efa829cabc72c1601beeee18c3dbf2 (patch)
tree09f04f006eb015b3cc5940eddde461519114f77a /src/gallium/drivers/vc4/vc4_tiling.h
parent347b69e7d74f61f3b08853ccdfad72bdae683e12 (diff)
vc4: Use NEON to speed up utile loads on Pi2.
We had a lot of memcpy call overhead because gpu_stride wasn't being inlined. But if you split out the stride==8 and stride==16 cases like this code does while still using memcpy, you'd no longer have glibc's NEON memcpy applied, at which point we'd be doing 16 uncached reads instead of 64/(NEON memcpy granularity), for about a 30% performance hit. By hand-writing the assembly, we can get a whole cacheline loaded at a time. Unfortunately, NEON intrinsics turned out to be unusable -- they didn't have the vldm instruction available. Note that, for now, the NEON code is only enabled when building for ARMv7 (Pi 2+). We may want to do runtime detection for the Raspbian case in the future. Improves 1024x1024 GetTexImage by 208.256% +/- 7.07029% (n=10).
Diffstat (limited to 'src/gallium/drivers/vc4/vc4_tiling.h')
-rw-r--r--src/gallium/drivers/vc4/vc4_tiling.h48
1 files changed, 42 insertions, 6 deletions
diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h
index ec66cf9476a..218130b2007 100644
--- a/src/gallium/drivers/vc4/vc4_tiling.h
+++ b/src/gallium/drivers/vc4/vc4_tiling.h
@@ -62,12 +62,18 @@ vc4_utile_height(int cpp)
}
bool vc4_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST;
-void vc4_load_lt_image(void *dst, uint32_t dst_stride,
- void *src, uint32_t src_stride,
- int cpp, const struct pipe_box *box);
-void vc4_store_lt_image(void *dst, uint32_t dst_stride,
- void *src, uint32_t src_stride,
- int cpp, const struct pipe_box *box);
+void vc4_load_lt_image_base(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box);
+void vc4_store_lt_image_base(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box);
+void vc4_load_lt_image_neon(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box);
+void vc4_store_lt_image_neon(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box);
void vc4_load_tiled_image(void *dst, uint32_t dst_stride,
void *src, uint32_t src_stride,
uint8_t tiling_format, int cpp,
@@ -77,4 +83,34 @@ void vc4_store_tiled_image(void *dst, uint32_t dst_stride,
uint8_t tiling_format, int cpp,
const struct pipe_box *box);
+/* Pick which implementation the vc4_{load,store}_lt_image() wrappers call.
+ *
+ * If we're building for ARMv7 or later (Pi 2+), assume the CPU has NEON and
+ * use the _neon variants; otherwise use the _base variants.  For Raspbian
+ * (built for ARMv6) we should extend this with some runtime detection, so
+ * that the NEON path can still be chosen when running on a Pi 2+.
+ */
+#if defined(__ARM_ARCH) && __ARM_ARCH >= 7
+#define NEON_SUFFIX(x) x ## _neon
+#else
+#define NEON_SUFFIX(x) x ## _base
+#endif
+/* Entry point for LT image loads.  Forwards all arguments unchanged to
+ * vc4_load_lt_image_neon() when building for ARMv7+, or to
+ * vc4_load_lt_image_base() otherwise; the choice is made at compile time
+ * through NEON_SUFFIX.
+ */
+static inline void
+vc4_load_lt_image(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box)
+{
+ NEON_SUFFIX(vc4_load_lt_image)(dst, dst_stride, src, src_stride,
+ cpp, box);
+}
+/* Entry point for LT image stores.  Forwards all arguments unchanged to
+ * vc4_store_lt_image_neon() when building for ARMv7+, or to
+ * vc4_store_lt_image_base() otherwise; the choice is made at compile time
+ * through NEON_SUFFIX.
+ */
+static inline void
+vc4_store_lt_image(void *dst, uint32_t dst_stride,
+ void *src, uint32_t src_stride,
+ int cpp, const struct pipe_box *box)
+{
+ NEON_SUFFIX(vc4_store_lt_image)(dst, dst_stride, src, src_stride,
+ cpp, box);
+}
+/* NEON_SUFFIX is only a helper for the inline wrappers in this header. */
+#undef NEON_SUFFIX
+
#endif /* VC4_TILING_H */