diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/mesa/drivers/dri/i965/Makefile.am | 7 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/Makefile.sources | 6 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 62 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/meson.build | 18 |
4 files changed, 88 insertions, 5 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am index 889d4c68a2b..ff47add93f4 100644 --- a/src/mesa/drivers/dri/i965/Makefile.am +++ b/src/mesa/drivers/dri/i965/Makefile.am @@ -92,8 +92,14 @@ libi965_gen11_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=110 noinst_LTLIBRARIES = \ libi965_dri.la \ + libintel_tiled_memcpy.la \ $(I965_PERGEN_LIBS) +libintel_tiled_memcpy_la_SOURCES = \ + $(intel_tiled_memcpy_FILES) +libintel_tiled_memcpy_la_CFLAGS = \ + $(AM_CFLAGS) $(SSE41_CFLAGS) + libi965_dri_la_SOURCES = \ $(i965_FILES) \ $(i965_oa_GENERATED_FILES) @@ -104,6 +110,7 @@ libi965_dri_la_LIBADD = \ $(top_builddir)/src/intel/compiler/libintel_compiler.la \ $(top_builddir)/src/intel/blorp/libblorp.la \ $(I965_PERGEN_LIBS) \ + libintel_tiled_memcpy.la $(LIBDRM_LIBS) BUILT_SOURCES = $(i965_oa_GENERATED_FILES) diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index db6591ab90a..ce7633c53c4 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -110,11 +110,13 @@ i965_FILES = \ intel_tex_image.c \ intel_tex_obj.h \ intel_tex_validate.c \ - intel_tiled_memcpy.c \ - intel_tiled_memcpy.h \ intel_upload.c \ libdrm_macros.h +intel_tiled_memcpy_FILES = \ + intel_tiled_memcpy.c \ + intel_tiled_memcpy.h + i965_gen4_FILES = \ genX_blorp_exec.c \ genX_state_upload.c diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c index 7c6bde990d6..fac5427d2ed 100644 --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c @@ -36,6 +36,10 @@ #include "brw_context.h" #include "intel_tiled_memcpy.h" +#if defined(USE_SSE41) +#include "main/streaming-load-memcpy.h" +#include <smmintrin.h> +#endif #if defined(__SSSE3__) #include <tmmintrin.h> #elif defined(__SSE2__) @@ -213,6 +217,31 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes) return dst; } +#if defined(USE_SSE41) +static ALWAYS_INLINE void * +_memcpy_streaming_load(void *dest, const void *src, size_t count) +{ + if (count == 16) { + __m128i val = _mm_stream_load_si128((__m128i *)src); + _mm_store_si128((__m128i *)dest, val); + return dest; + } else if (count == 64) { + __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0); + __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1); + __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2); + __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3); + _mm_store_si128(((__m128i *)dest) + 0, val0); + _mm_store_si128(((__m128i *)dest) + 1, val1); + _mm_store_si128(((__m128i *)dest) + 2, val2); + _mm_store_si128(((__m128i *)dest) + 3, val3); + return dest; + } else { + assert(count < 64); /* and (count < 16) for ytiled */ + return memcpy(dest, src, count); + } +} +#endif + /** * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3). * These ranges are in bytes, i.e. pixels * bytes-per-pixel. @@ -677,6 +706,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, dst, src, dst_pitch, swizzle_bit, rgba8_copy, rgba8_copy_aligned_src); +#if defined(USE_SSE41) + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) + return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, + dst, src, dst_pitch, swizzle_bit, + memcpy, _memcpy_streaming_load); +#endif else unreachable("not reached"); } else { @@ -687,6 +722,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, return xtiled_to_linear(x0, x1, x2, x3, y0, y1, dst, src, dst_pitch, swizzle_bit, rgba8_copy, rgba8_copy_aligned_src); +#if defined(USE_SSE41) + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) + return xtiled_to_linear(x0, x1, x2, x3, y0, y1, + dst, src, dst_pitch, swizzle_bit, + memcpy, _memcpy_streaming_load); +#endif else unreachable("not reached"); } @@ -719,6 +760,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, dst, src, dst_pitch, swizzle_bit, rgba8_copy, rgba8_copy_aligned_src); +#if defined(USE_SSE41) + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) + return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, + dst, src, dst_pitch, swizzle_bit, + memcpy, _memcpy_streaming_load); +#endif else unreachable("not reached"); } else { @@ -729,6 +776,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, return ytiled_to_linear(x0, x1, x2, x3, y0, y1, dst, src, dst_pitch, swizzle_bit, rgba8_copy, rgba8_copy_aligned_src); +#if defined(USE_SSE41) + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) + return ytiled_to_linear(x0, x1, x2, x3, y0, y1, + dst, src, dst_pitch, swizzle_bit, + memcpy, _memcpy_streaming_load); +#endif else unreachable("not reached"); } @@ -868,6 +921,15 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2, unreachable("unsupported tiling"); } +#if defined(USE_SSE41) + if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) { + /* The hidden cacheline sized register used by movntdqa can apparently + * give you stale data, so do an mfence to invalidate it. + */ + _mm_mfence(); + } +#endif + /* Round out to tile boundaries. */ xt0 = ALIGN_DOWN(xt1, tw); xt3 = ALIGN_UP (xt2, tw); diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build index 20404d5b059..1eac329f49c 100644 --- a/src/mesa/drivers/dri/i965/meson.build +++ b/src/mesa/drivers/dri/i965/meson.build @@ -129,12 +129,15 @@ files_i965 = files( 'intel_tex_image.c', 'intel_tex_obj.h', 'intel_tex_validate.c', - 'intel_tiled_memcpy.c', - 'intel_tiled_memcpy.h', 'intel_upload.c', 'libdrm_macros.h', ) +files_intel_tiled_memcpy = files( + 'intel_tiled_memcpy.c', + 'intel_tiled_memcpy.h', +) + i965_gen_libs = [] foreach v : ['40', '45', '50', '60', '70', '75', '80', '90', '100', '110'] i965_gen_libs += static_library( @@ -176,6 +179,15 @@ i965_oa_sources = custom_target( ], ) +intel_tiled_memcpy = static_library( + 'intel_tiled_memcpy', + [files_intel_tiled_memcpy], + include_directories : [ + inc_common, inc_intel, inc_dri_common, inc_drm_uapi, + ], + c_args : [c_vis_args, no_override_init_args, '-msse2', sse41_args], +) + libi965 = static_library( 'i965', [files_i965, i965_oa_sources, ir_expression_operation_h, @@ -187,7 +199,7 @@ libi965 = static_library( cpp_args : [cpp_vis_args, '-msse2'], link_with : [ i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler, - libblorp, + libblorp, intel_tiled_memcpy, ], dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers], ) |