summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/mesa/drivers/dri/i965/Makefile.am7
-rw-r--r--src/mesa/drivers/dri/i965/Makefile.sources6
-rw-r--r--src/mesa/drivers/dri/i965/intel_tiled_memcpy.c62
-rw-r--r--src/mesa/drivers/dri/i965/meson.build18
4 files changed, 88 insertions, 5 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index 889d4c68a2b..ff47add93f4 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -92,8 +92,14 @@ libi965_gen11_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=110
noinst_LTLIBRARIES = \
libi965_dri.la \
+ libintel_tiled_memcpy.la \
$(I965_PERGEN_LIBS)
+libintel_tiled_memcpy_la_SOURCES = \
+ $(intel_tiled_memcpy_FILES)
+libintel_tiled_memcpy_la_CFLAGS = \
+ $(AM_CFLAGS) $(SSE41_CFLAGS)
+
libi965_dri_la_SOURCES = \
$(i965_FILES) \
$(i965_oa_GENERATED_FILES)
@@ -104,6 +110,7 @@ libi965_dri_la_LIBADD = \
$(top_builddir)/src/intel/compiler/libintel_compiler.la \
$(top_builddir)/src/intel/blorp/libblorp.la \
$(I965_PERGEN_LIBS) \
+ libintel_tiled_memcpy.la \
$(LIBDRM_LIBS)
BUILT_SOURCES = $(i965_oa_GENERATED_FILES)
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index db6591ab90a..ce7633c53c4 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -110,11 +110,13 @@ i965_FILES = \
intel_tex_image.c \
intel_tex_obj.h \
intel_tex_validate.c \
- intel_tiled_memcpy.c \
- intel_tiled_memcpy.h \
intel_upload.c \
libdrm_macros.h
+intel_tiled_memcpy_FILES = \
+ intel_tiled_memcpy.c \
+ intel_tiled_memcpy.h
+
i965_gen4_FILES = \
genX_blorp_exec.c \
genX_state_upload.c
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 7c6bde990d6..fac5427d2ed 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -36,6 +36,10 @@
#include "brw_context.h"
#include "intel_tiled_memcpy.h"
+#if defined(USE_SSE41)
+#include "main/streaming-load-memcpy.h"
+#include <smmintrin.h>
+#endif
#if defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE2__)
@@ -213,6 +217,31 @@ rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
return dst;
}
+#if defined(USE_SSE41)
+/**
+ * Copy \p count bytes from \p src to \p dest, using streaming loads
+ * (_mm_stream_load_si128) for the two sizes that get a dedicated path.
+ *
+ * - count == 16: one streaming load followed by one aligned store.
+ * - count == 64: four streaming loads, then four aligned stores.
+ * - anything else: must be smaller than 64 bytes (asserted) and is handed
+ *   to plain memcpy().
+ *
+ * NOTE(review): the 16- and 64-byte paths use the aligned 128-bit
+ * load/store intrinsics, so both pointers are presumably 16-byte aligned
+ * whenever those sizes are requested -- confirm against the callers.
+ *
+ * \return \p dest, like memcpy().
+ */
+static ALWAYS_INLINE void *
+_memcpy_streaming_load(void *dest, const void *src, size_t count)
+{
+   __m128i *out = (__m128i *)dest;
+   __m128i *in = (__m128i *)src;
+
+   switch (count) {
+   case 16:
+      _mm_store_si128(out, _mm_stream_load_si128(in));
+      return dest;
+   case 64: {
+      /* Issue every load before the first store. */
+      __m128i q0 = _mm_stream_load_si128(in + 0);
+      __m128i q1 = _mm_stream_load_si128(in + 1);
+      __m128i q2 = _mm_stream_load_si128(in + 2);
+      __m128i q3 = _mm_stream_load_si128(in + 3);
+      _mm_store_si128(out + 0, q0);
+      _mm_store_si128(out + 1, q1);
+      _mm_store_si128(out + 2, q2);
+      _mm_store_si128(out + 3, q3);
+      return dest;
+   }
+   default:
+      assert(count < 64); /* and (count < 16) for ytiled */
+      return memcpy(dest, src, count);
+   }
+}
+#endif
+
/**
* Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
* These ranges are in bytes, i.e. pixels * bytes-per-pixel.
@@ -677,6 +706,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
+ dst, src, dst_pitch, swizzle_bit,
+ memcpy, _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
} else {
@@ -687,6 +722,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
+ dst, src, dst_pitch, swizzle_bit,
+ memcpy, _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
}
@@ -719,6 +760,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
+ dst, src, dst_pitch, swizzle_bit,
+ memcpy, _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
} else {
@@ -729,6 +776,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
dst, src, dst_pitch, swizzle_bit,
rgba8_copy, rgba8_copy_aligned_src);
+#if defined(USE_SSE41)
+ else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy)
+ return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
+ dst, src, dst_pitch, swizzle_bit,
+ memcpy, _memcpy_streaming_load);
+#endif
else
unreachable("not reached");
}
@@ -868,6 +921,15 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
unreachable("unsupported tiling");
}
+#if defined(USE_SSE41)
+ if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) {
+ /* The hidden cacheline sized register used by movntdqa can apparently
+ * give you stale data, so do an mfence to invalidate it.
+ */
+ _mm_mfence();
+ }
+#endif
+
/* Round out to tile boundaries. */
xt0 = ALIGN_DOWN(xt1, tw);
xt3 = ALIGN_UP (xt2, tw);
diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build
index 20404d5b059..1eac329f49c 100644
--- a/src/mesa/drivers/dri/i965/meson.build
+++ b/src/mesa/drivers/dri/i965/meson.build
@@ -129,12 +129,15 @@ files_i965 = files(
'intel_tex_image.c',
'intel_tex_obj.h',
'intel_tex_validate.c',
- 'intel_tiled_memcpy.c',
- 'intel_tiled_memcpy.h',
'intel_upload.c',
'libdrm_macros.h',
)
+files_intel_tiled_memcpy = files(
+ 'intel_tiled_memcpy.c',
+ 'intel_tiled_memcpy.h',
+)
+
i965_gen_libs = []
foreach v : ['40', '45', '50', '60', '70', '75', '80', '90', '100', '110']
i965_gen_libs += static_library(
@@ -176,6 +179,15 @@ i965_oa_sources = custom_target(
],
)
+# intel_tiled_memcpy.c is built as its own static library and linked into
+# libi965 through link_with. It is the target that receives sse41_args;
+# presumably this keeps the SSE4.1 compiler flags confined to this one
+# source file -- confirm against the libi965 c_args.
+intel_tiled_memcpy = static_library(
+ 'intel_tiled_memcpy',
+ [files_intel_tiled_memcpy],
+ include_directories : [
+ inc_common, inc_intel, inc_dri_common, inc_drm_uapi,
+ ],
+ c_args : [c_vis_args, no_override_init_args, '-msse2', sse41_args],
+)
+
libi965 = static_library(
'i965',
[files_i965, i965_oa_sources, ir_expression_operation_h,
@@ -187,7 +199,7 @@ libi965 = static_library(
cpp_args : [cpp_vis_args, '-msse2'],
link_with : [
i965_gen_libs, libintel_common, libintel_dev, libisl, libintel_compiler,
- libblorp,
+ libblorp, intel_tiled_memcpy,
],
dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
)