summaryrefslogtreecommitdiffstats
path: root/src/gallium/auxiliary/util
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/auxiliary/util')
-rw-r--r--src/gallium/auxiliary/util/u_dl.c3
-rw-r--r--src/gallium/auxiliary/util/u_format.csv3
-rw-r--r--src/gallium/auxiliary/util/u_format_zs.c53
-rw-r--r--src/gallium/auxiliary/util/u_format_zs.h16
-rw-r--r--src/gallium/auxiliary/util/u_math.h5
-rw-r--r--src/gallium/auxiliary/util/u_pack_color.h50
-rw-r--r--src/gallium/auxiliary/util/u_sse.h154
-rw-r--r--src/gallium/auxiliary/util/u_tile.c84
8 files changed, 344 insertions, 24 deletions
diff --git a/src/gallium/auxiliary/util/u_dl.c b/src/gallium/auxiliary/util/u_dl.c
index 220860ebf4b..aca435d6cad 100644
--- a/src/gallium/auxiliary/util/u_dl.c
+++ b/src/gallium/auxiliary/util/u_dl.c
@@ -38,6 +38,7 @@
#endif
#include "u_dl.h"
+#include "u_pointer.h"
struct util_dl_library *
@@ -58,7 +59,7 @@ util_dl_get_proc_address(struct util_dl_library *library,
const char *procname)
{
#if defined(PIPE_OS_UNIX)
- return (util_dl_proc)dlsym((void *)library, procname);
+ return (util_dl_proc) pointer_to_func(dlsym((void *)library, procname));
#elif defined(PIPE_OS_WINDOWS)
return (util_dl_proc)GetProcAddress((HMODULE)library, procname);
#else
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index 0811280b97b..8e5d4487a67 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -109,9 +109,12 @@ PIPE_FORMAT_Z32_UNORM , plain, 1, 1, un32, , , , x___,
PIPE_FORMAT_Z32_FLOAT , plain, 1, 1, f32 , , , , x___, zs
PIPE_FORMAT_Z24_UNORM_S8_USCALED , plain, 1, 1, un24, u8 , , , xy__, zs
PIPE_FORMAT_S8_USCALED_Z24_UNORM , plain, 1, 1, u8 , un24, , , yx__, zs
+PIPE_FORMAT_X24S8_USCALED , plain, 1, 1, x24, u8 , , , _y__, zs
+PIPE_FORMAT_S8X24_USCALED , plain, 1, 1, u8 , x24 , , , _x__, zs
PIPE_FORMAT_Z24X8_UNORM , plain, 1, 1, un24, x8 , , , x___, zs
PIPE_FORMAT_X8Z24_UNORM , plain, 1, 1, x8 , un24, , , y___, zs
PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED , plain, 1, 1, f32, u8 , x24 , , xy__, zs
+PIPE_FORMAT_X32_S8X24_USCALED , plain, 1, 1, x32, u8 , x24 , , _y__, zs
# YUV formats
# http://www.fourcc.org/yuv.php#UYVY
diff --git a/src/gallium/auxiliary/util/u_format_zs.c b/src/gallium/auxiliary/util/u_format_zs.c
index 792d69c214c..80081e22f7c 100644
--- a/src/gallium/auxiliary/util/u_format_zs.c
+++ b/src/gallium/auxiliary/util/u_format_zs.c
@@ -918,3 +918,56 @@ util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned d
}
}
+
+void
+util_format_x24s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+ util_format_z24_unorm_s8_uscaled_unpack_s_8uscaled(dst_row, dst_stride,
+ src_row, src_stride,
+ width, height);
+}
+
+void
+util_format_x24s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+ util_format_z24_unorm_s8_uscaled_pack_s_8uscaled(dst_row, dst_stride,
+ src_row, src_stride,
+ width, height);
+}
+
+void
+util_format_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+ util_format_s8_uscaled_z24_unorm_unpack_s_8uscaled(dst_row, dst_stride,
+ src_row, src_stride,
+ width, height);
+}
+
+void
+util_format_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+ util_format_s8_uscaled_z24_unorm_pack_s_8uscaled(dst_row, dst_stride,
+ src_row, src_stride,
+ width, height);
+}
+
+void
+util_format_x32_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+ const uint8_t *src_row, unsigned src_stride,
+ unsigned width, unsigned height)
+{
+ util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(dst_row, dst_stride,
+ src_row, src_stride,
+ width, height);
+
+}
+
+void
+util_format_x32_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+ const uint8_t *src_row, unsigned src_stride,
+ unsigned width, unsigned height)
+{
+ util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(dst_row, dst_stride,
+ src_row, src_stride,
+ width, height);
+}
diff --git a/src/gallium/auxiliary/util/u_format_zs.h b/src/gallium/auxiliary/util/u_format_zs.h
index 650db4b95fd..1604cc3eee2 100644
--- a/src/gallium/auxiliary/util/u_format_zs.h
+++ b/src/gallium/auxiliary/util/u_format_zs.h
@@ -192,5 +192,21 @@ util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned
void
util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+void
+util_format_x24s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_x24s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+void
+util_format_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_x32_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_x32_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_sride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
#endif /* U_FORMAT_ZS_H_ */
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 69a76814945..37294b7203f 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -118,6 +118,11 @@ __inline double __cdecl atan2(double val)
#endif
+#ifndef M_SQRT2
+#define M_SQRT2 1.41421356237309504880
+#endif
+
+
#if defined(_MSC_VER)
#if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index c90b0fdbc3f..5378f2d782f 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -434,8 +434,8 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color *
/* Integer versions of util_pack_z and util_pack_z_stencil - useful for
* constructing clear masks.
*/
-static INLINE uint
-util_pack_uint_z(enum pipe_format format, unsigned z)
+static INLINE uint32_t
+util_pack_mask_z(enum pipe_format format, uint32_t z)
{
switch (format) {
case PIPE_FORMAT_Z16_UNORM:
@@ -452,29 +452,32 @@ util_pack_uint_z(enum pipe_format format, unsigned z)
case PIPE_FORMAT_S8_USCALED:
return 0;
default:
- debug_print_format("gallium: unhandled format in util_pack_z()", format);
+ debug_print_format("gallium: unhandled format in util_pack_mask_z()", format);
assert(0);
return 0;
}
}
-static INLINE uint
-util_pack_uint_z_stencil(enum pipe_format format, double z, uint s)
+static INLINE uint32_t
+util_pack_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s)
{
- unsigned packed = util_pack_uint_z(format, z);
-
- s &= 0xff;
+ uint32_t packed = util_pack_mask_z(format, z);
switch (format) {
case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
- return packed | (s << 24);
+ packed |= (uint32_t)s << 24;
+ break;
case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
- return packed | s;
+ packed |= s;
+ break;
case PIPE_FORMAT_S8_USCALED:
- return packed | s;
+ packed |= s;
+ break;
default:
- return packed;
+ break;
}
+
+ return packed;
}
@@ -482,9 +485,11 @@ util_pack_uint_z_stencil(enum pipe_format format, double z, uint s)
/**
* Note: it's assumed that z is in [0,1]
*/
-static INLINE uint
+static INLINE uint32_t
util_pack_z(enum pipe_format format, double z)
{
+ union fi fui;
+
if (z == 0.0)
return 0;
@@ -492,24 +497,25 @@ util_pack_z(enum pipe_format format, double z)
case PIPE_FORMAT_Z16_UNORM:
if (z == 1.0)
return 0xffff;
- return (uint) (z * 0xffff);
+ return (uint32_t) (z * 0xffff);
case PIPE_FORMAT_Z32_UNORM:
/* special-case to avoid overflow */
if (z == 1.0)
return 0xffffffff;
- return (uint) (z * 0xffffffff);
+ return (uint32_t) (z * 0xffffffff);
case PIPE_FORMAT_Z32_FLOAT:
- return (uint)z;
+ fui.f = (float)z;
+ return fui.ui;
case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
case PIPE_FORMAT_Z24X8_UNORM:
if (z == 1.0)
return 0xffffff;
- return (uint) (z * 0xffffff);
+ return (uint32_t) (z * 0xffffff);
case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
case PIPE_FORMAT_X8Z24_UNORM:
if (z == 1.0)
return 0xffffff00;
- return ((uint) (z * 0xffffff)) << 8;
+ return ((uint32_t) (z * 0xffffff)) << 8;
case PIPE_FORMAT_S8_USCALED:
/* this case can get it via util_pack_z_stencil() */
return 0;
@@ -525,14 +531,14 @@ util_pack_z(enum pipe_format format, double z)
* Pack Z and/or stencil values into a 32-bit value described by format.
* Note: it's assumed that z is in [0,1] and s in [0,255]
*/
-static INLINE uint
-util_pack_z_stencil(enum pipe_format format, double z, uint s)
+static INLINE uint32_t
+util_pack_z_stencil(enum pipe_format format, double z, uint8_t s)
{
- unsigned packed = util_pack_z(format, z);
+ uint32_t packed = util_pack_z(format, z);
switch (format) {
case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
- packed |= s << 24;
+ packed |= (uint32_t)s << 24;
break;
case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
packed |= s;
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 03198c91da4..1df6c872677 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -71,6 +71,96 @@ _mm_castps_si128(__m128 a)
#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
+union m128i {
+ __m128i m;
+ ubyte ub[16];
+ ushort us[8];
+ uint ui[4];
+};
+
+static INLINE void u_print_epi8(const char *name, __m128i r)
+{
+ union { __m128i m; ubyte ub[16]; } u;
+ u.m = r;
+
+ debug_printf("%s: "
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x/"
+ "%02x\n",
+ name,
+ u.ub[0], u.ub[1], u.ub[2], u.ub[3],
+ u.ub[4], u.ub[5], u.ub[6], u.ub[7],
+ u.ub[8], u.ub[9], u.ub[10], u.ub[11],
+ u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
+}
+
+static INLINE void u_print_epi16(const char *name, __m128i r)
+{
+ union { __m128i m; ushort us[8]; } u;
+ u.m = r;
+
+ debug_printf("%s: "
+ "%04x/"
+ "%04x/"
+ "%04x/"
+ "%04x/"
+ "%04x/"
+ "%04x/"
+ "%04x/"
+ "%04x\n",
+ name,
+ u.us[0], u.us[1], u.us[2], u.us[3],
+ u.us[4], u.us[5], u.us[6], u.us[7]);
+}
+
+static INLINE void u_print_epi32(const char *name, __m128i r)
+{
+ union { __m128i m; uint ui[4]; } u;
+ u.m = r;
+
+ debug_printf("%s: "
+ "%08x/"
+ "%08x/"
+ "%08x/"
+ "%08x\n",
+ name,
+ u.ui[0], u.ui[1], u.ui[2], u.ui[3]);
+}
+
+static INLINE void u_print_ps(const char *name, __m128 r)
+{
+ union { __m128 m; float f[4]; } u;
+ u.m = r;
+
+ debug_printf("%s: "
+ "%f/"
+ "%f/"
+ "%f/"
+ "%f\n",
+ name,
+ u.f[0], u.f[1], u.f[2], u.f[3]);
+}
+
+
+#define U_DUMP_EPI32(a) u_print_epi32(#a, a)
+#define U_DUMP_EPI16(a) u_print_epi16(#a, a)
+#define U_DUMP_EPI8(a) u_print_epi8(#a, a)
+#define U_DUMP_PS(a) u_print_ps(#a, a)
+
+
#if defined(PIPE_ARCH_SSSE3)
@@ -98,6 +188,68 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
#endif /* !PIPE_ARCH_SSSE3 */
-#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
+
+
+/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
+ * _mm_mul_epu32().
+ *
+ * I suspect this works fine for us because one of our operands is
+ * always positive, but not sure that this can be used for general
+ * signed integer multiplication.
+ *
+ * This seems close enough to the speed of SSE4 and the real
+ * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
+ * dependency at this point.
+ */
+static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
+{
+ __m128i a4 = _mm_srli_epi64(a, 32); /* shift by one dword */
+ __m128i b4 = _mm_srli_epi64(b, 32); /* shift by one dword */
+ __m128i ba = _mm_mul_epu32(b, a); /* multply dwords 0, 2 */
+ __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */
+
+ /* Interleave the results, either with shuffles or (slightly
+ * faster) direct bit operations:
+ */
+#if 0
+ __m128i ba8 = _mm_shuffle_epi32(ba, 8);
+ __m128i b4a48 = _mm_shuffle_epi32(b4a4, 8);
+ __m128i result = _mm_unpacklo_epi32(ba8, b4a48);
+#else
+ __m128i mask = _mm_setr_epi32(~0,0,~0,0);
+ __m128i ba_mask = _mm_and_si128(ba, mask);
+ __m128i b4a4_mask_shift = _mm_slli_epi64(b4a4, 32);
+ __m128i result = _mm_or_si128(ba_mask, b4a4_mask_shift);
+#endif
+
+ return result;
+}
+
+
+static INLINE void
+transpose4_epi32(const __m128i * restrict a,
+ const __m128i * restrict b,
+ const __m128i * restrict c,
+ const __m128i * restrict d,
+ __m128i * restrict o,
+ __m128i * restrict p,
+ __m128i * restrict q,
+ __m128i * restrict r)
+{
+ __m128i t0 = _mm_unpacklo_epi32(*a, *b);
+ __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+ __m128i t2 = _mm_unpackhi_epi32(*a, *b);
+ __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+
+ *o = _mm_unpacklo_epi64(t0, t1);
+ *p = _mm_unpackhi_epi64(t0, t1);
+ *q = _mm_unpacklo_epi64(t2, t3);
+ *r = _mm_unpackhi_epi64(t2, t3);
+}
+
+#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
+
+
+#endif /* PIPE_ARCH_SSE */
#endif /* U_SSE_H_ */
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index f7aa1403d08..44cadbfcdd0 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -217,6 +217,81 @@ z24s8_get_tile_rgba(const unsigned *src,
}
}
+/*** PIPE_FORMAT_S8X24_USCALED ***/
+
+/**
+ * Return S component as four uint32_t in [0..255]. Z part ignored.
+ */
+static void
+s8x24_get_tile_rgba(const unsigned *src,
+ unsigned w, unsigned h,
+ float *p,
+ unsigned dst_stride)
+{
+ unsigned i, j;
+
+ for (i = 0; i < h; i++) {
+ float *pRow = p;
+
+ for (j = 0; j < w; j++, pRow += 4) {
+ pRow[0] =
+ pRow[1] =
+ pRow[2] =
+ pRow[3] = (float)((*src++ >> 24) & 0xff);
+ }
+
+ p += dst_stride;
+ }
+}
+
+/*** PIPE_FORMAT_X24S8_USCALED ***/
+
+/**
+ * Return S component as four uint32_t in [0..255]. Z part ignored.
+ */
+static void
+x24s8_get_tile_rgba(const unsigned *src,
+ unsigned w, unsigned h,
+ float *p,
+ unsigned dst_stride)
+{
+ unsigned i, j;
+
+ for (i = 0; i < h; i++) {
+ float *pRow = p;
+ for (j = 0; j < w; j++, pRow += 4) {
+ pRow[0] =
+ pRow[1] =
+ pRow[2] =
+ pRow[3] = (float)(*src++ & 0xff);
+ }
+ p += dst_stride;
+ }
+}
+
+
+/**
+ * Return S component as four uint32_t in [0..255]. Z part ignored.
+ */
+static void
+s8_get_tile_rgba(const unsigned char *src,
+ unsigned w, unsigned h,
+ float *p,
+ unsigned dst_stride)
+{
+ unsigned i, j;
+
+ for (i = 0; i < h; i++) {
+ float *pRow = p;
+ for (j = 0; j < w; j++, pRow += 4) {
+ pRow[0] =
+ pRow[1] =
+ pRow[2] =
+ pRow[3] = (float)(*src++ & 0xff);
+ }
+ p += dst_stride;
+ }
+}
/*** PIPE_FORMAT_Z32_FLOAT ***/
@@ -261,10 +336,19 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
case PIPE_FORMAT_Z24X8_UNORM:
s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
break;
+ case PIPE_FORMAT_S8_USCALED:
+ s8_get_tile_rgba((unsigned char *) src, w, h, dst, dst_stride);
+ break;
+ case PIPE_FORMAT_X24S8_USCALED:
+ s8x24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+ break;
case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
case PIPE_FORMAT_X8Z24_UNORM:
z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
break;
+ case PIPE_FORMAT_S8X24_USCALED:
+ x24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+ break;
case PIPE_FORMAT_Z32_FLOAT:
z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride);
break;