summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/swr
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/swr')
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdintrin.h51
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdlib.hpp333
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl16
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl15
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp96
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp2
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.cpp1
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.h1
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/context.h1
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/pa.h2
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp2
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder.cpp1
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder.h1
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp5
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h1
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp10
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h3
17 files changed, 49 insertions, 492 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index df5c3ac6056..ebb4f4b7f11 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -191,57 +191,6 @@ SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD12
SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
}
-SIMDINLINE
-void _simd_mov(simdscalar& r, unsigned int rlane, simdscalar& s, unsigned int slane)
-{
- OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
- SIMD256::store_ps(rArray, r);
- SIMD256::store_ps(sArray, s);
- rArray[rlane] = sArray[slane];
- r = SIMD256::load_ps(rArray);
-}
-
-// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
-#define _simdvec_load_ps SIMD::vec4_load1_ps
-
-SIMDINLINE
-void _simdvec_mov(simdvector& r, const simdscalar& s)
-{
- SIMD::vec4_set1_vps(r, s);
-}
-
-SIMDINLINE
-void _simdvec_mov(simdvector& r, const simdvector& v)
-{
- r = v;
-}
-
-#if 0
-// just move a lane from the source simdvector to dest simdvector
-SIMDINLINE
-void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
-{
- _simd_mov(r[0], rlane, s[0], slane);
- _simd_mov(r[1], rlane, s[1], slane);
- _simd_mov(r[2], rlane, s[2], slane);
- _simd_mov(r[3], rlane, s[3], slane);
-}
-
-#endif
-
-#define _simdvec_dp3_ps SIMD::vec4_dp3_ps
-#define _simdvec_dp4_ps SIMD::vec4_dp4_ps
-#define _simdvec_rcp_length_ps SIMD::vec4_rcp_length_ps
-#define _simdvec_normalize_ps SIMD::vec4_normalize_ps
-#define _simdvec_mul_ps SIMD::vec4_mul_ps
-#define _simdvec_add_ps SIMD::vec4_add_ps
-#define _simdvec_min_ps SIMD::vec4_min_ps
-#define _simdvec_max_ps SIMD::vec4_max_ps
-#define _simd_mat4x4_vec4_multiply SIMD::mat4x4_vec4_multiply
-#define _simd_mat3x3_vec3_w0_multiply SIMD::mat3x3_vec3_w0_multiply
-#define _simd_mat4x4_vec3_w1_multiply SIMD::mat4x4_vec3_w1_multiply
-#define _simd_mat4x3_vec3_w1_multiply SIMD::mat4x3_vec3_w1_multiply
-
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
SIMDINLINE simdscalar vplaneps(simdscalar const& vA,
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
index 153e2af7eae..53793ba101c 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -209,339 +209,6 @@ struct SIMDBase : Traits::IsaImpl
using Integer = typename Traits::Integer;
using Vec4 = typename Traits::Vec4;
using Mask = typename Traits::Mask;
-
- static const size_t VECTOR_BYTES = sizeof(Float);
-
- // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
- static SIMDINLINE void vec4_load1_ps(Vec4& r, const float* p)
- {
- r[0] = SIMD::set1_ps(p[0]);
- r[1] = SIMD::set1_ps(p[1]);
- r[2] = SIMD::set1_ps(p[2]);
- r[3] = SIMD::set1_ps(p[3]);
- }
-
- static SIMDINLINE void vec4_set1_vps(Vec4& r, Float const& s)
- {
- r[0] = s;
- r[1] = s;
- r[2] = s;
- r[3] = s;
- }
-
- static SIMDINLINE Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
- {
- Float tmp, r;
- r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
-
- tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
- r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
-
- tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
- r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-
- return r;
- }
-
- static SIMDINLINE Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
- {
- Float tmp, r;
- r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
-
- tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
- r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
-
- tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
- r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-
- tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
- r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-
- return r;
- }
-
- static SIMDINLINE Float vec4_rcp_length_ps(const Vec4& v)
- {
- Float length = vec4_dp4_ps(v, v);
- return SIMD::rsqrt_ps(length);
- }
-
- static SIMDINLINE void vec4_normalize_ps(Vec4& r, const Vec4& v)
- {
- Float rcpLength = vec4_rcp_length_ps(v);
-
- r[0] = SIMD::mul_ps(v[0], rcpLength);
- r[1] = SIMD::mul_ps(v[1], rcpLength);
- r[2] = SIMD::mul_ps(v[2], rcpLength);
- r[3] = SIMD::mul_ps(v[3], rcpLength);
- }
-
- static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v, Float const& s)
- {
- r[0] = SIMD::mul_ps(v[0], s);
- r[1] = SIMD::mul_ps(v[1], s);
- r[2] = SIMD::mul_ps(v[2], s);
- r[3] = SIMD::mul_ps(v[3], s);
- }
-
- static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
- {
- r[0] = SIMD::mul_ps(v0[0], v1[0]);
- r[1] = SIMD::mul_ps(v0[1], v1[1]);
- r[2] = SIMD::mul_ps(v0[2], v1[2]);
- r[3] = SIMD::mul_ps(v0[3], v1[3]);
- }
-
- static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, Float const& s)
- {
- r[0] = SIMD::add_ps(v0[0], s);
- r[1] = SIMD::add_ps(v0[1], s);
- r[2] = SIMD::add_ps(v0[2], s);
- r[3] = SIMD::add_ps(v0[3], s);
- }
-
- static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
- {
- r[0] = SIMD::add_ps(v0[0], v1[0]);
- r[1] = SIMD::add_ps(v0[1], v1[1]);
- r[2] = SIMD::add_ps(v0[2], v1[2]);
- r[3] = SIMD::add_ps(v0[3], v1[3]);
- }
-
- static SIMDINLINE void vec4_min_ps(Vec4& r, const Vec4& v0, Float const& s)
- {
- r[0] = SIMD::min_ps(v0[0], s);
- r[1] = SIMD::min_ps(v0[1], s);
- r[2] = SIMD::min_ps(v0[2], s);
- r[3] = SIMD::min_ps(v0[3], s);
- }
-
- static SIMDINLINE void vec4_max_ps(Vec4& r, const Vec4& v0, Float const& s)
- {
- r[0] = SIMD::max_ps(v0[0], s);
- r[1] = SIMD::max_ps(v0[1], s);
- r[2] = SIMD::max_ps(v0[2], s);
- r[3] = SIMD::max_ps(v0[3], s);
- }
-
- // Matrix4x4 * Vector4
- // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
- // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
- // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
- // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
- static SIMDINLINE void SIMDCALL mat4x4_vec4_multiply(Vec4& result,
- const float* pMatrix,
- const Vec4& v)
- {
- Float m;
- Float r0;
- Float r1;
-
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
- r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[0] = r0;
-
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
- r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[1] = r0;
-
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
- r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[2] = r0;
-
- m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3]
- r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[3] = r0;
- }
-
- // Matrix4x4 * Vector3 - Direction Vector where w = 0.
- // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
- // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
- // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
- // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
- static SIMDINLINE void SIMDCALL mat3x3_vec3_w0_multiply(Vec4& result,
- const float* pMatrix,
- const Vec4& v)
- {
- Float m;
- Float r0;
- Float r1;
-
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[0] = r0;
-
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[1] = r0;
-
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[2] = r0;
-
- result[3] = SIMD::setzero_ps();
- }
-
- // Matrix4x4 * Vector3 - Position vector where w = 1.
- // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
- // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
- // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
- // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
- static SIMDINLINE void SIMDCALL mat4x4_vec3_w1_multiply(Vec4& result,
- const float* pMatrix,
- const Vec4& v)
- {
- Float m;
- Float r0;
- Float r1;
-
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
- r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[0] = r0;
-
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
- r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[1] = r0;
-
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
- r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[2] = r0;
-
- m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3]
- result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- }
-
- static SIMDINLINE void SIMDCALL mat4x3_vec3_w1_multiply(Vec4& result,
- const float* pMatrix,
- const Vec4& v)
- {
- Float m;
- Float r0;
- Float r1;
-
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
- r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[0] = r0;
-
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
- r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[1] = r0;
-
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
- r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
- r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
- r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
- r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
- r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[2] = r0;
- result[3] = SIMD::set1_ps(1.0f);
- }
}; // struct SIMDBase
using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
index 232f43faec7..b5046e48683 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
@@ -222,14 +222,14 @@ SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
-SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
-SIMD_EMU_IWRAPPER_2(and_si); // return a & b (int)
-SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
-SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b (int)
-SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
-SIMD_EMU_IWRAPPER_2(or_si); // return a | b (int)
-SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
-SIMD_EMU_IWRAPPER_2(xor_si); // return a ^ b (int)
+SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
+SIMD_IFWRAPPER_2(and_si, _mm256_and_ps); // return a & b (int)
+SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
+SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps); // return (~a) & b (int)
+SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
+SIMD_IFWRAPPER_2(or_si, _mm256_or_ps); // return a | b (int)
+SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
+SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps); // return a ^ b (int)
//-----------------------------------------------------------------------
// Shift operations
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
index 49650d52442..8fce96dcea4 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
@@ -81,6 +81,7 @@
return _mm256_##op(a, b, ImmT); \
}
+
//-----------------------------------------------------------------------
// Floating point arithmetic operations
//-----------------------------------------------------------------------
@@ -116,7 +117,14 @@ SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
-SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
+#if _MSC_VER >= 1920 // && _MSC_FULL_VER < [some_fixed_version]
+// Some versions of MSVC 2019 don't handle constant folding of and_si() correctly.
+// Using and_ps instead inhibits the compiler's constant folding and actually issues
+// the and intrinsic even though both inputs are constant values.
+#else
+// Use native integer and intrinsic
+SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
+#endif
SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int)
@@ -213,6 +221,10 @@ static SIMDINLINE Float SIMDCALL
return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
}
+#if _MSC_VER == 1920 // && _MSC_FULL_VER < [some_fixed_version]
+// Don't use _mm256_mask_i32gather_ps(), the compiler doesn't preserve the mask register
+// correctly in early versions of MSVC 2019
+#else
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template <ScaleFactor ScaleT = ScaleFactor::SF_1>
static SIMDINLINE Float SIMDCALL
@@ -222,6 +234,7 @@ static SIMDINLINE Float SIMDCALL
// Only for this intrinsic - not sure why. :(
return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
}
+#endif
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
{
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
index 85c722c92c0..3d31b39ee55 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
@@ -328,101 +328,5 @@ struct SIMD256 // or SIMD4 or SIMD16
//=======================================================================
// Advanced masking interface (currently available only in SIMD16 width)
//=======================================================================
-
-
- //=======================================================================
- // Extended Utility Functions (common to SIMD256 and SIMD16)
- //=======================================================================
-
- //-----------------------------------------------------------------------
- // Extended Types
- //-----------------------------------------------------------------------
-
- // Vec4, an SOA SIMD set of 4-dimensional vectors
- union Vec4
- {
- Vec4() = default;
- Vec4(Float in)
- {
- s.x = in;
- s.y = in;
- s.z = in;
- s.w = in;
- }
- Vec4(Float x, Float y, Float z, Float w)
- {
- s.x = x;
- s.y = y;
- s.z = z;
- s.w = w;
- }
-
- Float v[4];
- Integer vi[4];
- struct
- {
- Float x;
- Float y;
- Float z;
- Float w;
- } s;
- Float& operator[] (const int i) { return v[i]; }
- Float const & operator[] (const int i) const { return v[i]; }
- };
-
- //-----------------------------------------------------------------------
- // Extended Functions
- //-----------------------------------------------------------------------
- static void vec4_set1_ps(Vec4& r, const float *p); // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
- static void vec4_set1_vps(Vec4& r, Float s); // r[0] = s, r[1] = s, ...
- static Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1); // return dp3(v0, v1)
- static Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1); // return dp4(v0, v1)
- static Float vec4_rcp_length_ps(const Vec4& v); // return 1.0f / sqrt(dp4(v, v))
- static void vec4_normalize_ps(Vec4& r, const Vec4& v); // r = v * rcp_length(v)
- static void vec4_mul_ps(Vec4& r, const Vec4& v, Float s); // r = v * set1_vps(s)
- static void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 * v1
- static void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 + v1
- static void vec4_min_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 < s) ? v0 : s
- static void vec4_max_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 > s) ? v0 : s
-
- // Matrix4x4 * Vector4
- // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
- // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
- // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
- // result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
- static void mat4x4_vec4_multiply(
- Vec4& result,
- const float *pMatrix,
- const Vec4& v);
-
- // Matrix4x4 * Vector3 - Direction Vector where w = 0.
- // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
- // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
- // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
- // result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
- static void mat3x3_vec3_w0_multiply(
- Vec4& result,
- const float *pMatrix,
- const Vec4& v);
-
- // Matrix4x4 * Vector3 - Position vector where w = 1.
- // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
- // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
- // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
- // result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
- static void mat4x4_vec3_w1_multiply(
- Vec4& result,
- const float *pMatrix,
- const Vec4& v);
-
- // Matrix4x3 * Vector3 - Position vector where w = 1.
- // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
- // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
- // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
- // result.s.w = 1
- static void mat4x3_vec3_w1_multiply(
- Vec4& result,
- const float *pMatrix,
- const Vec4& v);
};
#endif // #if 0
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
index 944c3c23fd3..3ef847d4ca4 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
@@ -315,7 +315,7 @@ namespace SIMDImpl
namespace SIMD512Impl
{
-#if !(defined(__AVX512F__) || defined(_MM_K0_REG))
+#if !(defined(__AVX512F__) || defined(_ZMMINTRIN_H_INCLUDED))
// Define AVX512 types if not included via immintrin.h.
// All data members of these types are ONLY to viewed
// in a debugger. Do NOT access them via code!
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 3601aa3f509..f1b0dc03352 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1551,6 +1551,7 @@ void SwrDispatch(HANDLE hContext,
pTaskData->threadGroupCountX = threadGroupCountX;
pTaskData->threadGroupCountY = threadGroupCountY;
pTaskData->threadGroupCountZ = threadGroupCountZ;
+ pTaskData->enableThreadDispatch = false;
uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index e1ba893296e..8058defb388 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -588,6 +588,7 @@ SWR_FUNC(void,
uint32_t threadGroupCountY,
uint32_t threadGroupCountZ);
+
/// @note this enum needs to be kept in sync with HOTTILE_STATE!
enum SWR_TILE_STATE
{
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 5a8656dcfba..8891cc881a3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -140,6 +140,7 @@ struct COMPUTE_DESC
uint32_t threadGroupCountX;
uint32_t threadGroupCountY;
uint32_t threadGroupCountZ;
+ bool enableThreadDispatch;
};
typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index 635bf195e4b..c41376ae97b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -1447,7 +1447,7 @@ struct PA_TESS : PA_STATE
SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
if (!m_SOA)
{
- indices = _simd16_mul_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
+ indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
}
#else
SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index fc8dc46d9de..0f78bd661a5 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -584,7 +584,7 @@ struct JitCacheFileHeader
uint64_t GetObjectCRC() const { return m_objCRC; }
private:
- static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543210ULL + 6;
+ static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543210ULL + 7;
static const size_t JC_STR_MAX_LEN = 32;
static const uint32_t JC_PLATFORM_KEY = (LLVM_VERSION_MAJOR << 24) |
(LLVM_VERSION_MINOR << 16) | (LLVM_VERSION_PATCH << 8) |
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 53f11d66db1..30481b43208 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -61,6 +61,7 @@ namespace SwrJit
mInt16PtrTy = PointerType::get(mInt16Ty, 0);
mInt32PtrTy = PointerType::get(mInt32Ty, 0);
mInt64PtrTy = PointerType::get(mInt64Ty, 0);
+ mHandleTy = mInt8PtrTy;
mSimd4FP64Ty = VectorType::get(mDoubleTy, 4);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 97550fad23d..6e1d94b9e68 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -78,6 +78,7 @@ namespace SwrJit
// Built in types: scalar
Type* mVoidTy;
+ Type* mHandleTy;
Type* mInt1Ty;
Type* mInt8Ty;
Type* mInt16Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index b183a9e0082..2d8240187c5 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -237,6 +237,11 @@ namespace SwrJit
return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
}
+ void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
+ {
+ MASKED_SCATTER(pVecSrc, pVecDstPtr, 4, pVecMask);
+ }
+
void Builder::Gather4(const SWR_FORMAT format,
Value* pSrcBase,
Value* byteOffsets,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
index 934a8279c2f..fe4c5dd38a4 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
@@ -148,6 +148,7 @@ void GATHER4DD(const SWR_FORMAT_INFO& info,
Value* GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
Value* GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru);
+void SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask);
virtual void SCATTERPS(Value* pDst,
Value* vSrc,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 5b06de352dc..6687ead02d3 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -170,6 +170,16 @@ namespace SwrJit
return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
}
+ Value* Builder::VIMMED1(uint64_t i)
+ {
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+ }
+
+ Value* Builder::VIMMED1_16(uint64_t i)
+ {
+ return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
+ }
+
Value* Builder::VIMMED1(int i)
{
return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 91e2a32f1a1..3987a5f3476 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -71,6 +71,9 @@ Constant* CInc(uint32_t base, uint32_t count)
Constant* PRED(bool pred);
+Value* VIMMED1(uint64_t i);
+Value* VIMMED1_16(uint64_t i);
+
Value* VIMMED1(int i);
Value* VIMMED1_16(int i);