From db599e316a2d181cf4f7f0c364a7d9eb8145243d Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Wed, 25 Jan 2017 12:27:41 -0600 Subject: swr: [rasterizer core] Frontend SIMD16 WIP Widen simdvertex to SIMD16/simd16vertex in frontend for passing VS attributes from VS to PA. Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/core/frontend.cpp | 33 ++- src/gallium/drivers/swr/rasterizer/core/frontend.h | 8 + src/gallium/drivers/swr/rasterizer/core/pa.h | 64 +++++ src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 280 ++++++++++----------- 4 files changed, 243 insertions(+), 142 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index b005ead0d15..4d04d8ace5d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1307,12 +1307,14 @@ void ProcessDraw( pvCutIndices_hi = &pa.GetNextVsIndices(); } - simdvertex &vout_lo = pa.GetNextVsOutput_simd16_lo(); - simdvertex &vout_hi = pa.GetNextVsOutput_simd16_hi(); + simdvertex vout_lo; + simdvertex vout_hi; vsContext_lo.pVout = &vout_lo; vsContext_hi.pVout = &vout_hi; + simd16vertex &vout = pa.GetNextVsOutput_simd16(); + if (i < endVertex) { // 1. Execute FS/VS for a single SIMD. @@ -1347,9 +1349,36 @@ void ProcessDraw( { AR_BEGIN(FEVertexShader, pDC->drawId); state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo); + + // copy SIMD vout_lo to lo part of SIMD16 vout + { + const uint32_t voutNumSlots = VERTEX_ATTRIB_START_SLOT + state.feNumAttributes; + + for (uint32_t i = 0; i < voutNumSlots; i += 1) + { + for (uint32_t j = 0; j < 4; j += 1) + { + vout.attrib[i][j].lo = vout_lo.attrib[i][j]; + } + } + } + if ((i + KNOB_SIMD_WIDTH) < endVertex) { state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi); + + // copy SIMD vout_hi to hi part of SIMD16 vout + { + const uint32_t voutNumSlots = VERTEX_ATTRIB_START_SLOT + state.feNumAttributes; + + for (uint32_t i = 0; i < voutNumSlots; i += 1) + { + for (uint32_t j = 0; j < 4; j += 1) + { + vout.attrib[i][j].hi = vout_hi.attrib[i][j]; + } + } + } } AR_END(FEVertexShader, 0); diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index 6d5f6a31b8e..58d6901a819 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -30,6 +30,14 @@ #include "context.h" #include +#if ENABLE_AVX512_SIMD16 +// TODO: this belongs in state.h alongside the simdvector definition, but there is a llvm codegen issue +struct simd16vertex +{ + simd16vector attrib[KNOB_NUM_ATTRIBUTES]; +}; + +#endif // Calculates the A and B coefficients for the 3 edges of the triangle // // maths for edge equations: diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index 826032ad54e..7319c56b4d5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -51,6 +51,9 @@ struct PA_STATE virtual bool HasWork() = 0; virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; +#if ENABLE_AVX512_SIMD16 + virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0; +#endif virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0; #if ENABLE_AVX512_SIMD16 virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0; @@ -61,6 +64,7 @@ struct PA_STATE #if ENABLE_AVX512_SIMD16 virtual simdvertex& GetNextVsOutput_simd16_lo() = 0; virtual simdvertex& GetNextVsOutput_simd16_hi() = 0; + virtual simd16vertex& GetNextVsOutput_simd16() = 0; #endif virtual bool GetNextStreamOutput() = 0; virtual simdmask& GetNextVsIndices() = 0; @@ -151,6 +155,14 @@ struct PA_STATE_OPT : public PA_STATE return pVertex[index].attrib[slot]; } +#if ENABLE_AVX512_SIMD16 + simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) + { + simd16vertex* pVertex = (simd16vertex*)pStreamBase; + return pVertex[index].attrib[slot]; + } + +#endif // Assembles 4 triangles. Each simdvector is a single vertex from 4 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle. bool Assemble(uint32_t slot, simdvector verts[]) @@ -245,6 +257,17 @@ struct PA_STATE_OPT : public PA_STATE return pVertex[this->cur * 2 + 1]; } + simd16vertex& GetNextVsOutput_simd16() + { + // increment cur and prev indices + const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD16_WIDTH; + this->prev = this->cur; // prev is undefined for first state. + this->cur = this->counter % numSimdVerts; + + simd16vertex* pVertex = (simd16vertex*)pStreamBase; + return pVertex[this->cur]; + } + #endif simdmask& GetNextVsIndices() { @@ -375,6 +398,13 @@ INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) return pa.GetSimdVector(index, slot); } +#if ENABLE_AVX512_SIMD16 +INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot) +{ + return pa.GetSimdVector_simd16(index, slot); +} + +#endif INLINE __m128 swizzleLane0(const simdvector &a) { simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); @@ -561,6 +591,14 @@ struct PA_STATE_CUT : public PA_STATE return ((simdvertex*)pStreamBase)[vertexIndex * 2 + 1]; } + simd16vertex& GetNextVsOutput_simd16() + { + uint32_t vertexIndex = this->headVertex / KNOB_SIMD16_WIDTH; + this->headVertex = (this->headVertex + KNOB_SIMD16_WIDTH) % this->numVerts; + this->needOffsets = true; + return ((simd16vertex*)pStreamBase)[vertexIndex]; + } + #endif simdmask& GetNextVsIndices() { @@ -576,6 +614,16 @@ struct PA_STATE_CUT : public PA_STATE return this->tmpVertex.attrib[0]; } +#if ENABLE_AVX512_SIMD16 + simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) + { + // unused + SWR_ASSERT(0 && "Not implemented"); + static simd16vector junk; + return junk; + } + +#endif bool GetNextStreamOutput() { this->headVertex += KNOB_SIMD_WIDTH; @@ -1191,6 +1239,15 @@ struct PA_TESS : PA_STATE return junk; } +#if ENABLE_AVX512_SIMD16 + simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) + { + SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__); + static simd16vector junk; + return junk; + } + +#endif static simdscalari GenPrimMask(uint32_t numPrims) { SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH); @@ -1344,6 +1401,13 @@ struct PA_TESS : PA_STATE return junk; } + simd16vertex& GetNextVsOutput_simd16() + { + SWR_ASSERT(0, "%s", __FUNCTION__); + static simd16vertex junk; + return junk; + } + #endif bool GetNextStreamOutput() { diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp index e2ae962b122..eec824703df 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp @@ -265,13 +265,13 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) for (int i = 0; i < 4; ++i) { v0[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24); - v0[i] = _mm256_permutevar8x32_ps(v0[i], perm0); + v0[i] = _simd_permute_ps(v0[i], perm0); v1[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49); - v1[i] = _mm256_permutevar8x32_ps(v1[i], perm1); + v1[i] = _simd_permute_ps(v1[i], perm1); v2[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92); - v2[i] = _mm256_permutevar8x32_ps(v2[i], perm2); + v2[i] = _simd_permute_ps(v2[i], perm2); } #endif @@ -295,94 +295,14 @@ bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { -#if 0 - const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0); - const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1); - const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2); - - simd16vector &v0 = verts[0]; - simd16vector &v1 = verts[1]; - simd16vector &v2 = verts[2]; - - { - const simdvector &a = PaGetSimdVector(pa, 0, slot); - const simdvector &b = PaGetSimdVector(pa, 1, slot); - const simdvector &c = PaGetSimdVector(pa, 2, slot); - - // v0 -> a0 a3 a6 b1 b4 b7 c2 c5 - // v1 -> a1 a4 a7 b2 b5 c0 c3 c6 - // v2 -> a2 a5 b0 b3 b6 c1 c4 c7 - - // for simd x, y, z, and w - for (int i = 0; i < 4; i += 1) - { - v0[i].lo = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24); - v0[i].lo = _mm256_permutevar8x32_ps(v0[i].lo, perm0); - - v1[i].lo = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49); - v1[i].lo = _mm256_permutevar8x32_ps(v1[i].lo, perm1); - - v2[i].lo = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92); - v2[i].lo = _mm256_permutevar8x32_ps(v2[i].lo, perm2); - } - } - - { - const simdvector &a = PaGetSimdVector(pa, 3, slot); - const simdvector &b = PaGetSimdVector(pa, 4, slot); - const simdvector &c = PaGetSimdVector(pa, 5, slot); - - // v0 -> a0 a3 a6 b1 b4 b7 c2 c5 - // v1 -> a1 a4 a7 b2 b5 c0 c3 c6 - // v2 -> a2 a5 b0 b3 b6 c1 c4 c7 - - // for simd x, y, z, and w - for (int i = 0; i < 4; i += 1) - { - v0[i].hi = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24); - v0[i].hi = _mm256_permutevar8x32_ps(v0[i].hi, perm0); - - v1[i].hi = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49); - v1[i].hi = _mm256_permutevar8x32_ps(v1[i].hi, perm1); - - v2[i].hi = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92); - v2[i].hi = _mm256_permutevar8x32_ps(v2[i].hi, perm2); - } - } - -#else -#if 1 - const simdvector &a_lo = reinterpret_cast(PaGetSimdVector(pa, 0, slot)); - const simdvector &a_hi = reinterpret_cast(PaGetSimdVector(pa, 1, slot)); - const simdvector &b_lo = reinterpret_cast(PaGetSimdVector(pa, 2, slot)); - const simdvector &b_hi = reinterpret_cast(PaGetSimdVector(pa, 3, slot)); - const simdvector &c_lo = reinterpret_cast(PaGetSimdVector(pa, 4, slot)); - const simdvector &c_hi = reinterpret_cast(PaGetSimdVector(pa, 5, slot)); - - simd16vector a; - simd16vector b; - simd16vector c; - - for (uint32_t i = 0; i < 4; i += 1) - { - a[i].lo = a_lo[i]; - a[i].hi = a_hi[i]; - b[i].lo = b_lo[i]; - b[i].hi = b_hi[i]; - c[i].lo = c_lo[i]; - c[i].hi = c_hi[i]; - } - -#else - const simd16vector &a = reinterpret_cast(PaGetSimdVector(pa, 0 * 2, slot)); - const simd16vector &b = reinterpret_cast(PaGetSimdVector(pa, 1 * 2, slot)); - const simd16vector &c = reinterpret_cast(PaGetSimdVector(pa, 2 * 2, slot)); - -#endif const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0); const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1); const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2); + const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot); + simd16vector &v0 = verts[0]; simd16vector &v1 = verts[1]; simd16vector &v2 = verts[2]; @@ -404,7 +324,6 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) v2[i] = _simd16_permute_ps(v2[i], perm2); } -#endif SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriListSingle0, 0, KNOB_SIMD16_WIDTH, true); return true; } @@ -416,13 +335,29 @@ void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m12 // hold at least 8 triangles worth of data. We want to assemble a single // triangle with data in horizontal form. #if ENABLE_AVX512_SIMD16 - const uint32_t i0 = pa.useAlternateOffset ? 3 : 0; - const uint32_t i1 = pa.useAlternateOffset ? 4 : 1; - const uint32_t i2 = pa.useAlternateOffset ? 5 : 2; + const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); + const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot); + + simdvector a; + simdvector b; + simdvector c; - simdvector& a = PaGetSimdVector(pa, i0, slot); - simdvector& b = PaGetSimdVector(pa, i1, slot); - simdvector& c = PaGetSimdVector(pa, i2, slot); + for (uint32_t i = 0; i < 4; i += 1) + { + if (pa.useAlternateOffset) + { + a[i] = b_16[i].hi; + b[i] = c_16[i].lo; + c[i] = c_16[i].hi; + } + else + { + a[i] = a_16[i].lo; + b[i] = a_16[i].hi; + c[i] = b_16[i].lo; + } + } #else simdvector& a = PaGetSimdVector(pa, 0, slot); @@ -522,6 +457,39 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) return true; } +#if 0 // ENABLE_AVX512_SIMD16 +bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + const simd16vector &a = PaGetSimdVector(pa, pa.prev, slot); + const simd16vector &b = PaGetSimdVector(pa, pa.cur, slot); + + simd16vector &v0 = verts[0]; + simd16vector &v1 = verts[1]; + simd16vector &v2 = verts[2]; + + // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF + // v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1 + // v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0 + + // for simd16 x, y, z, and w + for (int i = 0; i < 4; i += 1) + { + simd16scalar perm0 = _simd16_permute2f128_ps(a[i], a[i], 0x39); // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3 + simd16scalar perm1 = _simd16_permute2f128_ps(b[i], b[i], 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3 + + simd16scalar blend = _simd16_blend_ps(perm0, perm1, 0xF000); // // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3 + simd16scalar shuff = _simd16_shuffle_ps(a[i], blend, _MM_SHUFFLE(1, 0, 3, 2)); // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 + + v0[i] = a[i]; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF + v1[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1 + v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0 + } + + SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD16_WIDTH); + return true; +} + +#endif void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) { simdvector& a = PaGetSimdVector(pa, pa.prev, slot); @@ -1033,8 +1001,8 @@ bool PaRectList1( simdvector verts[]) { // SIMD vectors a and b are the last two vertical outputs from the vertex shader. - simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 } - simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 } + simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 } + simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 } __m256 tmp0, tmp1, tmp2; @@ -1042,34 +1010,34 @@ bool PaRectList1( for(int i = 0; i < 4; ++i) { simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } - tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } - v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. - tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } - v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } - v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } + tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } + v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. + tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } + v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } + v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. /// AVX2 should make this much cheaper. simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } - v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } - tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } - tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } - tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * } - v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } - v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } - v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } + v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } + tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } + tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } + tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * } + v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } + v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } + v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } // verts[2] = { v2, w, v5, x, v8, y, v11, z } simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } - v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } - tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } + v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } + tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0); // Need to compute 4th implied vertex for the rectangle. tmp2 = _mm256_sub_ps(v0[i], v1[i]); - tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * } - tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } - v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } + tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * } + tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } + v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } } SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true); @@ -1133,44 +1101,60 @@ bool PaRectList1_simd16( uint32_t slot, simd16vector verts[]) { - // SIMD vectors a and b are the last two vertical outputs from the vertex shader. - simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 } - simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 } + const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 } + const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. } + + simdvector a; + simdvector b; + + for (uint32_t i = 0; i < 4; i += 1) + { + if (pa.useAlternateOffset) + { + a[i] = b_16[i].lo; + b[i] = b_16[i].hi; + } + else + { + a[i] = a_16[i].lo; + b[i] = a_16[i].hi; + } + } __m256 tmp0, tmp1, tmp2; // Loop over each component in the simdvector. for (int i = 0; i < 4; i += 1) { - simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } - tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } - v0[i].lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. + simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } + tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } + v0[i].lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. tmp1 = _mm256_permute_ps(v0[i].lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } - v0[i].lo = _mm256_permute_ps(v0[i].lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } - v0[i].lo = _mm256_blend_ps(tmp1, v0[i].lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } - - /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. - /// AVX2 should make this much cheaper. - simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } - v1[i].lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } - tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } + v0[i].lo = _mm256_permute_ps(v0[i].lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } + v0[i].lo = _mm256_blend_ps(tmp1, v0[i].lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } + + /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. + /// AVX2 should make this much cheaper. + simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } + v1[i].lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } + tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } tmp2 = _mm256_blend_ps(v1[i].lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } - tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * } - v1[i].lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } - v1[i].lo = _mm256_blend_ps(tmp2, v1[i].lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } - v1[i].lo = _mm256_blend_ps(v1[i].lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } - - // verts[2] = { v2, w, v5, x, v8, y, v11, z } - simd16vector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } - v2[i].lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } - tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } + tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * } + v1[i].lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } + v1[i].lo = _mm256_blend_ps(tmp2, v1[i].lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } + v1[i].lo = _mm256_blend_ps(v1[i].lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } + + // verts[2] = { v2, w, v5, x, v8, y, v11, z } + simd16vector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } + v2[i].lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } + tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } v2[i].lo = _mm256_blend_ps(tmp1, v2[i].lo, 0xF0); // Need to compute 4th implied vertex for the rectangle. tmp2 = _mm256_sub_ps(v0[i].lo, v1[i].lo); tmp2 = _mm256_add_ps(tmp2, v2[i].lo); // tmp2 = { w, *, x, *, y, *, z, * } - tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } - v2[i].lo = _mm256_blend_ps(v2[i].lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } + tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } + v2[i].lo = _mm256_blend_ps(v2[i].lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } v0[i].hi = _simd_setzero_ps(); v1[i].hi = _simd_setzero_ps(); @@ -1218,9 +1202,25 @@ void PaRectListSingle0( // hold at least 8 triangles worth of data. We want to assemble a single // triangle with data in horizontal form. #if ENABLE_AVX512_SIMD16 - const uint32_t i0 = pa.useAlternateOffset ? 3 : 0; + const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); + const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); + + simdvector a; + simdvector b; - simdvector& a = PaGetSimdVector(pa, i0, slot); + for (uint32_t i = 0; i < 4; i += 1) + { + if (pa.useAlternateOffset) + { + a[i] = b_16[i].lo; + b[i] = b_16[i].hi; + } + else + { + a[i] = a_16[i].lo; + b[i] = a_16[i].hi; + } + } #else simdvector& a = PaGetSimdVector(pa, 0, slot); -- cgit v1.2.3