diff options
author | Chris Robinson <[email protected]> | 2019-08-21 07:43:28 -0700 |
---|---|---|
committer | Chris Robinson <[email protected]> | 2019-08-21 07:43:28 -0700 |
commit | 7ad2ed965c9506476f44a5d51deaa4de6b8557bf (patch) | |
tree | 510503b00858ec42fffbe06352f2c595f7c589d5 | |
parent | 024d5d900aa7f9e3435459b0a8cfe354dc8e21fd (diff) |
Avoid reading from pointers to __m128 values
-rw-r--r-- | alc/mixer/mixer_neon.cpp | 42 | ||||
-rw-r--r-- | alc/mixer/mixer_sse.cpp | 29 |
2 files changed, 31 insertions, 40 deletions
diff --git a/alc/mixer/mixer_neon.cpp b/alc/mixer/mixer_neon.cpp index 7b9f7d4e..991443c9 100644 --- a/alc/mixer/mixer_neon.cpp +++ b/alc/mixer/mixer_neon.cpp @@ -76,13 +76,9 @@ template<> const ALfloat *Resample_<BSincTag,NEONTag>(const InterpState *state, const ALfloat *RESTRICT src, ALsizei frac, ALint increment, const al::span<float> dst) { - const ALfloat *const filter = state->bsinc.filter; - const float32x4_t sf4 = vdupq_n_f32(state->bsinc.sf); - const ALsizei m = state->bsinc.m; - const float32x4_t *fil, *scd, *phd, *spd; - ALsizei pi, j, offset; - float32x4_t r4; - ALfloat pf; + const ALfloat *const filter{state->bsinc.filter}; + const float32x4_t sf4{vdupq_n_f32(state->bsinc.sf)}; + const ALsizei m{state->bsinc.m}; ASSUME(m > 0); ASSUME(increment > 0); @@ -93,34 +89,32 @@ const ALfloat *Resample_<BSincTag,NEONTag>(const InterpState *state, const ALflo { // Calculate the phase index and factor. #define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS) - pi = frac >> FRAC_PHASE_BITDIFF; - pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF)); + const ALsizei pi{frac >> FRAC_PHASE_BITDIFF}; + const ALfloat pf{(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF))}; #undef FRAC_PHASE_BITDIFF - offset = m*pi*4; - fil = (const float32x4_t*)(filter + offset); offset += m; - scd = (const float32x4_t*)(filter + offset); offset += m; - phd = (const float32x4_t*)(filter + offset); offset += m; - spd = (const float32x4_t*)(filter + offset); - // Apply the scale and phase interpolated filter. r4 = vdupq_n_f32(0.0f); { const ALsizei count = m >> 2; const float32x4_t pf4 = vdupq_n_f32(pf); + const float *fil{filter + m*pi*4}; + const float *scd{fil + m}; + const float *phd{scd + m}; + const float *spd{phd + m}; + ALsizei td{m >> 2}; + size_t j{0u}; - ASSUME(count > 0); - - for(j = 0;j < count;j++) - { + do { /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */ const float32x4_t f4 = vmlaq_f32( - vmlaq_f32(fil[j], sf4, scd[j]), - pf4, vmlaq_f32(phd[j], sf4, spd[j]) - ); + vmlaq_f32(vld1q_f32(fil), sf4, vld1q_f32(scd)), + pf4, vmlaq_f32(vld1q_f32(phd), sf4, vld1q_f32(spd))); + fil += 4; scd += 4; phd += 4; spd += 4; /* r += f*src */ - r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j*4])); - } + r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j])); + j += 4; + } while(--td); } r4 = vaddq_f32(r4, vcombine_f32(vrev64_f32(vget_high_f32(r4)), vrev64_f32(vget_low_f32(r4)))); diff --git a/alc/mixer/mixer_sse.cpp b/alc/mixer/mixer_sse.cpp index 36690a2e..8d2abc88 100644 --- a/alc/mixer/mixer_sse.cpp +++ b/alc/mixer/mixer_sse.cpp @@ -34,31 +34,28 @@ const ALfloat *Resample_<BSincTag,SSETag>(const InterpState *state, const ALfloa const ALfloat pf{(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF))}; #undef FRAC_PHASE_BITDIFF - ALsizei offset{m*pi*4}; - const __m128 *fil{reinterpret_cast<const __m128*>(filter + offset)}; offset += m; - const __m128 *scd{reinterpret_cast<const __m128*>(filter + offset)}; offset += m; - const __m128 *phd{reinterpret_cast<const __m128*>(filter + offset)}; offset += m; - const __m128 *spd{reinterpret_cast<const __m128*>(filter + offset)}; - // Apply the scale and phase interpolated filter. __m128 r4{_mm_setzero_ps()}; { - const ALsizei count{m >> 2}; const __m128 pf4{_mm_set1_ps(pf)}; - - ASSUME(count > 0); + const float *fil{filter + m*pi*4}; + const float *scd{fil + m}; + const float *phd{scd + m}; + const float *spd{phd + m}; + ALsizei td{m >> 2}; + size_t j{0u}; #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z)) - for(ALsizei j{0};j < count;j++) - { + do { /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */ const __m128 f4 = MLA4( - MLA4(fil[j], sf4, scd[j]), - pf4, MLA4(phd[j], sf4, spd[j]) - ); + MLA4(_mm_load_ps(fil), sf4, _mm_load_ps(scd)), + pf4, MLA4(_mm_load_ps(phd), sf4, _mm_load_ps(spd))); + fil += 4; scd += 4; phd += 4; spd += 4; /* r += f*src */ - r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j*4])); - } + r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j])); + j += 4; + } while(--td); #undef MLA4 } r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); |