From 9b64e5e0db2a778313bb873bb38f481ce40efe31 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Thu, 26 Sep 2019 19:24:29 -0700 Subject: Implement a "fast" bsinc path This takes advantage of the fact that when increment <= 1 (when not down-sampling), the scale factor is always 0. As a result, the scale and scale-phase deltas never contribute to the filtered output. Removing those multiply+add operations cuts half of the work done by the inner loop. Sounds that do need to down-sample (when played with a high pitch, or are 48khz on 44.1khz output, for example) still go through the normal bsinc process. --- alc/mixer/defs.h | 3 ++- alc/mixer/mixer_c.cpp | 25 +++++++++++++++++++++++++ alc/mixer/mixer_neon.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++++ alc/mixer/mixer_sse.cpp | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 1 deletion(-) (limited to 'alc/mixer') diff --git a/alc/mixer/defs.h b/alc/mixer/defs.h index 62e7d3ba..ce572973 100644 --- a/alc/mixer/defs.h +++ b/alc/mixer/defs.h @@ -23,7 +23,8 @@ enum ResampleType { PointTag, LerpTag, CubicTag, - BSincTag + BSincTag, + FastBSincTag }; template diff --git a/alc/mixer/mixer_c.cpp b/alc/mixer/mixer_c.cpp index 42d515ae..fafda70d 100644 --- a/alc/mixer/mixer_c.cpp +++ b/alc/mixer/mixer_c.cpp @@ -41,6 +41,26 @@ inline ALfloat do_bsinc(const InterpState &istate, const ALfloat *RESTRICT vals, r += (fil[j_f] + istate.bsinc.sf*scd[j_f] + pf*(phd[j_f] + istate.bsinc.sf*spd[j_f])) * vals[j_f]; return r; } +inline ALfloat do_fastbsinc(const InterpState &istate, const ALfloat *RESTRICT vals, const ALuint frac) +{ + const size_t m{istate.bsinc.m}; + + // Calculate the phase index and factor. 
+#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS) + const ALuint pi{frac >> FRAC_PHASE_BITDIFF}; + const ALfloat pf{static_cast(frac & ((1< @@ -98,6 +118,11 @@ const ALfloat *Resample_(const InterpState *state, const ALfloat ALuint frac, ALuint increment, const al::span dst) { return DoResample(state, src-state->bsinc.l, frac, increment, dst); } +template<> +const ALfloat *Resample_(const InterpState *state, const ALfloat *RESTRICT src, + ALuint frac, ALuint increment, const al::span dst) +{ return DoResample(state, src-state->bsinc.l, frac, increment, dst); } + static inline void ApplyCoeffs(size_t /*Offset*/, float2 *RESTRICT Values, const ALuint IrSize, const HrirArray &Coeffs, const ALfloat left, const ALfloat right) diff --git a/alc/mixer/mixer_neon.cpp b/alc/mixer/mixer_neon.cpp index cd2b0ebc..178c7d6e 100644 --- a/alc/mixer/mixer_neon.cpp +++ b/alc/mixer/mixer_neon.cpp @@ -118,6 +118,50 @@ const ALfloat *Resample_(const InterpState *state, const ALflo return dst.begin(); } +template<> +const ALfloat *Resample_(const InterpState *state, + const ALfloat *RESTRICT src, ALuint frac, ALuint increment, const al::span dst) +{ + const ALfloat *const filter{state->bsinc.filter}; + const size_t m{state->bsinc.m}; + + src -= state->bsinc.l; + for(float &out_sample : dst) + { + // Calculate the phase index and factor. 
+#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS) + const ALuint pi{frac >> FRAC_PHASE_BITDIFF}; + const ALfloat pf{static_cast(frac & ((1<> 2}; + size_t j{0u}; + + do { + /* f = fil + pf*phd */ + const float32x4_t f4 = vmlaq_f32(vld1q_f32(fil), pf4, vld1q_f32(phd)); + /* r += f*src */ + r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j])); + fil += 4; phd += 4; j += 4; + } while(--td); + } + r4 = vaddq_f32(r4, vrev64q_f32(r4)); + out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0); + + frac += increment; + src += frac>>FRACTIONBITS; + frac &= FRACTIONMASK; + } + return dst.begin(); +} + static inline void ApplyCoeffs(size_t /*Offset*/, float2 *RESTRICT Values, const ALuint IrSize, const HrirArray &Coeffs, const ALfloat left, const ALfloat right) diff --git a/alc/mixer/mixer_sse.cpp b/alc/mixer/mixer_sse.cpp index 9bb3bb8a..002d6064 100644 --- a/alc/mixer/mixer_sse.cpp +++ b/alc/mixer/mixer_sse.cpp @@ -66,6 +66,53 @@ const ALfloat *Resample_(const InterpState *state, const ALfloa return dst.begin(); } +template<> +const ALfloat *Resample_(const InterpState *state, + const ALfloat *RESTRICT src, ALuint frac, ALuint increment, const al::span dst) +{ + const ALfloat *const filter{state->bsinc.filter}; + const size_t m{state->bsinc.m}; + + src -= state->bsinc.l; + for(float &out_sample : dst) + { + // Calculate the phase index and factor. 
+#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS) + const ALuint pi{frac >> FRAC_PHASE_BITDIFF}; + const ALfloat pf{static_cast(frac & ((1<> 2}; + size_t j{0u}; + +#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z)) + do { + /* f = fil + pf*phd */ + const __m128 f4 = MLA4(_mm_load_ps(fil), pf4, _mm_load_ps(phd)); + /* r += f*src */ + r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j])); + fil += 4; phd += 4; j += 4; + } while(--td); +#undef MLA4 + } + r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); + r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + out_sample = _mm_cvtss_f32(r4); + + frac += increment; + src += frac>>FRACTIONBITS; + frac &= FRACTIONMASK; + } + return dst.begin(); +} + static inline void ApplyCoeffs(size_t Offset, float2 *RESTRICT Values, const ALuint IrSize, const HrirArray &Coeffs, const ALfloat left, const ALfloat right) -- cgit v1.2.3