diff options
author | Chris Robinson <[email protected]> | 2019-09-26 19:24:29 -0700 |
---|---|---|
committer | Chris Robinson <[email protected]> | 2019-09-26 19:24:29 -0700 |
commit | 9b64e5e0db2a778313bb873bb38f481ce40efe31 (patch) | |
tree | 4ac057e992a8720abf57e1ce548d4bda1a750f42 /alc/mixer/mixer_neon.cpp | |
parent | d50ca464cd5e4f07bc399fd244578b0b34d72aef (diff) |
Implement a "fast" bsinc path
This takes advantage of the fact that when increment <= 1 (when not down-
sampling), the scale factor is always 0. As a result, the scale and scale-phase
deltas never contribute to the filtered output. Removing those multiply+add
operations cuts half of the work done by the inner loop.
Sounds that do need to down-sample (when played with a high pitch, or when they
are 48kHz on a 44.1kHz output, for example) still go through the normal bsinc process.
Diffstat (limited to 'alc/mixer/mixer_neon.cpp')
-rw-r--r-- | alc/mixer/mixer_neon.cpp | 44 |
1 files changed, 44 insertions, 0 deletions
diff --git a/alc/mixer/mixer_neon.cpp b/alc/mixer/mixer_neon.cpp
index cd2b0ebc..178c7d6e 100644
--- a/alc/mixer/mixer_neon.cpp
+++ b/alc/mixer/mixer_neon.cpp
@@ -118,6 +118,50 @@ const ALfloat *Resample_<BSincTag,NEONTag>(const InterpState *state, const ALflo
     return dst.begin();
 }
 
+template<> // NEON specialization of the "fast" bsinc resampler: interpolates between phases only, with no scale-factor terms.
+const ALfloat *Resample_<FastBSincTag,NEONTag>(const InterpState *state,
+    const ALfloat *RESTRICT src, ALuint frac, ALuint increment, const al::span<float> dst) // writes one filtered sample per element of dst
+{
+    const ALfloat *const filter{state->bsinc.filter}; // base of the coefficient table
+    const size_t m{state->bsinc.m}; // coefficients per filter; presumably a non-zero multiple of 4 (TODO confirm)
+
+    src -= state->bsinc.l; // step back by bsinc.l samples; presumably to the first tap of the filter window (TODO confirm)
+    for(float &out_sample : dst)
+    {
+        // Calculate the phase index and factor: pi is the top bits of frac, pf is the remaining low bits scaled into [0,1).
+#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
+        const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
+        const ALfloat pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
+            (1.0f/(1<<FRAC_PHASE_BITDIFF))};
+#undef FRAC_PHASE_BITDIFF
+
+        // Apply the phase interpolated filter, accumulating four partial sums in r4.
+        float32x4_t r4{vdupq_n_f32(0.0f)};
+        {
+            const float32x4_t pf4{vdupq_n_f32(pf)}; // pf broadcast to all four lanes
+            const float *fil{filter + m*pi*4}; // each phase occupies 4*m floats in the table
+            const float *phd{fil + m*2}; // phase deltas start 2*m floats into this phase's entry
+            size_t td{m >> 2}; // number of 4-float vectors to process
+            size_t j{0u};
+
+            do { // do/while: the body always runs once, so td must not start at 0
+                /* f = fil + pf*phd (coefficients interpolated toward the next phase) */
+                const float32x4_t f4 = vmlaq_f32(vld1q_f32(fil), pf4, vld1q_f32(phd));
+                /* r += f*src (multiply-accumulate against four input samples) */
+                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
+                fil += 4; phd += 4; j += 4;
+            } while(--td);
+        }
+        r4 = vaddq_f32(r4, vrev64q_f32(r4)); // horizontal sum, step 1: add each lane to its neighbour within the 64-bit half
+        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0); // step 2: add the two halves; lane 0 holds the total
+
+        frac += increment; // advance the fixed-point read position
+        src += frac>>FRACTIONBITS; // move src forward by the whole-sample part
+        frac &= FRACTIONMASK; // keep only the fractional part for the next output
+    }
+    return dst.begin();
+}
+
 static inline void ApplyCoeffs(size_t /*Offset*/, float2 *RESTRICT Values, const ALuint IrSize, const HrirArray &Coeffs, const ALfloat left, const ALfloat right) |